Chunking method for ff_vector and ffdf objects (row-wise) that automatically accounts for RAM requirements, using the record size calculated as sum(.rambytes[vmode])

# S3 method for class 'ff_vector'
chunk(x
, RECORDBYTES = .rambytes[vmode(x)], BATCHBYTES = getOption("ffbatchbytes"), ...)
# S3 method for class 'ffdf'
chunk(x
, RECORDBYTES = sum(.rambytes[vmode(x)]), BATCHBYTES = getOption("ffbatchbytes"), ...)

Arguments

x

ff or ffdf

RECORDBYTES

optional integer scalar giving the number of bytes needed to process one element of the ff_vector or one row of the ffdf

BATCHBYTES

integer scalar limiting the number of bytes to be processed in one chunk, default from getOption("ffbatchbytes"), see also .rambytes

...

further arguments passed to chunk

Value

A list of ri (range index) objects, each representing one chunk.

Author

Jens Oehlschlägel

See also

Examples

  x <- data.frame(x=as.double(1:26), y=factor(letters), z=ordered(LETTERS), stringsAsFactors = TRUE)
  a <- as.ffdf(x)
  ceiling(26 / (300 %/% sum(.rambytes[vmode(a)])))
#> [1] 2
  chunk(a, BATCHBYTES=300)
#> $`1:13`
#> range index (ri) from 1 to 13 maxindex 26 
#> 
#> $`14:26`
#> range index (ri) from 14 to 26 maxindex 26 
#> 
  ceiling(13 / (100 %/% sum(.rambytes[vmode(a)])))
#> [1] 3
  chunk(a, from=1, to = 13, BATCHBYTES=100)
#> $`1:5`
#> range index (ri) from 1 to 5 maxindex 26 
#> 
#> $`6:10`
#> range index (ri) from 6 to 10 maxindex 26 
#> 
#> $`11:13`
#> range index (ri) from 11 to 13 maxindex 26 
#> 
  rm(a); gc()
#>           used (Mb) gc trigger  (Mb) max used  (Mb)
#> Ncells 1025704 54.8    1994352 106.6  1994352 106.6
#> Vcells 1891478 14.5    8388608  64.0  3877600  29.6

  message("dummy example for linear regression with biglm on ffdf")
#> dummy example for linear regression with biglm on ffdf
  library(biglm)
#> Loading required package: DBI

  message("NOTE that . in formula requires calculating terms manually
    because . as a data-dependant term is not allowed in biglm")
#> NOTE that . in formula requires calculating terms manually
#>     because . as a data-dependant term is not allowed in biglm
  form <- Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species

  lmfit <- lm(form, data=iris)

  firis <- as.ffdf(iris)
  for (i in chunk(firis, by=50)){
    if (i[1]==1){
      message("first chunk is: ", i[[1]],":",i[[2]])
      biglmfit <- biglm(form, data=firis[i,,drop=FALSE])
    }else{
      message("next chunk is: ", i[[1]],":",i[[2]])
      biglmfit <- update(biglmfit, firis[i,,drop=FALSE])
    }
  }
#> first chunk is: 1:50
#> next chunk is: 51:100
#> next chunk is: 101:150

  summary(lmfit)
#> 
#> Call:
#> lm(formula = form, data = iris)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.79424 -0.21874  0.00899  0.20255  0.73103 
#> 
#> Coefficients:
#>                   Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
#> Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
#> Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
#> Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
#> Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
#> Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared:  0.8673,	Adjusted R-squared:  0.8627 
#> F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16
#> 
  summary(biglmfit)
#> Large data regression model: biglm(form, data = firis[i, , drop = FALSE])
#> Sample size =  150 
#>                      Coef    (95%     CI)     SE      p
#> (Intercept)        2.1713  1.6117  2.7309 0.2798 0.0000
#> Sepal.Width        0.4959  0.3237  0.6680 0.0861 0.0000
#> Petal.Length       0.8292  0.6922  0.9663 0.0685 0.0000
#> Petal.Width       -0.3152 -0.6175 -0.0128 0.1512 0.0371
#> Speciesversicolor -0.7236 -1.2039 -0.2432 0.2402 0.0026
#> Speciesvirginica  -1.0235 -1.6910 -0.3560 0.3337 0.0022
  stopifnot(all.equal(coef(lmfit), coef(biglmfit)))