Chunking method for ff_vector and ffdf objects (row-wise) that automatically accounts for RAM requirements, using the record size calculated as sum(.rambytes[vmode])

# S3 method for class 'ff_vector'
chunk(x
, RECORDBYTES = .rambytes[vmode(x)], BATCHBYTES = getOption("ffbatchbytes"), ...)
# S3 method for class 'ffdf'
chunk(x
, RECORDBYTES = sum(.rambytes[vmode(x)]), BATCHBYTES = getOption("ffbatchbytes"), ...)

Arguments

x

ff or ffdf

RECORDBYTES

optional integer scalar giving the number of bytes needed to process one element of the ff_vector or one row of the ffdf

BATCHBYTES

integer scalar limiting the number of bytes to be processed in one chunk, default from getOption("ffbatchbytes"), see also .rambytes

...

further arguments passed to chunk

Value

A list of ri (range index) objects, each representing one chunk.

Author

Jens Oehlschlägel

See also

Examples

  x <- data.frame(x=as.double(1:26), y=factor(letters), z=ordered(LETTERS), stringsAsFactors = TRUE)
  a <- as.ffdf(x)
  ceiling(26 / (300 %/% sum(.rambytes[vmode(a)])))
#> [1] 2
  chunk(a, BATCHBYTES=300)
#> $`1:13`
#> range index (ri) from 1 to 13 maxindex 26 
#> 
#> $`14:26`
#> range index (ri) from 14 to 26 maxindex 26 
#> 
  ceiling(13 / (100 %/% sum(.rambytes[vmode(a)])))
#> [1] 3
  chunk(a, from=1, to = 13, BATCHBYTES=100)
#> $`1:5`
#> range index (ri) from 1 to 5 maxindex 26 
#> 
#> $`6:10`
#> range index (ri) from 6 to 10 maxindex 26 
#> 
#> $`11:13`
#> range index (ri) from 11 to 13 maxindex 26 
#> 
  rm(a); gc()
#>           used (Mb) gc trigger  (Mb) max used  (Mb)
#> Ncells 1025704 54.8    1994352 106.6  1994352 106.6
#> Vcells 1891478 14.5    8388608  64.0  3877600  29.6

  message("dummy example for linear regression with biglm on ffdf")
#> dummy example for linear regression with biglm on ffdf
  library(biglm)
#> Loading required package: DBI

  message("NOTE that . in formula requires calculating terms manually
    because . as a data-dependant term is not allowed in biglm")
#> NOTE that . in formula requires calculating terms manually
#>     because . as a data-dependant term is not allowed in biglm
  form <- Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species

  lmfit <- lm(form, data=iris)

  firis <- as.ffdf(iris)
  for (i in chunk(firis, by=50)){
    if (i[1]==1){
      message("first chunk is: ", i[[1]],":",i[[2]])
      biglmfit <- biglm(form, data=firis[i,,drop=FALSE])
    }else{
      message("next chunk is: ", i[[1]],":",i[[2]])
      biglmfit <- update(biglmfit, firis[i,,drop=FALSE])
    }
  }
#> first chunk is: 1:50
#> next chunk is: 51:100
#> next chunk is: 101:150

  summary(lmfit)
#> 
#> Call:
#> lm(formula = form, data = iris)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -0.79424 -0.21874  0.00899  0.20255  0.73103 
#> 
#> Coefficients:
#>                   Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        2.17127    0.27979   7.760 1.43e-12 ***
#> Sepal.Width        0.49589    0.08607   5.761 4.87e-08 ***
#> Petal.Length       0.82924    0.06853  12.101  < 2e-16 ***
#> Petal.Width       -0.31516    0.15120  -2.084  0.03889 *  
#> Speciesversicolor -0.72356    0.24017  -3.013  0.00306 ** 
#> Speciesvirginica  -1.02350    0.33373  -3.067  0.00258 ** 
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared:  0.8673,	Adjusted R-squared:  0.8627 
#> F-statistic: 188.3 on 5 and 144 DF,  p-value: < 2.2e-16
#> 
  summary(biglmfit)
#> Large data regression model: biglm(form, data = firis[i, , drop = FALSE])
#> Sample size =  150 
#>                      Coef    (95%     CI)     SE      p
#> (Intercept)        2.1713  1.6117  2.7309 0.2798 0.0000
#> Sepal.Width        0.4959  0.3237  0.6680 0.0861 0.0000
#> Petal.Length       0.8292  0.6922  0.9663 0.0685 0.0000
#> Petal.Width       -0.3152 -0.6175 -0.0128 0.1512 0.0371
#> Speciesversicolor -0.7236 -1.2039 -0.2432 0.2402 0.0026
#> Speciesvirginica  -1.0235 -1.6910 -0.3560 0.3337 0.0022
  stopifnot(all.equal(coef(lmfit), coef(biglmfit)))