chunk.ffdf.rd: Chunking method for ff_vector and ffdf objects (row-wise), automatically considering RAM requirements from the record size as calculated from sum(.rambytes[vmode])
optional integer scalar representing the bytes needed to process an element of the ff_vector or a single row of the ffdf
integer scalar limiting the number of bytes to be processed in one chunk, default from getOption("ffbatchbytes"), see also .rambytes
further arguments passed to chunk
A list with ri indexes each representing one chunk
# Build a 26-row data.frame with one double, one factor and one ordered
# factor column, then convert it to an ffdf.
x <- data.frame(x=as.double(1:26), y=factor(letters), z=ordered(LETTERS), stringsAsFactors = TRUE)
a <- as.ffdf(x)
# Expected number of chunks for all 26 rows with a 300-byte budget:
# rows per chunk = budget %/% bytes per row, where bytes per row is
# sum(.rambytes[vmode(a)]).
ceiling(26 / (300 %/% sum(.rambytes[vmode(a)])))
#> [1] 2
# Chunk the whole ffdf with that budget; the result below has the two
# range indexes predicted above.
chunk(a, BATCHBYTES=300)
#> $`1:13`
#> range index (ri) from 1 to 13 maxindex 26
#>
#> $`14:26`
#> range index (ri) from 14 to 26 maxindex 26
#>
# Same calculation for a 13-row sub-range with a 100-byte budget.
ceiling(13 / (100 %/% sum(.rambytes[vmode(a)])))
#> [1] 3
# Chunk only rows 1..13; maxindex in the output still reports the full
# 26 rows of the ffdf.
chunk(a, from=1, to = 13, BATCHBYTES=100)
#> $`1:5`
#> range index (ri) from 1 to 5 maxindex 26
#>
#> $`6:10`
#> range index (ri) from 6 to 10 maxindex 26
#>
#> $`11:13`
#> range index (ri) from 11 to 13 maxindex 26
#>
# Drop the ffdf and run the garbage collector
# (NOTE(review): presumably to release the ff resources promptly - confirm).
rm(a); gc()
#> used (Mb) gc trigger (Mb) max used (Mb)
#> Ncells 1025704 54.8 1994352 106.6 1994352 106.6
#> Vcells 1891478 14.5 8388608 64.0 3877600 29.6
# Second example: fit a linear regression chunk by chunk with biglm on an
# ffdf and compare it with an ordinary in-memory lm() fit.
message("dummy example for linear regression with biglm on ffdf")
#> dummy example for linear regression with biglm on ffdf
library(biglm)
#> Loading required package: DBI
message("NOTE that . in formula requires calculating terms manually
because . as a data-dependant term is not allowed in biglm")
#> NOTE that . in formula requires calculating terms manually
#> because . as a data-dependant term is not allowed in biglm
# All predictors are spelled out explicitly instead of using `.`
# (see the message above).
form <- Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width + Species
# Reference fit on the full in-memory iris data.
lmfit <- lm(form, data=iris)
# ffdf copy of iris that the chunked fit reads from.
firis <- as.ffdf(iris)
# Walk over firis in range-index chunks (by=50) and build the biglm fit
# incrementally: the chunk starting at row 1 creates the model, every
# later chunk is folded into it with update().
for (i in chunk(firis, by = 50)) {
  if (i[1] != 1) {
    # Any chunk after the first: extend the existing fit with these rows.
    message("next chunk is: ", i[[1]], ":", i[[2]])
    biglmfit <- update(biglmfit, firis[i, , drop = FALSE])
    next
  }
  # First chunk: initialise the model from its rows.
  message("first chunk is: ", i[[1]], ":", i[[2]])
  biglmfit <- biglm(form, data = firis[i, , drop = FALSE])
}
#> first chunk is: 1:50
#> next chunk is: 51:100
#> next chunk is: 101:150
# Summary of the reference in-memory lm() fit.
summary(lmfit)
#>
#> Call:
#> lm(formula = form, data = iris)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.79424 -0.21874 0.00899 0.20255 0.73103
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 2.17127 0.27979 7.760 1.43e-12 ***
#> Sepal.Width 0.49589 0.08607 5.761 4.87e-08 ***
#> Petal.Length 0.82924 0.06853 12.101 < 2e-16 ***
#> Petal.Width -0.31516 0.15120 -2.084 0.03889 *
#> Speciesversicolor -0.72356 0.24017 -3.013 0.00306 **
#> Speciesvirginica -1.02350 0.33373 -3.067 0.00258 **
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.3068 on 144 degrees of freedom
#> Multiple R-squared: 0.8673, Adjusted R-squared: 0.8627
#> F-statistic: 188.3 on 5 and 144 DF, p-value: < 2.2e-16
#>
# Summary of the chunk-wise biglm fit; the reported sample size of 150
# shows that all three chunks were incorporated.
summary(biglmfit)
#> Large data regression model: biglm(form, data = firis[i, , drop = FALSE])
#> Sample size = 150
#> Coef (95% CI) SE p
#> (Intercept) 2.1713 1.6117 2.7309 0.2798 0.0000
#> Sepal.Width 0.4959 0.3237 0.6680 0.0861 0.0000
#> Petal.Length 0.8292 0.6922 0.9663 0.0685 0.0000
#> Petal.Width -0.3152 -0.6175 -0.0128 0.1512 0.0371
#> Speciesversicolor -0.7236 -1.2039 -0.2432 0.2402 0.0026
#> Speciesvirginica -1.0235 -1.6910 -0.3560 0.3337 0.0022
# Fail if the coefficients of the two fits differ beyond all.equal()'s
# numeric tolerance.
stopifnot(all.equal(coef(lmfit), coef(biglmfit)))