Cut a Numeric Variable into Intervals
cut2.Rd
cut2 is a function like cut but left endpoints are inclusive and labels are of
the form [lower, upper), except that last interval is [lower,upper].
If cuts are given, will by default make sure that cuts include entire
range of x.
Also, if cuts are not given, will cut x into quantile groups
(g given) or groups
with a given minimum number of observations (m). Whereas cut creates a
category object, cut2 creates a factor object. m is not guaranteed but is a target.
cutGn guarantees that the grouped variable will have a minimum of m observations in any group. This is done by an exhaustive algorithm that runs fast due to being coded in Fortran.
Usage
cut2(x, cuts, m=150, g, levels.mean=FALSE, digits, minmax=TRUE,
oneval=TRUE, onlycuts=FALSE, formatfun=format, ...)
cutGn(x, m, what=c('mean', 'factor', 'summary', 'cuts', 'function'), rcode=FALSE)
Arguments
- x
numeric vector to classify into intervals
- cuts
cut points
- m
desired minimum number of observations in a group. The algorithm does not guarantee that all groups will have at least
m observations.
- g
number of quantile groups
- levels.mean
set to
TRUE to make the new categorical vector have a levels attribute that is the group means of x instead of interval endpoint labels
- digits
number of significant digits to use in constructing levels. Default is 3 (5 if
levels.mean=TRUE)
- minmax
if cuts is specified but
min(x)<min(cuts) or max(x)>max(cuts), augments cuts to include min and max x
- oneval
if an interval contains only one unique value, the interval will be labeled with the formatted version of that value instead of the interval endpoints, unless
oneval=FALSE
- onlycuts
set to
TRUE to only return the vector of computed cuts. This consists of the interior values plus outer ranges.
- formatfun
formatting function, supports formula notation (if
rlang is installed)
- ...
additional arguments passed to
formatfun
- what
specifies the kind of vector values to return from
cutGn, the default being like 'levels.mean' of cut2. Specify 'summary' to return a numeric 3-column matrix that summarizes the intervals satisfying the m requirement. Use what='cuts' to only return the vector of computed cutpoints. To create a function that will recode the variable in play using the same intervals as computed by cutGn, specify what='function'. This function will have a what argument to allow the user to decide later whether to recode into interval means or into a factor variable.
- rcode
set to
TRUE to run the cutgn algorithm in R. This is useful for speed comparisons with the default compiled code.
Value
a factor variable with levels of the form [a,b) or formatted means
(character strings) unless onlycuts is TRUE in which case
a numeric vector is returned
Examples
set.seed(1)
x <- runif(1000, 0, 100)
z <- cut2(x, c(10,20,30))
table(z)
#> z
#> [ 0.131, 10.000) [ 10.000, 20.000) [ 20.000, 30.000) [ 30.000, 99.993]
#> 96 104 93 707
table(cut2(x, g=10)) # quantile groups
#>
#> [ 0.131, 10.5) [10.505, 20.2) [20.168, 31.2) [31.204, 39.8) [39.784, 48.4)
#> 100 100 100 100 100
#> [48.435, 59.6) [59.645, 70.7) [70.666, 79.7) [79.731, 91.0) [91.037,100.0]
#> 100 100 100 100 100
table(cut2(x, m=50)) # group x into intervals with at least 50 obs.
#>
#> [ 0.131, 5.52) [ 5.516, 10.51) [10.505, 15.48) [15.483, 20.17) [20.168, 25.82)
#> 50 50 50 50 50
#> [25.817, 31.20) [31.204, 35.32) [35.320, 39.78) [39.784, 44.15) [44.146, 48.43)
#> 50 50 50 50 50
#> [48.435, 52.78) [52.778, 59.64) [59.645, 65.09) [65.087, 70.67) [70.666, 74.76)
#> 50 50 50 50 50
#> [74.764, 79.73) [79.731, 85.51) [85.508, 91.04) [91.037, 95.37) [95.373, 99.99]
#> 50 50 50 50 50
table(cutGn(x, m=50, what='factor'))
#>
#> [ 0.1314657, 5.419043] [ 5.5164286,10.498764] [10.5050139,15.278814]
#> 50 50 50
#> [15.4831611,19.944218] [20.1681931,25.801678] [25.8165927,31.177237]
#> 50 50 50
#> [31.2036420,35.166672] [35.3197272,39.770609] [39.7843627,43.854007]
#> 50 50 50
#> [44.1459143,48.216957] [48.4349524,52.731078] [52.7781836,59.571200]
#> 50 50 50
#> [59.6448456,64.957946] [65.0870467,70.585901] [70.6661532,74.669827]
#> 50 50 50
#> [74.7635063,79.683608] [79.7308826,85.400150] [85.5082356,91.006093]
#> 50 50 50
#> [91.0370304,95.354840] [95.3732650,99.993059]
#> 50 50
f <- cutGn(x, m=50, what='function')
f
#> function (x = numeric(0), lower = c(0.131465657614172, 5.51642864011228,
#> 10.5050138663501, 15.4831611318514, 20.1681931037456, 25.8165926672518,
#> 31.2036419520155, 35.3197271935642, 39.7843627491966, 44.1459143068641,
#> 48.4349524369463, 52.7781835990027, 59.6448455704376, 65.087046707049,
#> 70.6661531934515, 74.7635063482448, 79.730882588774, 85.5082356370986,
#> 91.0370304249227, 95.3732650028542), upper = c(5.41904293932021,
#> 10.4987640399486, 15.2788141276687, 19.9442182900384, 25.8016780717298,
#> 31.1772374436259, 35.1666721282527, 39.7706091869622, 43.8540067756549,
#> 48.2169573195279, 52.7310776989907, 59.5711996313184, 64.9579460499808,
#> 70.5859006382525, 74.6698269620538, 79.6836083987728, 85.4001502273604,
#> 91.0060926806182, 95.3548396006227, 99.9930593650788), means = c(2.94814029010013,
#> 7.68022080603987, 12.9809578326531, 17.5481472252868, 22.8589586704038,
#> 28.0511431978084, 33.4899473823607, 37.5125395171344, 41.9874383662827,
#> 46.3290602895431, 50.4720099582337, 55.963427466806, 62.3762246477418,
#> 67.7136786920018, 72.4179831761867, 77.1320453374647, 82.7218419755809,
#> 88.1628532707691, 93.3742601098493, 97.6624671579339), levels = c("[ 0.1314657, 5.419043]",
#> "[ 5.5164286,10.498764]", "[10.5050139,15.278814]", "[15.4831611,19.944218]",
#> "[20.1681931,25.801678]", "[25.8165927,31.177237]", "[31.2036420,35.166672]",
#> "[35.3197272,39.770609]", "[39.7843627,43.854007]", "[44.1459143,48.216957]",
#> "[48.4349524,52.731078]", "[52.7781836,59.571200]", "[59.6448456,64.957946]",
#> "[65.0870467,70.585901]", "[70.6661532,74.669827]", "[74.7635063,79.683608]",
#> "[79.7308826,85.400150]", "[85.5082356,91.006093]", "[91.0370304,95.354840]",
#> "[95.3732650,99.993059]"), what = c("mean", "factor"))
#> {
#> what <- match.arg(what)
#> nint <- length(lower)
#> u <- unique(c(lower, max(upper)))
#> y <- approx(u, 1:length(u), xout = x, method = "constant")$y
#> y[!is.na(y) & y > nint] <- nint
#> if (what == "mean")
#> means[y]
#> else factor(y, 1:nint, levels)
#> }
#> <environment: 0x5e2f82c7af98>
f(c(-1, 2, 10), what='mean')
#> [1] NA 2.948140 7.680221
f(c(-1, 2, 10), what='factor')
#> [1] <NA> [ 0.1314657, 5.419043] [ 5.5164286,10.498764]
#> 20 Levels: [ 0.1314657, 5.419043] ... [95.3732650,99.993059]
if (FALSE) { # \dontrun{
x <- round(runif(200000), 3)
system.time(a <- cutGn(x, m=20)) # 0.02s
system.time(b <- cutGn(x, m=20, rcode=TRUE)) # 1.51s
identical(a, b)
} # }