keyUpdate.Rd
The following chores must be handled. 1. If the data.frame has variables which are not currently listed in the variable key's "name_old" column, then those new variables are added to the key. 2. If the data.frame has new values for previously existing variables, then those values must be added to the key. 3. If the old key has "name_new" or "class_new" designated for variables, those MUST be preserved in the new key for all new values of those variables.
keyUpdate(key, dframe, append = TRUE, safeNumericToInteger = TRUE)
key: A variable key.
dframe: A data.frame object.
append: If the key is a long key, should new rows be added to the end of the updated key? Default is TRUE. If FALSE, new rows will be sorted with the original values.
safeNumericToInteger: Default TRUE: Should we treat
variables which appear to be integers as integers? In many csv
data sets, the values coded c(1, 2, 3) are really
integers, not floats c(1.0, 2.0, 3.0). See
safeInteger.
## Need to consider implementing this:
## @param ignoreCase
Updated variable key.
This function will not alter key values for "class_old", "value_old" or "value_new" for variables that have no new information.
This function deduces whether the key provided is in the wide or long format from the class of the object.
## Original data frame has 2 variables
dat1 <- data.frame("Score" = c(1, 2, 3, 42, 4, 2),
"Gender" = c("M", "M", "M", "F", "F", "F"))
## New data has all of original dat1, plus a new variable "Weight",
## and has new values for "Gender" and "Score"
dat2 <- plyr::rbind.fill(dat1, data.frame("Score" = 7,
"Gender" = "other", "Weight" = rnorm(3)))
## Create a long key for the original data, specify some
## recodes for Score and Gender in value_new
key1.long <- keyTemplate(dat1, long = TRUE, varlab = TRUE)
key1.long$value_new <- gsub("42", "10", key1.long$value_new)
key1.long$value_new[key1.long$name_new == "Gender"] <-
mgsub(c("F", "M"), c("female", "male"),
key1.long$value_new[key1.long$name_new == "Gender"])
key1.long[key1.long$name_old == "Score", "name_new"] <- "NewScore"
keyUpdate(key1.long, dat2, append = TRUE)
#> name_old name_new class_old class_new value_old value_new missings recodes
#> 1 Gender Gender character character F female
#> 2 Gender Gender character character M male
#> 3 Gender Gender character character . .
#> 4 Gender Gender character character other other
#> 5 Score NewScore integer integer 1 1
#> 6 Score NewScore integer integer 2 2
#> 7 Score NewScore integer integer 3 3
#> 8 Score NewScore integer integer 4 4
#> 9 Score NewScore integer integer 42 10
#> 10 Score NewScore integer integer . .
#> 11 Score NewScore integer integer 7 7
#> 12 Weight Weight numeric numeric . .
## Throw away one row, make sure key still has Score values
dat2 <- dat2[-1,]
(key1.long.u <- keyUpdate(key1.long, dat2, append = FALSE))
#> name_old name_new class_old class_new value_old value_new missings recodes
#> 1 Gender Gender character character F female
#> 2 Gender Gender character character M male
#> 3 Gender Gender character character other other
#> 4 Gender Gender character character . .
#> 5 Score NewScore integer integer 1 1
#> 6 Score NewScore integer integer 2 2
#> 7 Score NewScore integer integer 3 3
#> 8 Score NewScore integer integer 4 4
#> 9 Score NewScore integer integer 42 10
#> 10 Score NewScore integer integer 7 7
#> 11 Score NewScore integer integer . .
#> 12 Weight Weight numeric numeric . .
## Change the key so that Score is converted to a character variable
key1.longc <- key1.long
key1.longc[key1.longc$name_old == "Score", "class_new"] <- "character"
keyUpdate(key1.longc, dat2, append = TRUE)
#> name_old name_new class_old class_new value_old value_new missings recodes
#> 1 Gender Gender character character F female
#> 2 Gender Gender character character M male
#> 3 Gender Gender character character . .
#> 4 Gender Gender character character other other
#> 5 Score NewScore integer character 1 1
#> 6 Score NewScore integer character 2 2
#> 7 Score NewScore integer character 3 3
#> 8 Score NewScore integer character 4 4
#> 9 Score NewScore integer character 42 10
#> 10 Score NewScore integer character . .
#> 11 Score NewScore integer character 7 7
#> 12 Weight Weight numeric numeric . .
str(dat3 <- keyApply(dat2, key1.longc))
#> Score (old var)
#> NewScore 2 3 4 7 42
#> 10 0 0 0 0 1
#> 2 2 0 0 0 0
#> 3 0 1 0 0 0
#> 4 0 0 1 0 0
#> <NA> 0 0 0 3 0
#> Gender (old var)
#> Gender F M other
#> female 3 0 0
#> male 0 2 0
#> <NA> 0 0 3
#> 'data.frame': 8 obs. of 2 variables:
#> $ NewScore: chr "2" "3" "10" "4" ...
#> $ Gender : chr "male" "male" "female" "female" ...
## Now try a wide key
key1.wide <- keyTemplate(dat1)
## Put in new values, same as in key1.long
key1.wide[key1.wide$name_old == "Score", c("name_new", "value_new")] <-
c("NewScore", "1|2|3|4|10|.")
key1.wide[key1.wide$name_old == "Gender", "value_new"] <- "female|male|."
## Make sure key1.wide is equivalent to key1.long:
## if the result is not TRUE, the check has failed
all.equal(long2wide(key1.long), key1.wide, check.attributes = FALSE)
#> [1] TRUE
(key1.wide.u <- keyUpdate(key1.wide, dat2))
#> name_old name_new class_old class_new value_old value_new
#> Gender Gender Gender character character F|M|other|. female|male|other|.
#> Score Score NewScore integer integer 1|2|3|4|42|7|. 1|2|3|4|10|7|.
#> Weight Weight Weight numeric numeric . .
#> missings recodes
#> Gender
#> Score
#> Weight
key1.long.to.wide <- long2wide(key1.long.u)
all.equal(key1.long.to.wide, key1.wide.u, check.attributes = FALSE)
#> [1] TRUE
str(keyApply(dat2, key1.wide.u))
#> Gender (old var)
#> Gender F M other
#> female 3 0 0
#> male 0 2 0
#> other 0 0 3
#> Score (old var)
#> NewScore 2 3 4 7 42
#> 2 2 0 0 0 0
#> 3 0 1 0 0 0
#> 4 0 0 1 0 0
#> 7 0 0 0 3 0
#> 10 0 0 0 0 1
#> Weight (old var)
#> Weight -0.407512329321948 0.397845800485835 1.58281715113856 <NA>
#> -0.407512329321948 1 0 0 0
#> 0.397845800485835 0 1 0 0
#> 1.58281715113856 0 0 1 0
#> <NA> 0 0 0 5
#> 'data.frame': 8 obs. of 3 variables:
#> $ Gender : chr "male" "male" "female" "female" ...
#> $ NewScore: int 2 3 10 4 2 7 7 7
#> $ Weight : num NA NA NA NA NA ...
mydf.key.path <- system.file("extdata", "mydf.key.csv", package = "kutils")
mydf.key <- keyImport(mydf.key.path)
#> keyImport guessed that is a wide format key.
##'
set.seed(112233)
N <- 20
## The new Jan data arrived!
mydf2 <- data.frame(x5 = rnorm(N),
x4 = rpois(N, lambda = 3),
x3 = ordered(sample(c("lo", "med", "hi"),
size = N, replace=TRUE),
levels = c("med", "lo", "hi")),
x2 = letters[sample(c(1:4,6), N, replace = TRUE)],
x1 = factor(sample(c("jan"), N, replace = TRUE)),
x7 = ordered(letters[sample(c(1:4,6), N, replace = TRUE)]),
x6 = sample(c(1:5), N, replace = TRUE),
stringsAsFactors = FALSE)
mydf.key2 <- keyUpdate(mydf.key, mydf2)
mydf.key2
#> name_old name_new class_old class_new value_old
#> x1 x1 x1 factor ordered cindy|bobby|peter|marcia|greg|jan|.
#> x2 x2 x2 character ordered f|d|c|b|a|.
#> x3 x3 x3 ordered ordered lo<med<hi<.
#> x4 x4 x4 integer integer 0|1|2|3|4|5|6|.
#> x5 x5 x5 numeric character .
#> x6 x6 x6 integer ordered 1|2|3|4|5|.
#> x7 x7 x7 ordered ordered f<d<c<b<a<.
#> value_new missings recodes
#> x1 Cindy<Bobby<Peter<Marcia<Greg<jan<.
#> x2 f<d<c<b<a<.
#> x3 lo<mid<mid<.
#> x4 0|1|2|3|4|5|6|. >= 999
#> x5 . <= -999
#> x6 F<D<C<B<A<.
#> x7 fail<fail<pass<pass<pass<.
mydf.key2["x1", "value_old"] <- "cindy|bobby|jan|peter|marcia|greg|."
mydf.key2["x1", "value_new"] <- "Cindy<Bobby<Jan<Peter<Marcia<Greg<."
##'
mydf.key.path <- system.file("extdata", "mydf.key.csv", package = "kutils")
mydf.path <- system.file("extdata", "mydf.csv", package = "kutils")
mydf <- read.csv(mydf.path, stringsAsFactors=FALSE)
mydf3 <- rbind(mydf, mydf2)
## Now recode with revised key
mydf4 <- keyApply(mydf3, mydf.key2)
#> x1 (old var)
#> x1 bobby cindy greg jan marcia peter
#> Cindy 0 34 0 0 0 0
#> Bobby 43 0 0 0 0 0
#> Jan 0 0 0 20 0 0
#> Peter 0 0 0 0 0 42
#> Marcia 0 0 0 0 51 0
#> Greg 0 0 30 0 0 0
#> x2 (old var)
#> x2 a b c d f
#> f 0 0 0 0 36
#> d 0 0 0 39 0
#> c 0 0 55 0 0
#> b 0 45 0 0 0
#> a 45 0 0 0 0
#> x3 (old var)
#> x3 hi lo med
#> lo 0 72 0
#> mid 81 0 67
#> x4 (old var)
#> x4 0 1 2 3 4 5 6 7 11 999
#> 0 11 0 0 0 0 0 0 0 0 0
#> 1 0 35 0 0 0 0 0 0 0 0
#> 2 0 0 50 0 0 0 0 0 0 0
#> 3 0 0 0 42 0 0 0 0 0 0
#> 4 0 0 0 0 37 0 0 0 0 0
#> 5 0 0 0 0 0 23 0 0 0 0
#> 6 0 0 0 0 0 0 7 0 0 0
#> <NA> 0 0 0 0 0 0 0 4 1 10
#> [1] "Variable x5 has 20 unique values. Too large for a table."
#> x6 (old var)
#> x6 1 2 3 4 5
#> F 49 0 0 0 0
#> D 0 43 0 0 0
#> C 0 0 35 0 0
#> B 0 0 0 50 0
#> A 0 0 0 0 43
#> x7 (old var)
#> x7 a b c d f
#> fail 0 0 0 50 51
#> pass 39 43 37 0 0
rockchalk::summarize(mydf4)
#> Numeric variables
#> x4
#> min 0
#> med 3
#> max 6
#> mean 2.761
#> sd 1.517
#> skewness 0.171
#> kurtosis -0.762
#> nobs 205
#> nmissing 15
#>
#> Nonnumeric variables
#> x1 x2 x3
#> Marcia : 51 f: 36 lo : 72
#> Bobby : 43 d: 39 mid: 148
#> Peter : 42 c: 55
#> Cindy : 34 b: 45
#> (All Others): 50 a: 45
#> nobs : 220.000 nobs : 220.000 nobs : 220.000
#> nmiss : 0.000 nmiss : 0.000 nmiss : 0.000
#> entropy : 2.528 entropy : 2.306 entropy : 0.912
#> normedEntropy: 0.978 normedEntropy: 0.993 normedEntropy: 0.912
#> x5 x6 x7
#> -0.00611343753042384: 1 F: 49 fail: 101
#> -0.00910333746862187: 1 D: 43 pass: 119
#> -0.0198082450349602 : 1 C: 35
#> -0.0439611186187257 : 1 B: 50
#> (All Others) : 206 A: 43
#> nobs : 210.00 nobs : 220.000 nobs : 220.000
#> nmiss : 10.00 nmiss : 0.000 nmiss : 0.000
#> entropy : 7.71 entropy : 2.311 entropy : 0.995
#> normedEntropy: 1.00 normedEntropy: 0.995 normedEntropy: 0.995