Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x

Model object

...

Additional arguments to be passed to the low level functions

n

Number of simulated values/individuals

p

Parameter value (optional)

normal

Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition

cond

for internal use

sigma

Default residual variance (1)

rho

Default covariance parameter (0.5)

X

Optional matrix of fixed values of variables (manipulation)

Return Inverse link transformed data

latent

Include latent variables (default TRUE)

use.labels

convert categorical variables to factors before applying transformation

seed

Random seed

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 1  2.5577940 1
#> 2 1  0.6289062 1
#> 3 0 -2.5114518 0
#> 4 0 -0.6517820 0
#> 5 1  1.9549094 1
#> 6 0  0.3479851 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value  P-value
#> Regressions:                                      
#>    y~x        0.98403    0.08625 11.40896   <1e-12
#>    y~z        1.16438    0.16804  6.92926 4.23e-12
#>     x~z       1.00992    0.06547 15.42639   <1e-12
#> Intercepts:                                       
#>    y         -0.16286    0.09836 -1.65584  0.09775
#>    x         -0.00032    0.04669 -0.00675   0.9946
#> Dispersion:                                       
#>    x          1.07372                             
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y          x z
#> 1 1  1.4902578 1
#> 2 0 -1.4551789 1
#> 3 1  1.8096418 1
#> 4 0 -1.4846894 0
#> 5 1  0.9795036 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>     y          x z
#> 1  14  1.0060825 1
#> 2   4  0.6183958 1
#> 3   7  0.4521507 1
#> 4   2  0.2320851 1
#> 5 121  2.4243668 1
#> 6   0 -1.2190155 1
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99872    0.00200 1001.16217   <1e-12
#>    y~z          0.98636    0.01193   82.70802   <1e-12
#>     x~z         1.01006    0.02265   44.59182   <1e-12
#> Intercepts:                                           
#>    y           -0.98042    0.01237  -79.23054   <1e-12
#>    x           -0.00907    0.01940   -0.46750   0.6401
#> Dispersion:                                           
#>    x            1.00426                               
mean(d$z); lava:::expit(1)
#> [1] 0.7256
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.2252 -0.9823  0.0325  0.9402  5.4313 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.98956    0.04439   22.29   <2e-16 ***
#> x            4.03106    0.04417   91.25   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.402 on 998 degrees of freedom
#> Multiple R-squared:  0.893,	Adjusted R-squared:  0.8929 
#> F-statistic:  8327 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.499081   0.005034   99.14   <2e-16 ***
#> x           1.036826   0.016498   62.85   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5158464)
#> 
#>     Null deviance: 8541.5  on 9999  degrees of freedom
#> Residual deviance: 5551.2  on 9998  degrees of freedom
#> AIC: 20680
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g) # \dontrun{}

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1] -0.06146023 -0.15297105 -1.55859772 -2.33899184 -0.13261446 -0.22364401
#>  [7] -0.13945111  0.31993208  0.19293624  0.17868933

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>          y
#> y 1.187236
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>            y
#> y 0.05303781

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -2.9770 -0.6711 -0.0585  0.7070  3.6556 
#> 
#> Coefficients:
#>                Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    0.007315   0.061044   0.120    0.905    
#> x              0.972089   0.047008  20.679   <2e-16 ***
#> z              0.995315   0.054426  18.287   <2e-16 ***
#> I(z > 0)TRUE   0.025211   0.105441   0.239    0.811    
#> x:I(z > 0)TRUE 1.021637   0.065233  15.661   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.014 on 995 degrees of freedom
#> Multiple R-squared:  0.7703,	Adjusted R-squared:  0.7694 
#> F-statistic: 834.4 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y           x          z             t
#> 1    0.4641322  0.90979684  0.9450201  4.386143e-01
#> 2    0.5780113  1.11989582  0.3499724  2.022880e-01
#> 3    0.1906364  0.01722635  0.7479131  1.425794e-01
#> 4    1.0765679 -0.04989872  0.2386936  2.569699e-01
#> 5    1.6427577  0.54597959  0.8221244  1.350551e+00
#> 6  -41.9514857 -0.31551318  4.8776866 -2.046262e+02
#> 7    1.2838462  1.25150386  2.0931514  2.687285e+00
#> 8    1.6555105  0.40068497  0.7282096  1.205559e+00
#> 9    1.4386263  0.07996742 10.6085516  1.526174e+01
#> 10   0.0657008  1.61233073  0.1113936  7.318652e-03
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }

##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>           y1        y2        y3
#> y1 1.7879796 0.8986911 0.7334187
#> y2 0.8986911 1.9711804 0.7104499
#> y3 0.7334187 0.7104499 1.7411134

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x          z
#> 1 1002.5079  1.9003038  1.3207004
#> 2 1001.5635 -0.6600688  0.6022015
#> 3  999.8989  0.0284914  0.5194575
#> 4  997.8830 -1.0465310 -0.6467731
#> 5 1000.2193  0.5358772  0.3953164
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1  1.4379189 -0.5940062  0.2075005
#> 2  0.2787988  0.1730358 -0.2492215
#> 3  0.1981616  0.7085807 -0.5910783
#> 4 -1.2663856 -1.4670154  0.2479143
#> 5 -1.8845258 -1.2254667  1.0021884
sim(m,5,p=c(m1=100))
#>           y          x          z
#> 1 101.61422  1.2893309  0.1184869
#> 2  97.25223 -1.5376382 -1.2306975
#> 3 100.27654 -0.8698709  0.9848374
#> 4 102.17923  1.1440416  0.3391601
#> 5  97.62066 -1.0514781 -0.4981777

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5008 3013 1979 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.02614      1.99230     -1.00798  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22520 
#> Residual Deviance: 9971 	AIC: 28360
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.00163      2.98778     -1.00968  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34460 
#> Residual Deviance: 10310 	AIC: 28690

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  A -0.2367056 -0.6076527  TRUE
#> 2  B  1.8585600  3.3210809 FALSE
#> 3  A  0.9679083 -2.1797290  TRUE
#> 4  B  2.8827087  1.8835530 FALSE
#> 5  A -1.3114255  1.6318984  TRUE
#> 6  C -1.2983199 -0.4557392 FALSE
#> 7  B  0.1109659  5.3286566 FALSE
#> 8  B  2.9988011  4.7953034 FALSE
#> 9  A -0.4407522  0.6218654  TRUE
#> 10 A -0.1266534  0.5531538  TRUE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>              y           x
#> 1   0.81233476 -0.97131096
#> 2  -0.64987060  0.20623939
#> 3  -2.01949824 -0.53276023
#> 4   4.48232406  2.44643312
#> 5   2.17489768 -0.08444089
#> 6  -1.39811580 -0.81595413
#> 7   2.19349947  0.96794188
#> 8  -0.03079853  0.05816394
#> 9   1.07318999  1.10372137
#> 10 -2.22883211 -1.61141878
sim(m2,10) ## Faster
#>             y          x
#> 1  -3.2338549 -1.7731064
#> 2   0.3875014  0.1975684
#> 3   3.2081255  1.5528790
#> 4  -2.0997483 -0.3177804
#> 5   2.2187278  0.6917520
#> 6   1.1863361  0.4877073
#> 7   2.5965386  1.4051690
#> 8   1.4001423  0.6924867
#> 9  -0.4259144  0.6072422
#> 10 -1.5955365 -1.6338646