outliers.Rd
This dataset is approximately bell-shaped, but with some outliers. It is meant to be used for demonstration purposes. If students are tempted to throw out all outliers, then have them work with this data (or use a scaled/centered/shuffled version as errors in a regression problem) and see how many throw away 3/4 of the data before rethinking their strategy.
data(outliers)
The format is: num [1:100] -1.548 0.172 -0.638 0.233 -0.228 ...
This is simulated data meant to demonstrate "outliers".
Simulated, see the examples section.
# Load the simulated dataset and look at its distribution.
data(outliers)
# Normal Q-Q plot of the values, with the reference line added on top.
qqnorm(outliers)
qqline(outliers)
# Histogram of the same values.
hist(outliers)
o.chuck <- function(x) { # function to throw away outliers
  # Applies the 1.5 * IQR rule over and over: every value further than
  # 1.5 interquartile ranges outside the quartiles is reported with cat()
  # and dropped, then the quartiles are recomputed on what is left.
  # Stops, returning the surviving values, once a pass flags nothing.
  repeat {
    quartiles <- quantile(x, c(1,3)/4, names=FALSE)
    fence <- 1.5 * diff(quartiles)
    flagged <- x < quartiles[1] - fence | x > quartiles[2] + fence
    if (!any(flagged)) {
      return(x)
    }
    cat('Removing ', paste(x[flagged], collapse=', '), '\n')
    x <- x[!flagged]
  }
}
# Run the repeated 1.5 * IQR removal on the dataset; each pass prints the
# values it discards.
x <- o.chuck( outliers )
#> Removing -38.0188470520892, 41.3998761630706
#> Removing 37.7254590518428
#> Removing -32.617721056949, 34.7123828196885, -33.3173386701734
#> Removing -30.5897598576629, -31.3666838246065, 32.2045826847981
#> Removing -25.7792864466526, 28.8645653266685
#> Removing -25.5510450110133, 27.1310262012897, 27.2200556465563
#> Removing -24.2465147302268, 24.6533216159894
#> Removing -21.0247217967932, 23.4707729629435
#> Removing 20.1533689283997, 21.6405669647568, -20.2171851949336, 20.8010219332559
#> Removing 19.2641800976782, 19.0013586098618, 18.7616896977341, 19.457788375338, -18.8293239848544, -18.9437702752492
#> Removing 17.530663157418, 17.148643517833, 17.4680712209061, -17.268238994146, -17.3201673776458, -17.5384160660842
#> Removing -15.9924647726683, 16.0548292023337, -16.3808896481785, 16.1103064916208, -16.648796575613, -16.4504844329899, -16.7398192623181, -16.8005148185563
#> Removing -13.8846170350603, 14.5899851932271, -13.9943191858823, 14.8025506681819
#> Removing -12.489787807559, 13.0077936620969
#> Removing -10.1881202676564, 12.114181708785
#> Removing -7.82792554259193, -8.00899605308949, 9.73686423081927
#> Removing 8.13154010665404
#> Removing -5.72028441291228, -5.7026701005752, 7.1332010769708
#> Removing 6.5934127371113
#> Removing -4.50871071033652, 5.60745401739877
#> Removing -3.79409383578585, 4.55653891000088, -3.82862237780094, -4.21014222590597, 4.58832966905574, -3.72496986013893, 4.65320297619834, -3.89352522213875
#> Removing 4.05151352045196, -3.17976890362346, 3.43609572821794, -3.04709006951779, 4.0309149972316, 4.33018518245162
#> Removing 2.96065350034087
# Number of observations that survived the repeated removal.
length(x)
#> [1] 25
# Regression demo: the shuffled, standardized outliers (times 10) are used as
# the errors of a straight-line model, which is then fitted by ordinary least
# squares and by robust regression. The whole demo is skipped when MASS, which
# provides rlm(), cannot be loaded.
if(require(MASS)) {
  # Seed the random number generator from a string so sample() is reproducible.
  char2seed('robust')
  x <- 1:100
  y <- 3 + 2*x + sample(scale(outliers))*10
  plot(x,y)
  # Ordinary least-squares fit, drawn in red.
  fit <- lm(y~x)
  abline(fit, col='red')
  # Robust fit from MASS, drawn as a dashed blue line.
  fit.r <- rlm(y~x)
  abline(fit.r, col='blue', lty='dashed')
  # print() is needed here: inside { } only the value of the last expression
  # is auto-printed, so a bare rbind() call would be silently discarded.
  # Row 1 holds the lm coefficients, row 2 the rlm coefficients.
  print(rbind(coef(fit), coef(fit.r)))
  # How many residuals of the least-squares fit survive the repeated
  # outlier removal.
  length(o.chuck(resid(fit)))
}
#> Loading required package: MASS
#> Removing 25.8775509543776
#> Removing 23.1101877518972
#> [1] 98
### The data was generated using code similar to:
char2seed('outlier')
# Start from 25 standard normal draws.
outliers <- rnorm(25)
# Side on which the next point is added: 1 = above, -1 = below.
dir <- 1
# Add one point per pass until there are 100 values.
while( length(outliers) < 100 ){
# Quartiles of the current values together with one infinite value on the
# side that is about to be extended.
qq <- quantile(c(outliers, dir*Inf), c(1,3)/4)
# 1.5 + dir/2 evaluates to 2 (upper quartile) when dir is 1 and to 1 (lower
# quartile) when dir is -1. The new point is placed beyond that quartile by
# 1.55 times the quartile spread diff(qq) plus a positive random amount,
# i.e. just outside the 1.5 multiplier that o.chuck uses for its cutoff.
outliers <- c(outliers,
qq[ 1.5 + dir/2 ] + dir*1.55*diff(qq) + dir*abs(rnorm(1)) )
# Alternate sides for the next pass.
dir <- -dir
}