Statistics with R, Notes
Foreword
Output options: the ‘tango’ syntax and the ‘readable’ theme.
Codes and snippets.
Descriptive Statistics
Extract basic, exploratory statistics from datasets with apply
, summary
, fivenum
, describe
, and stat.desc
.
# dataset
head ( longley , 3 )
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
## 1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
## 1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
# apply a function
# excluding missing values
sapply ( longley , mean , na.rm = TRUE )
## GNP.deflator GNP Unemployed Armed.Forces Population
## 101.6813 387.6984 319.3313 260.6687 117.4240
## Year Employed
## 1954.5000 65.3170
# mean, median, 25th and 75th quartiles, min, max
summary ( longley )
1
2
3
4
5
6
7
8
9
10
11
12
13
14 ## GNP.deflator GNP Unemployed Armed.Forces
## Min. : 83.00 Min. :234.3 Min. :187.0 Min. :145.6
## 1st Qu.: 94.53 1st Qu.:317.9 1st Qu.:234.8 1st Qu.:229.8
## Median :100.60 Median :381.4 Median :314.4 Median :271.8
## Mean :101.68 Mean :387.7 Mean :319.3 Mean :260.7
## 3rd Qu.:111.25 3rd Qu.:454.1 3rd Qu.:384.2 3rd Qu.:306.1
## Max. :116.90 Max. :554.9 Max. :480.6 Max. :359.4
## Population Year Employed
## Min. :107.6 Min. :1947 Min. :60.17
## 1st Qu.:111.8 1st Qu.:1951 1st Qu.:62.71
## Median :116.8 Median :1954 Median :65.50
## Mean :117.4 Mean :1954 Mean :65.32
## 3rd Qu.:122.3 3rd Qu.:1958 3rd Qu.:68.29
## Max. :130.1 Max. :1962 Max. :70.55
# Tukey min, lower-hinge, median, upper-hinge, max
fivenum ( longley $ GNP )
## [1] 234.289 306.787 381.427 463.625 554.894
# n, nmiss, unique, mean, 5, 10, 25, 50, 75, 90, 95th percentiles
# 5 lowest and 5 highest scores
library ( Hmisc )
describe ( longley )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102 ## longley
##
## 7 Variables 16 Observations
## ---------------------------------------------------------------------------
## GNP.deflator
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 101.7 12.74 86.90 88.35
## .25 .50 .75 .90 .95
## 94.53 100.60 111.25 114.95 116.00
##
## Value 83.0 88.2 88.5 89.5 96.2 98.1 99.0 100.0 101.2 104.6
## Frequency 1 1 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 108.4 110.8 112.6 114.2 115.7 116.9
## Frequency 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## GNP
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 387.7 117.8 252.1 258.7
## .25 .50 .75 .90 .95
## 317.9 381.4 454.1 510.4 527.4
##
## Value 234.289 258.054 259.426 284.599 328.975 346.999 363.112 365.385
## Frequency 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 397.469 419.180 442.769 444.546 482.704 502.601 518.173 554.894
## Frequency 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## Unemployed
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 319.3 110.1 191.6 201.6
## .25 .50 .75 .90 .95
## 234.8 314.4 384.2 434.4 471.2
##
## Value 187.0 193.2 209.9 232.5 235.6 282.2 290.4 293.6 335.1 357.8
## Frequency 1 1 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 368.2 381.3 393.1 400.7 468.1 480.6
## Frequency 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## Armed.Forces
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 260.7 79.85 155.7 160.3
## .25 .50 .75 .90 .95
## 229.8 271.8 306.1 344.9 355.9
##
## Value 145.6 159.0 161.6 165.0 251.4 255.2 257.2 263.7 279.8 282.7
## Frequency 1 1 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 285.7 304.8 309.9 335.0 354.7 359.4
## Frequency 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## Population
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 117.4 8.229 108.4 109.2
## .25 .50 .75 .90 .95
## 111.8 116.8 122.3 126.6 128.4
##
## Value 107.608 108.632 109.773 110.929 112.075 113.270 115.094 116.219
## Frequency 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 117.388 118.734 120.445 121.950 123.366 125.368 127.852 130.081
## Frequency 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## Year
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 1954 5.667 1948 1948
## .25 .50 .75 .90 .95
## 1951 1954 1958 1960 1961
##
## Value 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956
## Frequency 1 1 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 1957 1958 1959 1960 1961 1962
## Frequency 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
## Employed
## n missing distinct Info Mean Gmd .05 .10
## 16 0 16 1 65.32 4.153 60.28 60.72
## .25 .50 .75 .90 .95
## 62.71 65.50 68.29 69.45 69.81
##
## Value 60.171 60.323 61.122 61.187 63.221 63.639 63.761 64.989 66.019
## Frequency 1 1 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062 0.062
##
## Value 66.513 67.857 68.169 68.655 69.331 69.564 70.551
## Frequency 1 1 1 1 1 1 1
## Proportion 0.062 0.062 0.062 0.062 0.062 0.062 0.062
## ---------------------------------------------------------------------------
# nbr.val, nbr.null, nbr.na, min max, range, sum,
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var
library ( pastecs )
stat.desc ( longley )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 ## GNP.deflator GNP Unemployed Armed.Forces
## nbr.val 16.0000000 16.0000000 16.0000000 16.0000000
## nbr.null 0.0000000 0.0000000 0.0000000 0.0000000
## nbr.na 0.0000000 0.0000000 0.0000000 0.0000000
## min 83.0000000 234.2890000 187.0000000 145.6000000
## max 116.9000000 554.8940000 480.6000000 359.4000000
## range 33.9000000 320.6050000 293.6000000 213.8000000
## sum 1626.9000000 6203.1750000 5109.3000000 4170.7000000
## median 100.6000000 381.4270000 314.3500000 271.7500000
## mean 101.6812500 387.6984375 319.3312500 260.6687500
## SE.mean 2.6978884 24.8487344 23.3616062 17.3979901
## CI.mean.0.95 5.7504129 52.9638237 49.7940849 37.0829381
## var 116.4576250 9879.3536593 8732.2342917 4843.0409583
## std.dev 10.7915534 99.3949378 93.4464247 69.5919604
## coef.var 0.1061312 0.2563718 0.2926316 0.2669747
## Population Year Employed
## nbr.val 1.600000e+01 1.600000e+01 1.600000e+01
## nbr.null 0.000000e+00 0.000000e+00 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00
## min 1.076080e+02 1.947000e+03 6.017100e+01
## max 1.300810e+02 1.962000e+03 7.055100e+01
## range 2.247300e+01 1.500000e+01 1.038000e+01
## sum 1.878784e+03 3.127200e+04 1.045072e+03
## median 1.168035e+02 1.954500e+03 6.550400e+01
## mean 1.174240e+02 1.954500e+03 6.531700e+01
## SE.mean 1.739025e+00 1.190238e+00 8.779921e-01
## CI.mean.0.95 3.706645e+00 2.536932e+00 1.871396e+00
## var 4.838735e+01 2.266667e+01 1.233392e+01
## std.dev 6.956102e+00 4.760952e+00 3.511968e+00
## coef.var 5.923918e-02 2.435893e-03 5.376806e-02
# item name, item number, nvalid, mean, sd,
# median, mad, min, max, skew, kurtosis, se
library ( psych )
describe ( longley )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 ## vars n mean sd median trimmed mad min max
## GNP.deflator 1 16 101.68 10.79 100.60 101.93 15.79 83.00 116.90
## GNP 2 16 387.70 99.39 381.43 386.71 118.57 234.29 554.89
## Unemployed 3 16 319.33 93.45 314.35 317.26 116.75 187.00 480.60
## Armed.Forces 4 16 260.67 69.59 271.75 261.84 52.78 145.60 359.40
## Population 5 16 117.42 6.96 116.80 117.22 8.17 107.61 130.08
## Year 6 16 1954.50 4.76 1954.50 1954.50 5.93 1947.00 1962.00
## Employed 7 16 65.32 3.51 65.50 65.31 4.31 60.17 70.55
## range skew kurtosis se
## GNP.deflator 33.90 -0.13 -1.40 2.70
## GNP 320.61 0.02 -1.35 24.85
## Unemployed 293.60 0.14 -1.30 23.36
## Armed.Forces 213.80 -0.37 -1.20 17.40
## Population 22.47 0.26 -1.27 1.74
## Year 15.00 0.00 -1.43 1.19
## Employed 10.38 -0.09 -1.55 0.88
# with
with ( longley , median ( GNP ))
# vs.
median ( longley $ GNP )
Group data
# dataset
head ( mtcars , 3 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# variable grouped by one factor to show a function
by ( mtcars $ mpg , mtcars $ cyl , FUN = function ( x ) { c ( m = mean ( x ), s = sd ( x )) })
## mtcars$cyl: 4
## m s
## 26 . 663636 4 . 509828
## --------------------------------------------------------
## mtcars$cyl: 6
## m s
## 19 . 742857 1 . 453567
## --------------------------------------------------------
## mtcars$cyl: 8
## m s
## 15 . 100000 2 . 560048
# description statistics by group
library ( psych )
describeBy ( mtcars , group = mtcars $ am )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53 ##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range
## mpg 1 19 17.15 3.83 17.30 17.12 3.11 10.40 24.40 14.00
## cyl 2 19 6.95 1.54 8.00 7.06 0.00 4.00 8.00 4.00
## disp 3 19 290.38 110.17 275.80 289.71 124.83 120.10 472.00 351.90
## hp 4 19 160.26 53.91 175.00 161.06 77.10 62.00 245.00 183.00
## drat 5 19 3.29 0.39 3.15 3.28 0.22 2.76 3.92 1.16
## wt 6 19 3.77 0.78 3.52 3.75 0.45 2.46 5.42 2.96
## qsec 7 19 18.18 1.75 17.82 18.07 1.19 15.41 22.90 7.49
## vs 8 19 0.37 0.50 0.00 0.35 0.00 0.00 1.00 1.00
## am 9 19 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## gear 10 19 3.21 0.42 3.00 3.18 0.00 3.00 4.00 1.00
## carb 11 19 2.74 1.15 3.00 2.76 1.48 1.00 4.00 3.00
## skew kurtosis se
## mpg 0.01 -0.80 0.88
## cyl -0.95 -0.74 0.35
## disp 0.05 -1.26 25.28
## hp -0.01 -1.21 12.37
## drat 0.50 -1.30 0.09
## wt 0.98 0.14 0.18
## qsec 0.85 0.55 0.40
## vs 0.50 -1.84 0.11
## am NaN NaN 0.00
## gear 1.31 -0.29 0.10
## carb -0.14 -1.57 0.26
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew
## mpg 1 13 24.39 6.17 22.80 24.38 6.67 15.00 33.90 18.90 0.05
## cyl 2 13 5.08 1.55 4.00 4.91 0.00 4.00 8.00 4.00 0.87
## disp 3 13 143.53 87.20 120.30 131.25 58.86 71.10 351.00 279.90 1.33
## hp 4 13 126.85 84.06 109.00 114.73 63.75 52.00 335.00 283.00 1.36
## drat 5 13 4.05 0.36 4.08 4.02 0.27 3.54 4.93 1.39 0.79
## wt 6 13 2.41 0.62 2.32 2.39 0.68 1.51 3.57 2.06 0.21
## qsec 7 13 17.36 1.79 17.02 17.39 2.34 14.50 19.90 5.40 -0.23
## vs 8 13 0.54 0.52 1.00 0.55 0.00 0.00 1.00 1.00 -0.14
## am 9 13 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN
## gear 10 13 4.38 0.51 4.00 4.36 0.00 4.00 5.00 1.00 0.42
## carb 11 13 2.92 2.18 2.00 2.64 1.48 1.00 8.00 7.00 0.98
## kurtosis se
## mpg -1.46 1.71
## cyl -0.90 0.43
## disp 0.40 24.19
## hp 0.56 23.31
## drat 0.21 0.10
## wt -1.17 0.17
## qsec -1.42 0.50
## vs -2.13 0.14
## am NaN 0.00
## gear -1.96 0.14
## carb -0.21 0.60
# description statistics by group
library ( doBy )
summaryBy ( mpg + wt ~ cyl + vs , data = mtcars ,
FUN = function ( x ) { c ( m = mean ( x ), s = sd ( x )) } )
## cyl vs mpg.m mpg.s wt.m wt.s
## 1 4 0 26.00000 NA 2.140000 NA
## 2 4 1 26.73000 4.7481107 2.300300 0.5982073
## 3 6 0 20.56667 0.7505553 2.755000 0.1281601
## 4 6 1 19.12500 1.6317169 3.388750 0.1162164
## 5 8 0 15.10000 2.5600481 3.999214 0.7594047
Frequency Tables, CrossTables, and Independence
Create frequency and contingency tables from categorical variables. Perform tests of independence, measures of association, and graphically display results.
2D frequency tables
mytable <- table(A, B)
where A are rows, B are columns.
# dataset
mytable <- matrix ( c ( 1 , 2 , 3 , 4 ), nrow = 2 )
mytable
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
# A frequencies (summed over columns = 1)
margin.table ( mytable , 1 )
# B frequencies (summed over rows = 2)
margin.table ( mytable , 2 )
# A/(A + B)
# cell percentages
prop.table ( mytable )
## [,1] [,2]
## [1,] 0.1 0.3
## [2,] 0.2 0.4
# row percentages
prop.table ( mytable , 1 )
## [,1] [,2]
## [1,] 0.2500000 0.7500000
## [2,] 0.3333333 0.6666667
# column percentages
prop.table ( mytable , 2 )
## [,1] [,2]
## [1,] 0.3333333 0.4285714
## [2,] 0.6666667 0.5714286
3D frequency tables
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
A <- as.numeric ( CO2 [, 'Plant' ])
B <- CO2 [, 'conc' ]
C <- CO2 [, 'uptake' ]
mytable <- table ( A , B , C )
# several arrays (3D)
dim ( mytable ) # the printout is immense
# folded table (2D)
dim ( ftable ( mytable )) # the printout is immense
# 3-Way frequency Table
mytable <- xtabs ( ~ A + B + C , data = mytable , na.action = na.omit )
dim ( ftable ( mytable )) # the printout is immense
CrossTable
A <- as.numeric ( CO2 [ 1 : 8 , 'Plant' ])
A
## [1] 95 175 250 350 500 675 1000 95
# 2-way cross tabulation
library ( gmodels )
CrossTable ( A , B ) # mydata$myrowvar x mydata$mycolvar
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35 ##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 8
##
##
## | B
## A | 95 | 175 | 250 | 350 | 500 | 675 | 1000 | Row Total |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 7 |
## | 0.321 | 0.018 | 0.018 | 0.018 | 0.018 | 0.018 | 0.018 | |
## | 0.143 | 0.143 | 0.143 | 0.143 | 0.143 | 0.143 | 0.143 | 0.875 |
## | 0.500 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | |
## | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
## | 2.250 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | |
## | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.125 |
## | 0.500 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## | 0.125 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
## Column Total | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 8 |
## | 0.250 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | 0.125 | |
## -------------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
##
##
Tests of independence
There are more tests of independence in section 10, Resampling Statistics.
# dataset, a contingency table
colors
## fair.hair red.hair medium.hair dark.hair black.hair
## blue eyes 326 38 241 110 3
## light eyes 688 116 584 188 4
## medium eyes 343 84 909 412 26
## dark eyes 98 48 403 681 85
# chi-square test on 2-way tables
# test independence of the row and column variables, p-value is calculated from the asymptotic chi-squared distribution of the test statistic
chisq.test ( colors )
##
## Pearson's Chi-squared test
##
## data: colors
## X-squared = 1240, df = 12, p-value < 2.2e-16
# dataset, a contingency table in 2x2 matrix form
TeaTasting <-
matrix ( c ( 3 , 1 , 1 , 3 ),
nrow = 2 ,
dimnames = list ( Guess = c ( "Milk" , "Tea" ),
Truth = c ( "Milk" , "Tea" )))
TeaTasting
## Truth
## Guess Milk Tea
## Milk 3 1
## Tea 1 3
# Fisher exact test
fisher.test ( TeaTasting , alternative = "greater" )
##
## Fisher ' s Exact Test for Count Data
##
## data : TeaTasting
## p - value = 0 .2429
## alternative hypothesis : true odds ratio is greater than 1
## 95 percent confidence interval :
## 0 .3135693 Inf
## sample estimates :
## odds ratio
## 6 .408309
# 3D contingency table, where the last dimension refers to the strata
Rabbits <-
array ( c ( 0 , 0 , 6 , 5 ,
3 , 0 , 3 , 6 ,
6 , 2 , 0 , 4 ,
5 , 6 , 1 , 0 ,
2 , 5 , 0 , 0 ),
dim = c ( 2 , 2 , 5 ),
dimnames = list (
Delay = c ( "None" , "1.5h" ),
Response = c ( "Cured" , "Died" ),
Penicillin.Level = c ( "1/8" , "1/4" , "1/2" , "1" , "4" )))
Rabbits
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 ## , , Penicillin.Level = 1/8
##
## Response
## Delay Cured Died
## None 0 6
## 1.5h 0 5
##
## , , Penicillin.Level = 1/4
##
## Response
## Delay Cured Died
## None 3 3
## 1.5h 0 6
##
## , , Penicillin.Level = 1/2
##
## Response
## Delay Cured Died
## None 6 0
## 1.5h 2 4
##
## , , Penicillin.Level = 1
##
## Response
## Delay Cured Died
## None 5 1
## 1.5h 6 0
##
## , , Penicillin.Level = 4
##
## Response
## Delay Cured Died
## None 2 0
## 1.5h 5 0
# Mantel-Haenszel test / Cochran-Mantel-Haenszel chi-squared test, hypothesis that two nominal variables are conditionally independent in each stratum, assuming that there is no three-way interaction.
mantelhaen.test ( Rabbits )
##
## Mantel-Haenszel chi-squared test with continuity correction
##
## data: Rabbits
## Mantel-Haenszel X-squared = 3.9286, df = 1, p-value = 0.04747
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
## 1.026713 47.725133
## sample estimates:
## common odds ratio
## 7
# dataset
# 3-way contingency table based on variables A, B, and C
A <- CO2 [, 'Plant' ]
B <- CO2 [, 'conc' ]
C <- CO2 [, 'uptake' ]
mytable <- xtabs ( ~ A + B + C ) # a 3D array
# loglinear Models
# mutual independence: A, B, and C are pairwise independent
library ( MASS )
loglm ( ~ A + B + C , mytable )
## Call:
## loglm(formula = ~A + B + C, data = mytable)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 720.1035 6291 1.0000000
## Pearson 6300.0000 6291 0.4656775
# conditional independence: A is independent of B, given C
loglm ( ~ A + B + C + A * C + B * C , mytable )
## Call:
## loglm(formula = ~A + B + C + A * C + B * C, data = mytable)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 12.13685 5016 1
## Pearson NaN 5016 NaN
# no three-way interaction
loglm ( ~ A + B + C + A * B + A * C + B * C , mytable )
## Call:
## loglm(formula = ~A + B + C + A * B + A * C + B * C, data = mytable)
##
## Statistics:
## X^2 df P(> X^2)
## Likelihood Ratio 1.038376 4950 1
## Pearson NaN 4950 NaN
Measures of association
Association between two nominal variables, giving a value between 0 and +1 (inclusive). It is based on Pearson’s chi-squared statistic.
## 'data.frame': 84 obs. of 5 variables:
## $ ID : int 57 46 77 17 36 23 75 39 33 55 ...
## $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : int 27 29 30 32 46 58 59 59 63 63 ...
## $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ...
tab <- xtabs ( ~ Improved + Treatment , data = Arthritis )
tab
## Treatment
## Improved Placebo Treated
## None 29 13
## Some 7 7
## Marked 7 21
1
2
3
4
5
6
7
8
9
10
11
12
13 ##
## Call : xtabs ( formula = ~ Improved + Treatment , data = Arthritis )
## Number of cases in table : 84
## Number of factors : 2
## Test for independence of all factors :
## Chisq = 13 .055 , df = 2 , p - value = 0 .001463
## X ^ 2 df P ( > X ^ 2 )
## Likelihood Ratio 13 .530 2 0 .0011536
## Pearson 13 .055 2 0 .0014626
##
## Phi - Coefficient : NA
## Contingency Coeff .: 0 .367
## Cramer ' s V : 0.394
# phi coefficient, contingency coefficient, and Cramér's V for an 2D table
library ( vcd )
assocstats ( tab )
## X^2 df P(> X^2)
## Likelihood Ratio 13.530 2 0.0011536
## Pearson 13.055 2 0.0014626
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.367
## Cramer's V : 0.394
## Truth
## Guess Milk Tea
## Milk 3 1
## Tea 1 3
# Cohen's kappa and weighted kappa for a confusion matrix
library ( vcd )
kappa ( TeaTasting )
Correlations
# dataset
head ( mtcars , 3 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# correlations/covariances among numeric variables in
# a data frame
cor ( mtcars , use = "complete.obs" , method = "kendall" ) # method = "pearson", "spearman" or "kendall"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 ## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.7953134 -0.7681311 -0.7428125 0.46454879 -0.7278321
## cyl -0.7953134 1.0000000 0.8144263 0.7851865 -0.55131785 0.7282611
## disp -0.7681311 0.8144263 1.0000000 0.6659987 -0.49898277 0.7433824
## hp -0.7428125 0.7851865 0.6659987 1.0000000 -0.38262689 0.6113081
## drat 0.4645488 -0.5513178 -0.4989828 -0.3826269 1.00000000 -0.5471495
## wt -0.7278321 0.7282611 0.7433824 0.6113081 -0.54714953 1.0000000
## qsec 0.3153652 -0.4489698 -0.3008155 -0.4729061 0.03272155 -0.1419881
## vs 0.5896790 -0.7710007 -0.6033059 -0.6305926 0.37510111 -0.4884787
## am 0.4690128 -0.4946212 -0.5202739 -0.3039956 0.57554849 -0.6138790
## gear 0.4331509 -0.5125435 -0.4759795 -0.2794458 0.58392476 -0.5435956
## carb -0.5043945 0.4654299 0.4137360 0.5959842 -0.09535193 0.3713741
## qsec vs am gear carb
## mpg 0.31536522 0.5896790 0.46901280 0.43315089 -0.50439455
## cyl -0.44896982 -0.7710007 -0.49462115 -0.51254349 0.46542994
## disp -0.30081549 -0.6033059 -0.52027392 -0.47597955 0.41373600
## hp -0.47290613 -0.6305926 -0.30399557 -0.27944584 0.59598416
## drat 0.03272155 0.3751011 0.57554849 0.58392476 -0.09535193
## wt -0.14198812 -0.4884787 -0.61387896 -0.54359562 0.37137413
## qsec 1.00000000 0.6575431 -0.16890405 -0.09126069 -0.50643945
## vs 0.65754312 1.0000000 0.16834512 0.26974788 -0.57692729
## am -0.16890405 0.1683451 1.00000000 0.77078758 -0.05859929
## gear -0.09126069 0.2697479 0.77078758 1.00000000 0.09801487
## carb -0.50643945 -0.5769273 -0.05859929 0.09801487 1.00000000
cov ( mtcars , use = "complete.obs" )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 ## mpg cyl disp hp drat
## mpg 36.324103 -9.1723790 -633.09721 -320.732056 2.19506351
## cyl -9.172379 3.1895161 199.66028 101.931452 -0.66836694
## disp -633.097208 199.6602823 15360.79983 6721.158669 -47.06401915
## hp -320.732056 101.9314516 6721.15867 4700.866935 -16.45110887
## drat 2.195064 -0.6683669 -47.06402 -16.451109 0.28588135
## wt -5.116685 1.3673710 107.68420 44.192661 -0.37272073
## qsec 4.509149 -1.8868548 -96.05168 -86.770081 0.08714073
## vs 2.017137 -0.7298387 -44.37762 -24.987903 0.11864919
## am 1.803931 -0.4657258 -36.56401 -8.320565 0.19015121
## gear 2.135685 -0.6491935 -50.80262 -6.358871 0.27598790
## carb -5.363105 1.5201613 79.06875 83.036290 -0.07840726
## wt qsec vs am gear
## mpg -5.1166847 4.50914919 2.01713710 1.80393145 2.1356855
## cyl 1.3673710 -1.88685484 -0.72983871 -0.46572581 -0.6491935
## disp 107.6842040 -96.05168145 -44.37762097 -36.56401210 -50.8026210
## hp 44.1926613 -86.77008065 -24.98790323 -8.32056452 -6.3588710
## drat -0.3727207 0.08714073 0.11864919 0.19015121 0.2759879
## wt 0.9573790 -0.30548161 -0.27366129 -0.33810484 -0.4210806
## qsec -0.3054816 3.19316613 0.67056452 -0.20495968 -0.2804032
## vs -0.2736613 0.67056452 0.25403226 0.04233871 0.0766129
## am -0.3381048 -0.20495968 0.04233871 0.24899194 0.2923387
## gear -0.4210806 -0.28040323 0.07661290 0.29233871 0.5443548
## carb 0.6757903 -1.89411290 -0.46370968 0.04637097 0.3266129
## carb
## mpg -5.36310484
## cyl 1.52016129
## disp 79.06875000
## hp 83.03629032
## drat -0.07840726
## wt 0.67579032
## qsec -1.89411290
## vs -0.46370968
## am 0.04637097
## gear 0.32661290
## carb 2.60887097
# correlations with significance levels
library ( Hmisc )
rcorr ( as.matrix ( mtcars ), type = "pearson" ) # type can be pearson or spearman
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41 ## mpg cyl disp hp drat wt qsec vs am gear carb
## mpg 1.00 -0.85 -0.85 -0.78 0.68 -0.87 0.42 0.66 0.60 0.48 -0.55
## cyl -0.85 1.00 0.90 0.83 -0.70 0.78 -0.59 -0.81 -0.52 -0.49 0.53
## disp -0.85 0.90 1.00 0.79 -0.71 0.89 -0.43 -0.71 -0.59 -0.56 0.39
## hp -0.78 0.83 0.79 1.00 -0.45 0.66 -0.71 -0.72 -0.24 -0.13 0.75
## drat 0.68 -0.70 -0.71 -0.45 1.00 -0.71 0.09 0.44 0.71 0.70 -0.09
## wt -0.87 0.78 0.89 0.66 -0.71 1.00 -0.17 -0.55 -0.69 -0.58 0.43
## qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1.00 0.74 -0.23 -0.21 -0.66
## vs 0.66 -0.81 -0.71 -0.72 0.44 -0.55 0.74 1.00 0.17 0.21 -0.57
## am 0.60 -0.52 -0.59 -0.24 0.71 -0.69 -0.23 0.17 1.00 0.79 0.06
## gear 0.48 -0.49 -0.56 -0.13 0.70 -0.58 -0.21 0.21 0.79 1.00 0.27
## carb -0.55 0.53 0.39 0.75 -0.09 0.43 -0.66 -0.57 0.06 0.27 1.00
##
## n= 32
##
##
## P
## mpg cyl disp hp drat wt qsec vs am gear
## mpg 0.0000 0.0000 0.0000 0.0000 0.0000 0.0171 0.0000 0.0003 0.0054
## cyl 0.0000 0.0000 0.0000 0.0000 0.0000 0.0004 0.0000 0.0022 0.0042
## disp 0.0000 0.0000 0.0000 0.0000 0.0000 0.0131 0.0000 0.0004 0.0010
## hp 0.0000 0.0000 0.0000 0.0100 0.0000 0.0000 0.0000 0.1798 0.4930
## drat 0.0000 0.0000 0.0000 0.0100 0.0000 0.6196 0.0117 0.0000 0.0000
## wt 0.0000 0.0000 0.0000 0.0000 0.0000 0.3389 0.0010 0.0000 0.0005
## qsec 0.0171 0.0004 0.0131 0.0000 0.6196 0.3389 0.0000 0.2057 0.2425
## vs 0.0000 0.0000 0.0000 0.0000 0.0117 0.0010 0.0000 0.3570 0.2579
## am 0.0003 0.0022 0.0004 0.1798 0.0000 0.0000 0.2057 0.3570 0.0000
## gear 0.0054 0.0042 0.0010 0.4930 0.0000 0.0005 0.2425 0.2579 0.0000
## carb 0.0011 0.0019 0.0253 0.0000 0.6212 0.0146 0.0000 0.0007 0.7545 0.1290
## carb
## mpg 0.0011
## cyl 0.0019
## disp 0.0253
## hp 0.0000
## drat 0.6212
## wt 0.0146
## qsec 0.0000
## vs 0.0007
## am 0.7545
## gear 0.1290
## carb
# dataset
x <- mtcars [ 1 : 3 ]
y <- mtcars [ 4 : 6 ]
# correlation between two vectors
cor ( x , y )
## hp drat wt
## mpg -0.7761684 0.6811719 -0.8676594
## cyl 0.8324475 -0.6999381 0.7824958
## disp 0.7909486 -0.7102139 0.8879799
Polychoric correlation
The correlation between two theorised normally distributed continuous latent variables, from two observed ordinal variables.
# dataset
# 2-way contingency table of counts
colors
## fair.hair red.hair medium.hair dark.hair black.hair
## blue eyes 326 38 241 110 3
## light eyes 688 116 584 188 4
## medium eyes 343 84 909 412 26
## dark eyes 98 48 403 681 85
# polychoric correlation
library ( polycor )
polychor ( colors )
# heterogeneous correlations in one matrix
# pearson (numeric-numeric),
# polyserial (numeric-ordinal),
# and polychoric (ordinal-ordinal)
# a data frame with ordered factors and numeric variables
hetcor ( colors )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28 ##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## fair.hair red.hair medium.hair dark.hair black.hair
## fair.hair 1 Pearson Pearson Pearson Pearson
## red.hair 0.8333 1 Pearson Pearson Pearson
## medium.hair 0.2597 0.6467 1 Pearson Pearson
## dark.hair -0.7091 -0.2251 0.1911 1 Pearson
## black.hair -0.781 -0.3875 -0.06329 0.9674 1
##
## Standard Errors:
## fair.hair red.hair medium.hair dark.hair
## fair.hair
## red.hair 0.03628
## medium.hair 0.3607 0.1383
## dark.hair 0.09977 0.3733 0.3841
## black.hair 0.06019 0.3004 0.4092 0.001491
##
## n = 4
##
## P-values for Tests of Bivariate Normality:
## fair.hair red.hair medium.hair dark.hair
## fair.hair
## red.hair 0.8306
## medium.hair 0.6939 0.8609
## dark.hair 0.7465 0.6335 0.7161
## black.hair 0.6582 0.5927 0.6711 0.9873
t-tests
# independent 2-group t-test
t.test ( y ~ x ) # where y is numeric and x is a binary factor
# independent 2-group t-test
t.test ( y1 , y2 ) # where y1 and y2 are numeric
# paired t-test
t.test ( y1 , y2 , paired = TRUE ) # where y1 & y2 are numeric
# one sample t-test
t.test ( y , mu = 3 ) # Ho: mu=3
# dataset
# 2-way contingency table
colors
## fair.hair red.hair medium.hair dark.hair black.hair
## blue eyes 326 38 241 110 3
## light eyes 688 116 584 188 4
## medium eyes 343 84 909 412 26
## dark eyes 98 48 403 681 85
mean ( as.numeric ( colors [ 1 ,])) # row 1 average
# independent 2-group t-test
t.test ( as.numeric ( colors [ 1 ,]), as.numeric ( colors [ 2 ,])) # where y1 and y2 are numeric
##
## Welch Two Sample t-test
##
## data: as.numeric(colors[1, ]) and as.numeric(colors[2, ])
## t = -1.164, df = 5.5777, p-value = 0.2918
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -541.5725 196.7725
## sample estimates:
## mean of x mean of y
## 143.6 316.0
# one sample t-test
t.test ( as.numeric ( colors [ 1 ,]), mu = 0 ) # Ho: mu=0
##
## One Sample t-test
##
## data: as.numeric(colors[1, ])
## t = 2.348, df = 4, p-value = 0.07868
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -26.2009 313.4009
## sample estimates:
## mean of x
## 143.6
t.test ( as.numeric ( colors [ 1 ,]), mu = 0 , alternative = "greater" ) # Ho: mu=<0
##
## One Sample t-test
##
## data: as.numeric(colors[1, ])
## t = 2.348, df = 4, p-value = 0.03934
## alternative hypothesis: true mean is greater than 0
## 95 percent confidence interval:
## 13.22123 Inf
## sample estimates:
## mean of x
## 143.6
alternative="less"
or alternative="greater"
option to specify a one tailed test.
var.equal = TRUE
option to specify equal variances and a pooled variance estimate.
For multivariate tests and ANOVA, see sections 8 and 9.
Nonparametric statistics
Nonnormal distributions.
Bivariate tests
# independent 2-group Mann-Whitney U test
wilcox.test ( y ~ A ) # where y is numeric and A is A binary factor
# independent 2-group Mann-Whitney U test
wilcox.test ( y , x ) # where y and x are numeric
# dependent 2-group Wilcoxon signed rank test
wilcox.test ( y1 , y2 , paired = TRUE ) # where y1 and y2 are numeric
# 2-way contingency table
colors
## fair.hair red.hair medium.hair dark.hair black.hair
## blue eyes 326 38 241 110 3
## light eyes 688 116 584 188 4
## medium eyes 343 84 909 412 26
## dark eyes 98 48 403 681 85
# independent 2-group Mann-Whitney U Test
wilcox.test ( as.numeric ( colors [ 1 ,]), as.numeric ( colors [ 2 ,])) # where y and x are numeric
##
## Wilcoxon rank sum test
##
## data: as.numeric(colors[1, ]) and as.numeric(colors[2, ])
## W = 8, p-value = 0.4206
## alternative hypothesis: true location shift is not equal to 0
alternative="less"
or alternative="greater"
option to specify a one
tailed test.
ANOVA
# Kruskal Wallis test one-Way ANOVA by ranks
kruskal.test ( y ~ A ) # where y1 is numeric and A is a factor
# randomized block design - Friedman test
friedman.test ( y ~ A | B ) # where y are the data values, A is a grouping factor and B is a blocking factor
# Kruskal Wallis test one-way ANOVA by ranks
kruskal.test ( y , x ) # where y and x are numeric
# dataset
# 2-way contingency table
colors
## fair.hair red.hair medium.hair dark.hair black.hair
## blue eyes 326 38 241 110 3
## light eyes 688 116 584 188 4
## medium eyes 343 84 909 412 26
## dark eyes 98 48 403 681 85
# Kruskal Wallis test one-way ANOVA by ranks
kruskal.test ( as.numeric ( colors [ 1 ,]), as.numeric ( colors [ 2 ,])) # where y and x are numeric
##
## Kruskal-Wallis rank sum test
##
## data: as.numeric(colors[1, ]) and as.numeric(colors[2, ])
## Kruskal-Wallis chi-squared = 4, df = 4, p-value = 0.406
Multiple Regressions
Fitting the Model
# multiple linear regression
fit <- lm ( y ~ x1 + x2 + x3 , data = mydata )
summary ( fit ) # show results
# useful functions
coefficients ( fit ) # model coefficients
confint ( fit , level = 0.95 ) # CIs for model parameters
fitted ( fit ) # predicted values
residuals ( fit ) # residuals
anova ( fit ) # ANOVA table
vcov ( fit ) # covariance matrix for model parameters
influence ( fit ) # regression diagnostics
## 'data.frame': 16 obs. of 7 variables:
## $ GNP.deflator: num 83 88.5 88.2 89.5 96.2 ...
## $ GNP : num 234 259 258 285 329 ...
## $ Unemployed : num 236 232 368 335 210 ...
## $ Armed.Forces: num 159 146 162 165 310 ...
## $ Population : num 108 109 110 111 112 ...
## $ Year : int 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 ...
## $ Employed : num 60.3 61.1 60.2 61.2 63.2 ...
# multiple linear regression example
fit <- lm ( Armed.Forces ~ GNP + Population , data = longley )
summary ( fit ) # show results
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 ##
## Call:
## lm(formula = Armed.Forces ~ GNP + Population, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.349 -33.569 5.076 16.409 104.037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4123.922 1276.579 3.230 0.00657 **
## GNP 3.365 0.986 3.413 0.00463 **
## Population -44.011 14.089 -3.124 0.00807 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.56 on 13 degrees of freedom
## Multiple R-squared: 0.5426, Adjusted R-squared: 0.4723
## F-statistic: 7.712 on 2 and 13 DF, p-value: 0.006191
# other useful functions
coefficients ( fit ) # model coefficients
## (Intercept) GNP Population
## 4123.922484 3.365215 -44.010955
confint ( fit , level = 0.95 ) # CIs for model parameters
## 2.5 % 97.5 %
## (Intercept) 1366.042123 6881.802846
## GNP 1.235097 5.495333
## Population -74.447969 -13.573942
fitted ( fit ) # predicted values
## 1947 1948 1949 1950 1951 1952 1953 1954
## 176.4245 215.9487 161.1151 199.5681 298.4663 306.5279 288.1248 230.9633
## 1955 1956 1957 1958 1959 1960 1961 1962
## 295.1332 308.9566 313.0359 252.7794 318.8698 297.7176 240.7975 266.2711
residuals ( fit ) # residuals
## 1947 1948 1949 1950 1951 1952
## -17.4245114 -70.3487085 0.4848669 -34.5681071 11.4336564 52.8721086
## 1953 1954 1955 1956 1957 1958
## 66.5752438 104.0367029 9.6668098 -23.2566323 -33.2359499 10.9205505
## 1959 1960 1961 1962
## -63.6698197 -46.3175746 16.4025069 16.4288577
## Analysis of Variance Table
##
## Response: Armed.Forces
## Df Sum Sq Mean Sq F value Pr(>F)
## GNP 1 14479 14478.7 5.6649 0.033301 *
## Population 1 24941 24940.8 9.7583 0.008068 **
## Residuals 13 33226 2555.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov ( fit ) # covariance matrix for model parameters
## (Intercept) GNP Population
## (Intercept) 1629652.882 1239.7478355 -17970.27388
## GNP 1239.748 0.9721911 -13.76775
## Population -17970.274 -13.7677545 198.49444
influence ( fit ) # regression diagnostics
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40 ## $hat
## 1947 1948 1949 1950 1951 1952
## 0.27412252 0.17438942 0.31563199 0.16765687 0.21219668 0.21127212
## 1953 1954 1955 1956 1957 1958
## 0.11339116 0.08602104 0.10270245 0.12845683 0.13251347 0.11070418
## 1959 1960 1961 1962
## 0.15598638 0.15164367 0.32488437 0.33842686
##
## $coefficients
## (Intercept) GNP Population
## 1947 128.042531 0.131479474 -1.53731106
## 1948 29.040635 0.121992502 -0.69544934
## 1949 -6.396740 -0.005738655 0.07379997
## 1950 177.778426 0.175668519 -2.11609661
## 1951 133.333008 0.093997366 -1.43810938
## 1952 638.680267 0.462229807 -6.92955762
## 1953 422.107967 0.305133565 -4.56222462
## 1954 -385.998493 -0.325674565 4.42308458
## 1955 54.458118 0.042127997 -0.59713302
## 1956 -163.371695 -0.131240587 1.81041089
## 1957 -212.039316 -0.179084306 2.37664757
## 1958 -51.395691 -0.033854337 0.55600613
## 1959 -329.488802 -0.311550864 3.79446941
## 1960 3.116881 -0.049904757 0.10916689
## 1961 -242.199361 -0.158976865 2.60043035
## 1962 -194.416431 -0.113799395 2.04462752
##
## $sigma
## 1947 1948 1949 1950 1951 1952 1953 1954
## 52.28757 47.63741 52.61955 51.47046 52.48826 49.73420 48.50003 42.21357
## 1955 1956 1957 1958 1959 1960 1961 1962
## 52.53729 52.12610 51.60167 52.51353 48.66817 50.57779 52.30331 52.29577
##
## $wt.res
## 1947 1948 1949 1950 1951 1952
## -17.4245114 -70.3487085 0.4848669 -34.5681071 11.4336564 52.8721086
## 1953 1954 1955 1956 1957 1958
## 66.5752438 104.0367029 9.6668098 -23.2566323 -33.2359499 10.9205505
## 1959 1960 1961 1962
## -63.6698197 -46.3175746 16.4025069 16.4288577
Diagnostic plots
# diagnostic plots
layout ( matrix ( c ( 1 , 2 , 3 , 4 ), 2 , 2 )) # optional 4 graphs/page
plot ( fit )
layout ( matrix ( c ( 1 , 1 , 1 , 1 ), 2 , 2 ))
Comparing two models with ANOVA
# compare models
fit1 <- lm ( y ~ x1 + x2 + x3 + x4 , data = mydata )
fit2 <- lm ( y ~ x1 + x2 )
anova ( fit1 , fit2 )
# compare models
fit1 <- lm ( Armed.Forces ~ GNP + Population , data = longley )
fit2 <- lm ( Armed.Forces ~ GNP , data = longley )
anova ( fit1 , fit2 )
## Analysis of Variance Table
##
## Model 1: Armed.Forces ~ GNP + Population
## Model 2: Armed.Forces ~ GNP
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 13 33226
## 2 14 58167 -1 -24941 9.7583 0.008068 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Cross validation
# dataset
library ( DAAG )
data ( houseprices )
head ( houseprices , 3 )
## area bedrooms sale.price
## 9 694 4 192
## 10 905 4 215
## 11 802 4 215
# k-fold cross-validation
library ( DAAG )
# case 1, # 3 fold cross-validation
CVlm ( houseprices , form.lm = formula ( sale.price ~ area ), m = 3 , dots = FALSE , seed = 29 , plotit = c ( "Observed" , "Residual" ), main = "Small symbols show cross-validation predicted values" , legend.pos = "topleft" , printit = TRUE )
## Analysis of Variance Table
##
## Response: sale.price
## Df Sum Sq Mean Sq F value Pr(>F)
## area 1 18566 18566 8 0.014 *
## Residuals 13 30179 2321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 ##
## fold 1
## Observations in test set: 5
## 11 20 21 22 23
## area 802 696 771.0 1006.0 1191
## cvpred 204 188 199.3 234.7 262
## sale.price 215 255 260.0 293.0 375
## CV residual 11 67 60.7 58.3 113
##
## Sum of squares = 24351 Mean square = 4870 n = 5
##
## fold 2
## Observations in test set: 5
## 10 13 14 17 18
## area 905 716 963.0 1018.00 887.00
## cvpred 255 224 264.4 273.38 252.06
## sale.price 215 113 185.0 276.00 260.00
## CV residual -40 -112 -79.4 2.62 7.94
##
## Sum of squares = 20416 Mean square = 4083 n = 5
##
## fold 3
## Observations in test set: 5
## 9 12 15 16 19
## area 694.0 1366 821.00 714.0 790.00
## cvpred 183.2 388 221.94 189.3 212.49
## sale.price 192.0 274 212.00 220.0 221.50
## CV residual 8.8 -114 -9.94 30.7 9.01
##
## Sum of squares = 14241 Mean square = 2848 n = 5
##
## Overall (Sum over all 5 folds)
## ms
## 3934
# dataset
head ( longley , 3 )
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234 236 159 108 1947 60.3
## 1948 88.5 259 232 146 109 1948 61.1
## 1949 88.2 258 368 162 110 1949 60.2
# case 2, # 3 fold cross-validation
CVlm ( longley , form.lm = formula ( Armed.Forces ~ GNP + Population ), m = 3 , dots = FALSE , seed = 29 , plotit = c ( "Observed" , "Residual" ), main = "Small symbols show cross-validation predicted values" , legend.pos = "topleft" , printit = TRUE )
## Analysis of Variance Table
##
## Response: Armed.Forces
## Df Sum Sq Mean Sq F value Pr(>F)
## GNP 1 14479 14479 5.66 0.0333 *
## Population 1 24941 24941 9.76 0.0081 **
## Residuals 13 33226 2556
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 ##
## fold 1
## Observations in test set: 5
## 1953 1954 1957 1960 1962
## Predicted 288.1 231 313.0 297.7 266.3
## cvpred 281.4 219 310.6 295.7 263.1
## Armed.Forces 354.7 335 279.8 251.4 282.7
## CV residual 73.3 116 -30.8 -44.3 19.6
##
## Sum of squares = 22004 Mean square = 4401 n = 5
##
## fold 2
## Observations in test set: 6
## 1948 1949 1955 1958 1959 1961
## Predicted 215.9 161.12 295.13 252.78 318.9 240.8
## cvpred 231.9 171.41 306.33 255.07 324.5 234.9
## Armed.Forces 145.6 161.60 304.80 263.70 255.2 257.2
## CV residual -86.3 -9.81 -1.53 8.63 -69.3 22.3
##
## Sum of squares = 12917 Mean square = 2153 n = 6
##
## fold 3
## Observations in test set: 5
## 1947 1950 1951 1952 1956
## Predicted 176.4 199.6 298.5 306.5 308.96
## cvpred 192.8 211.9 282.3 288.9 295.17
## Armed.Forces 159.0 165.0 309.9 359.4 285.70
## CV residual -33.8 -46.9 27.6 70.5 -9.47
##
## Sum of squares = 9161 Mean square = 1832 n = 5
##
## Overall (Sum over all 5 folds)
## ms
## 2755
Variable selection – Heuristic methods
# dataset
head ( longley , 3 )
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234 236 159 108 1947 60.3
## 1948 88.5 259 232 146 109 1948 61.1
## 1949 88.2 258 368 162 110 1949 60.2
# Stepwise Regression
library ( MASS )
fit <- lm ( Armed.Forces ~ GNP + Population + Employed + Unemployed , data = longley )
step <- stepAIC ( fit , direction = "both" )
## Start: AIC=125
## Armed.Forces ~ GNP + Population + Employed + Unemployed
##
## Df Sum of Sq RSS AIC
## <none> 21655 125
## - Unemployed 1 4508 26163 126
## - Population 1 5522 27177 127
## - Employed 1 10208 31863 130
## - GNP 1 15323 36978 132
step $ anova # display results
1
2
3
4
5
6
7
8
9
10
11
12 ## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Armed.Forces ~ GNP + Population + Employed + Unemployed
##
## Final Model:
## Armed.Forces ~ GNP + Population + Employed + Unemployed
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 11 21655 125
The goal is to reduce the AIC. Adding variable does not improve the model. Let’s opt for the most valuable variable.
fit <- lm ( Armed.Forces ~ Unemployed , data = longley )
step <- stepAIC ( fit , direction = "both" )
1
2
3
4
5
6
7
8
9
10
11
12
13 ## Start: AIC=138
## Armed.Forces ~ Unemployed
##
## Df Sum of Sq RSS AIC
## - Unemployed 1 2287 72646 137
## <none> 70359 138
##
## Step: AIC=137
## Armed.Forces ~ 1
##
## Df Sum of Sq RSS AIC
## <none> 72646 137
## + Unemployed 1 2287 70359 138
step $ anova # display results
1
2
3
4
5
6
7
8
9
10
11
12
13 ## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Armed.Forces ~ Unemployed
##
## Final Model:
## Armed.Forces ~ 1
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 14 70359 138
## 2 - Unemployed 1 2287 15 72646 137
Variable selection – Graphical methods
# model
fit <- lm ( Armed.Forces ~ GNP + Population + Employed + Unemployed , data = longley )
# all Subsets Regression
library ( leaps )
leaps <- regsubsets ( Armed.Forces ~ GNP + Population + Employed + Unemployed , data = longley , nbest = 10 )
# view results
summary ( leaps )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 ## Subset selection object
## Call: regsubsets . formula ( Armed . Forces ~ GNP + Population + Employed +
## Unemployed , data = longley , nbest = 10 )
## 4 Variables ( and intercept )
## Forced in Forced out
## GNP FALSE FALSE
## Population FALSE FALSE
## Employed FALSE FALSE
## Unemployed FALSE FALSE
## 10 subsets of each size up to 4
## Selection Algorithm: exhaustive
## GNP Population Employed Unemployed
## 1 ( 1 ) " " " " "*" " "
## 1 ( 2 ) "*" " " " " " "
## 1 ( 3 ) " " "*" " " " "
## 1 ( 4 ) " " " " " " "*"
## 2 ( 1 ) "*" "*" " " " "
## 2 ( 2 ) "*" " " " " "*"
## 2 ( 3 ) " " "*" " " "*"
## 2 ( 4 ) " " " " "*" "*"
## 2 ( 5 ) " " "*" "*" " "
## 2 ( 6 ) "*" " " "*" " "
## 3 ( 1 ) "*" "*" "*" " "
## 3 ( 2 ) "*" " " "*" "*"
## 3 ( 3 ) "*" "*" " " "*"
## 3 ( 4 ) " " "*" "*" "*"
## 4 ( 1 ) "*" "*" "*" "*"
# plot a table of models showing variables in each model
# models are ordered by the selection statistic
plot ( leaps , scale = "r2" )
# plot statistic by subset size
library ( car )
subsets ( leaps , statistic = "rsq" , legend = FALSE ) # available criteria are rsq, rss, adjr2, cp, bic
## Abbreviation
## GNP G
## Population P
## Employed E
## Unemployed U
subsets ( leaps , statistic = "bic" , legend = FALSE ) # available criteria are rsq, rss, adjr2, cp, bic
## Abbreviation
## GNP G
## Population P
## Employed E
## Unemployed U
Variable selection – Relative importance
Model: fit <- lm(Armed.Forces ~ GNP + Population + Employed + Unemployed, data = longley)
.
Warning: eval=FALSE
.
# calculate the relative importance of each predictor
library ( relaimpo )
calc.relimp ( fit , type = c ( "lmg" , "last" , "first" , "pratt" ), rela = TRUE )
Results.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 Response variable : Armed . Forces
Total response variance : 4843
Analysis based on 16 observations
4 Regressors :
GNP Population Employed Unemployed
Proportion of variance explained by model : 70.2 %
Metrics are normalized to sum to 100 % ( rela = TRUE ) .
Relative importance metrics :
lmg last first pratt
GNP 0.328 0.431 0.348 4.566
Population 0.241 0.155 0.232 - 1.934
Employed 0.217 0.287 0.365 - 1.780
Unemployed 0.215 0.127 0.055 0.148
Average coefficients for different model sizes :
1 X 2 Xs 3 Xs 4 Xs
GNP 0.313 1.301 3.641 5.026
Population 3.646 - 14.815 - 24.637 - 37.260
Employed 9.062 17.646 - 34.249 - 54.144
Unemployed - 0.132 - 0.511 - 0.584 - 0.436
Bootstrapping
Model: fit <- lm(Armed.Forces ~ GNP + Population + Employed + Unemployed, data = longley)
.
Warning: eval=FALSE
.
# bootstrap measures of relative importance (1000 samples)
boot <- boot.relimp ( fit , b = 1000 , type = c ( "lmg" , "last" , "first" , "pratt" ), rank = TRUE , diff = TRUE , rela = TRUE )
booteval.relimp ( boot ) # print result
Results.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90 Response variable : Armed . Forces
Total response variance : 4843
Analysis based on 16 observations
4 Regressors :
GNP Population Employed Unemployed
Proportion of variance explained by model : 70.2 %
Metrics are normalized to sum to 100 % ( rela = TRUE ) .
Relative importance metrics :
lmg last first pratt
GNP 0.328 0.431 0.348 4.566
Population 0.241 0.155 0.232 - 1.934
Employed 0.217 0.287 0.365 - 1.780
Unemployed 0.215 0.127 0.055 0.148
Average coefficients for different model sizes :
1 X 2 Xs 3 Xs 4 Xs
GNP 0.313 1.301 3.641 5.026
Population 3.646 - 14.815 - 24.637 - 37.260
Employed 9.062 17.646 - 34.249 - 54.144
Unemployed - 0.132 - 0.511 - 0.584 - 0.436
Confidence interval information ( 1000 bootstrap replicates , bty = perc ):
Relative Contributions with confidence intervals :
Lower Upper
percentage 0.95 0.95 0.95
GNP . lmg 0.3280 ABC_ 0.1488 0.4030
Population . lmg 0.2410 _BCD 0.1572 0.3060
Employed . lmg 0.2170 ABCD 0.1107 0.3240
Unemployed . lmg 0.2150 ABCD 0.0586 0.5550
GNP . last 0.4310 ABCD 0.0101 0.5660
Population . last 0.1550 _BCD 0.0003 0.3850
Employed . last 0.2870 ABCD 0.0476 0.6280
Unemployed . last 0.1270 ABCD 0.0009 0.7810
GNP . first 0.3480 ABCD 0.0151 0.3960
Population . first 0.2320 _BCD 0.0070 0.3060
Employed . first 0.3650 ABCD 0.0159 0.4020
Unemployed . first 0.0550 ABCD 0.0004 0.9280
GNP . pratt 4.5660 ABCD - 1.0160 10.5380
Population . pratt - 1.9340 ABCD - 6.0800 2.2050
Employed . pratt - 1.7800 _BCD - 5.0290 0.8510
Unemployed . pratt 0.1480 ABC_ - 0.3250 1.0910
Letters indicate the ranks covered by bootstrap CIs .
( Rank bootstrap confidence intervals always obtained by percentile method )
CAUTION : Bootstrap confidence intervals can be somewhat liberal .
Differences between Relative Contributions :
Lower Upper
difference 0.95 0.95 0.95
GNP - Population . lmg 0.0871 - 0.0188 0.1521
GNP - Employed . lmg 0.1106 - 0.0479 0.1949
GNP - Unemployed . lmg 0.1130 - 0.4021 0.3281
Population - Employed . lmg 0.0236 - 0.1091 0.1124
Population - Unemployed . lmg 0.0259 - 0.3904 0.2417
Employed - Unemployed . lmg 0.0023 - 0.4315 0.2295
GNP - Population . last 0.2756 - 0.1062 0.4502
GNP - Employed . last 0.1438 - 0.5564 0.4011
GNP - Unemployed . last 0.3041 - 0.7408 0.5502
Population - Employed . last - 0.1318 - 0.6191 0.2653
Population - Unemployed . last 0.0285 - 0.7382 0.3622
Employed - Unemployed . last 0.1603 - 0.6784 0.4852
GNP - Population . first 0.1161 - 0.0590 0.1736
GNP - Employed . first - 0.0172 - 0.0893 0.0934
GNP - Unemployed . first 0.2930 - 0.9047 0.3766
Population - Employed . first - 0.1333 - 0.2134 0.0688
Population - Unemployed . first 0.1769 - 0.8967 0.2969
Employed - Unemployed . first 0.3102 - 0.8937 0.3823
GNP - Population . pratt 6.4995 - 2.2069 15.7978
GNP - Employed . pratt 6.3461 - 1.3879 15.0093
GNP - Unemployed . pratt 4.4180 - 1.9134 10.6456
Population - Employed . pratt - 0.1534 - 4.1860 5.9691
Population - Unemployed . pratt - 2.0815 - 6.1439 2.2348
Employed - Unemployed . pratt - 1.9281 - 5.2057 0.1725
* indicates that CI for difference does not include 0.
CAUTION : Bootstrap confidence intervals can be somewhat liberal .
Warning: eval=FALSE
.
plot ( booteval.relimp ( boot , sort = TRUE )) # to plot the results
The .png file.
Going further
The nls
package provides functions for nonlinear regression.
Perform robust regression with the rlm
function in the MASS
package.
The robust
package provides a comprehensive library of robust methods, including regression.
The robustbase
package also provides basic robust statistics including model selection methods.
Regression diagnostics
# assume that we are fitting a multiple linear regression
fit <- lm ( mpg ~ disp + hp + wt + drat , data = mtcars )
fit
##
## Call:
## lm(formula = mpg ~ disp + hp + wt + drat, data = mtcars)
##
## Coefficients:
## (Intercept) disp hp wt drat
## 29.14874 0.00382 -0.03478 -3.47967 1.76805
Outliers
library ( car )
# assessing outliers
outlierTest ( fit ) # Bonferonni p-value for most extreme obs
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## Toyota Corolla 2.52 0.0184 0.588
qqPlot ( fit , main = "QQ Plot" ) #qq plot for studentized resid
leveragePlots ( fit ) # leverage plots
Influential observations
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# influential observations
# added variable plots
library ( car )
avPlots ( fit )
par ( mfrow = c ( 1 , 1 ))
# Cook's D plot
# identify D values > 4/(n-k-1)
cutoff <- 4 / (( nrow ( mtcars ) - length ( fit $ coefficients ) - 2 ))
plot ( fit , which = 4 , cook.levels = cutoff )
Warning: eval=FALSE
; interactive function.
# influence plot
influencePlot ( fit , id.method = "identify" , main = "Influence Plot" , sub = "Circle size is proportial to Cook's Distance" )
Nonnormality
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# normality of residuals
# qq plot for studentized resid
library ( car )
qqPlot ( fit , main = "QQ Plot" )
# distribution of studentized residuals
library ( MASS )
sresid <- studres ( fit )
hist ( sresid , freq = FALSE , main = "Distribution of Studentized Residuals" )
xfit <- seq ( min ( sresid ), max ( sresid ), length = 40 )
yfit <- dnorm ( xfit )
lines ( xfit , yfit )
Heteroscedasticity
Nonconstant error variance.
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# evaluate homoscedasticity
# non-constant error variance test
ncvTest ( fit )
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.43 Df = 1 p = 0.232
# plot studentized residuals vs. fitted values
spreadLevelPlot ( fit )
##
## Suggested power transformation: 0.662
Multicollinearity
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# evaluatecCollinearity
vif ( fit )
## disp hp wt drat
## 8.21 2.89 5.10 2.28
sqrt ( vif ( fit )) > 2 # benchmark = 1.96, rounded to 2
## disp hp wt drat
## TRUE FALSE TRUE FALSE
Nonlinearity
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# evaluate nonlinearity
# component + residual plot
crPlots ( fit )
# Ceres plots
ceresPlots ( fit )
Autocorrelation
Serial correlation or non-independence of errors.
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# test for autocorrelated errors
durbinWatsonTest ( fit )
## lag Autocorrelation D-W Statistic p-value
## 1 0.101 1.74 0.29
## Alternative hypothesis: rho != 0
Global diagnostic
Model: fit <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)
.
# global test of model assumptions
library ( gvlma )
gvmodel <- gvlma ( fit )
summary ( gvmodel )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36 ##
## Call:
## lm(formula = mpg ~ disp + hp + wt + drat, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.508 -1.905 -0.506 0.982 5.688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.14874 6.29359 4.63 8.2e-05 ***
## disp 0.00382 0.01080 0.35 0.7268
## hp -0.03478 0.01160 -3.00 0.0058 **
## wt -3.47967 1.07837 -3.23 0.0033 **
## drat 1.76805 1.31978 1.34 0.1915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.6 on 27 degrees of freedom
## Multiple R-squared: 0.838, Adjusted R-squared: 0.814
## F-statistic: 34.8 on 4 and 27 DF, p-value: 2.7e-10
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = fit)
##
## Value p-value Decision
## Global Stat 13.9382 0.00750 Assumptions NOT satisfied!
## Skewness 4.3131 0.03782 Assumptions NOT satisfied!
## Kurtosis 0.0138 0.90654 Assumptions acceptable.
## Link Function 8.7166 0.00315 Assumptions NOT satisfied!
## Heteroscedasticity 0.8947 0.34421 Assumptions acceptable.
ANOVA
Analysis of variance (ANOVA) is an alternative to regressions among other applications.
# lower case letters are numeric variables
# upper case letters are factors
# one-way ANOVA (completely randomized design)
fit <- aov ( y ~ A , data = mydataframe )
# randomized block design (B is the blocking factor)
fit <- aov ( y ~ A + B , data = mydataframe )
# two-way factorial design
fit <- aov ( y ~ A + B + A : B , data = mydataframe )
fit <- aov ( y ~ A * B , data = mydataframe ) # same thing
# analysis of covariance
fit <- aov ( y ~ A + x , data = mydataframe )
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
# one-way ANOVA (completely randomized design)
fit <- aov ( uptake ~ Plant , data = CO2 )
fit
## Call:
## aov(formula = uptake ~ Plant, data = CO2)
##
## Terms:
## Plant Residuals
## Sum of Squares 4862 4845
## Deg. of Freedom 11 72
##
## Residual standard error: 8.2
## Estimated effects are balanced
## Df Sum Sq Mean Sq F value Pr(>F)
## Plant 11 4862 442 6.57 1.8e-07 ***
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# randomized block design (B is the blocking factor)
fit <- aov ( uptake ~ Plant + Type , data = CO2 )
fit
## Call:
## aov(formula = uptake ~ Plant + Type, data = CO2)
##
## Terms:
## Plant Residuals
## Sum of Squares 4862 4845
## Deg. of Freedom 11 72
##
## Residual standard error: 8.2
## 1 out of 13 effects not estimable
## Estimated effects are balanced
## Df Sum Sq Mean Sq F value Pr(>F)
## Plant 11 4862 442 6.57 1.8e-07 ***
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# two-way factorial design
fit <- aov ( uptake ~ Plant + Type + Plant : Type , data = CO2 )
fit
## Call:
## aov(formula = uptake ~ Plant + Type + Plant:Type, data = CO2)
##
## Terms:
## Plant Residuals
## Sum of Squares 4862 4845
## Deg. of Freedom 11 72
##
## Residual standard error: 8.2
## 12 out of 24 effects not estimable
## Estimated effects are balanced
## Df Sum Sq Mean Sq F value Pr(>F)
## Plant 11 4862 442 6.57 1.8e-07 ***
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit <- aov ( uptake ~ Plant * Type , data = CO2 ) # same thing
fit
## Call:
## aov(formula = uptake ~ Plant * Type, data = CO2)
##
## Terms:
## Plant Residuals
## Sum of Squares 4862 4845
## Deg. of Freedom 11 72
##
## Residual standard error: 8.2
## 12 out of 24 effects not estimable
## Estimated effects are balanced
## Df Sum Sq Mean Sq F value Pr(>F)
## Plant 11 4862 442 6.57 1.8e-07 ***
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Analysis of Covariance
fit <- aov ( uptake ~ uptake + conc , data = CO2 )
fit
## Call:
## aov(formula = uptake ~ uptake + conc, data = CO2)
##
## Terms:
## conc Residuals
## Sum of Squares 2285 7422
## Deg. of Freedom 1 82
##
## Residual standard error: 9.51
## Estimated effects may be unbalanced
## Df Sum Sq Mean Sq F value Pr(>F)
## conc 1 2285 2285 25.2 2.9e-06 ***
## Residuals 82 7422 91
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Evaluate model effects
Type I sequential SS: A+B and B+A will produce different results!
Use the drop1
to produce the familiar Type III results; compare each term with the full model.
# display Type I ANOVA table
fit1 <- aov ( uptake ~ Plant + Type , data = CO2 )
summary ( fit1 )
## Df Sum Sq Mean Sq F value Pr(>F)
## Plant 11 4862 442 6.57 1.8e-07 ***
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# display Type I ANOVA table
fit2 <- aov ( uptake ~ Type + Plant , data = CO2 )
summary ( fit2 )
## Df Sum Sq Mean Sq F value Pr(>F)
## Type 1 3366 3366 50.02 8.1e-10 ***
## Plant 10 1497 150 2.22 0.026 *
## Residuals 72 4845 67
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# display type III SS and F tests
drop1 ( fit1 , ~ . , test = "F" )
## Single term deletions
##
## Model:
## uptake ~ Plant + Type
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 4845 365
## Plant 10 1497 6341 367 2.22 0.026 *
## Type 0 0 4845 365
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1 ( fit2 , ~ . , test = "F" )
## Single term deletions
##
## Model:
## uptake ~ Type + Plant
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 4845 365
## Type 0 0 4845 365
## Plant 10 1497 6341 367 2.22 0.026 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compare nested models directly
fit1 <- aov ( uptake ~ Plant + Type , data = CO2 )
fit2 <- aov ( uptake ~ Plant , data = CO2 )
anova ( fit1 , fit2 )
## Analysis of Variance Table
##
## Model 1: uptake ~ Plant + Type
## Model 2: uptake ~ Plant
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 72 4845
## 2 72 4845 0 0
fit2 <- aov ( uptake ~ Plant + Type , data = CO2 )
fit1 <- aov ( uptake ~ Plant , data = CO2 )
anova ( fit1 , fit2 )
## Analysis of Variance Table
##
## Model 1: uptake ~ Plant
## Model 2: uptake ~ Plant + Type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 72 4845
## 2 72 4845 0 0
Multiple comparisons
Tukey HSD tests for post hoc comparisons on each factor in the model.
Factors as an option.
Type I SS.
# model
fit <- aov ( uptake ~ Plant + Type , data = CO2 )
# Tukey honestly significant differences
TukeyHSD ( fit )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73 ## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = uptake ~ Plant + Type, data = CO2)
##
## $Plant
## diff lwr upr p adj
## Qn2-Qn1 1.929 -12.88 16.739 1.000
## Qn3-Qn1 4.386 -10.42 19.196 0.997
## Qc1-Qn1 -3.257 -18.07 11.553 1.000
## Qc3-Qn1 -0.643 -15.45 14.168 1.000
## Qc2-Qn1 -0.529 -15.34 14.282 1.000
## Mn3-Qn1 -9.114 -23.92 5.696 0.639
## Mn2-Qn1 -5.886 -20.70 8.925 0.970
## Mn1-Qn1 -6.829 -21.64 7.982 0.918
## Mc2-Qn1 -21.086 -35.90 -6.275 0.000
## Mc3-Qn1 -15.929 -30.74 -1.118 0.024
## Mc1-Qn1 -15.229 -30.04 -0.418 0.038
## Qn3-Qn2 2.457 -12.35 17.268 1.000
## Qc1-Qn2 -5.186 -20.00 9.625 0.989
## Qc3-Qn2 -2.571 -17.38 12.239 1.000
## Qc2-Qn2 -2.457 -17.27 12.353 1.000
## Mn3-Qn2 -11.043 -25.85 3.768 0.346
## Mn2-Qn2 -7.814 -22.62 6.996 0.822
## Mn1-Qn2 -8.757 -23.57 6.053 0.694
## Mc2-Qn2 -23.014 -37.82 -8.204 0.000
## Mc3-Qn2 -17.857 -32.67 -3.047 0.006
## Mc1-Qn2 -17.157 -31.97 -2.347 0.010
## Qc1-Qn3 -7.643 -22.45 7.168 0.842
## Qc3-Qn3 -5.029 -19.84 9.782 0.991
## Qc2-Qn3 -4.914 -19.72 9.896 0.993
## Mn3-Qn3 -13.500 -28.31 1.311 0.108
## Mn2-Qn3 -10.271 -25.08 4.539 0.457
## Mn1-Qn3 -11.214 -26.02 3.596 0.323
## Mc2-Qn3 -25.471 -40.28 -10.661 0.000
## Mc3-Qn3 -20.314 -35.12 -5.504 0.001
## Mc1-Qn3 -19.614 -34.42 -4.804 0.002
## Qc3-Qc1 2.614 -12.20 17.425 1.000
## Qc2-Qc1 2.729 -12.08 17.539 1.000
## Mn3-Qc1 -5.857 -20.67 8.953 0.971
## Mn2-Qc1 -2.629 -17.44 12.182 1.000
## Mn1-Qc1 -3.571 -18.38 11.239 1.000
## Mc2-Qc1 -17.829 -32.64 -3.018 0.006
## Mc3-Qc1 -12.671 -27.48 2.139 0.167
## Mc1-Qc1 -11.971 -26.78 2.839 0.233
## Qc2-Qc3 0.114 -14.70 14.925 1.000
## Mn3-Qc3 -8.471 -23.28 6.339 0.735
## Mn2-Qc3 -5.243 -20.05 9.568 0.988
## Mn1-Qc3 -6.186 -21.00 8.625 0.958
## Mc2-Qc3 -20.443 -35.25 -5.632 0.001
## Mc3-Qc3 -15.286 -30.10 -0.475 0.037
## Mc1-Qc3 -14.586 -29.40 0.225 0.057
## Mn3-Qc2 -8.586 -23.40 6.225 0.719
## Mn2-Qc2 -5.357 -20.17 9.453 0.985
## Mn1-Qc2 -6.300 -21.11 8.511 0.952
## Mc2-Qc2 -20.557 -35.37 -5.747 0.001
## Mc3-Qc2 -15.400 -30.21 -0.589 0.034
## Mc1-Qc2 -14.700 -29.51 0.111 0.054
## Mn2-Mn3 3.229 -11.58 18.039 1.000
## Mn1-Mn3 2.286 -12.52 17.096 1.000
## Mc2-Mn3 -11.971 -26.78 2.839 0.233
## Mc3-Mn3 -6.814 -21.62 7.996 0.919
## Mc1-Mn3 -6.114 -20.92 8.696 0.961
## Mn1-Mn2 -0.943 -15.75 13.868 1.000
## Mc2-Mn2 -15.200 -30.01 -0.389 0.039
## Mc3-Mn2 -10.043 -24.85 4.768 0.493
## Mc1-Mn2 -9.343 -24.15 5.468 0.603
## Mc2-Mn1 -14.257 -29.07 0.553 0.070
## Mc3-Mn1 -9.100 -23.91 5.711 0.641
## Mc1-Mn1 -8.400 -23.21 6.411 0.746
## Mc3-Mc2 5.157 -9.65 19.968 0.989
## Mc1-Mc2 5.857 -8.95 20.668 0.971
## Mc1-Mc3 0.700 -14.11 15.511 1.000
Visualizing results
# two-way interaction plot
attach ( mtcars )
gear <- factor ( gear )
cyl <- factor ( cyl )
# two-way interactions
library ( car )
interaction.plot ( cyl , gear , mpg , type = "b" , col = c ( 1 : 3 ), leg.bty = "o" , leg.bg = "beige" , lwd = 2 , pch = c ( 18 , 24 , 22 ), xlab = "Number of Cylinders" , ylab = "Mean Miles Per Gallon" , main = "Interaction Plot" )
# mean plots for single factors, and includes confidence intervals
library ( gplots )
plotmeans ( mpg ~ cyl , xlab = "Number of Cylinders" , ylab = "Miles Per Gallon" , main = "Mean Plot\nwith 95% CI" )
MANOVA
Multivariate analysis of variance (MANOVA) With more than one dependent variable Y. We can run several ANOVA over different Y, or one MANOVA with one Y built with several variables.
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234 236 159 108 1947 60.3
## 1948 88.5 259 232 146 109 1948 61.1
## 1949 88.2 258 368 162 110 1949 60.2
attach ( longley )
# 2x2 factorial MANOVA with 3 dependent variables, Y
Y <- cbind ( Unemployed , Armed.Forces , Employed ) # Y
fit <- manova ( Y ~ Population * GNP ) # Y ~ X
# display type I SS
summary ( fit , test = "Pillai" )
## Df Pillai approx F num Df den Df Pr(>F)
## Population 1 0.997 1074 3 10 7.7e-13 ***
## GNP 1 0.888 26 3 10 4.6e-05 ***
## Population:GNP 1 0.870 22 3 10 9.4e-05 ***
## Residuals 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# test = "Wilks", "Hotelling-Lawley", and "Roy"
# display univariate statistics
summary.aov ( fit )
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 ## Response Unemployed :
## Df Sum Sq Mean Sq F value Pr(>F)
## Population 1 61739 61739 45.92 2e-05 ***
## GNP 1 42841 42841 31.87 0.00011 ***
## Population:GNP 1 10270 10270 7.64 0.01715 *
## Residuals 12 16133 1344
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Armed.Forces :
## Df Sum Sq Mean Sq F value Pr(>F)
## Population 1 9647 9647 4.25 0.0617 .
## GNP 1 29772 29772 13.10 0.0035 **
## Population:GNP 1 5958 5958 2.62 0.1314
## Residuals 12 27268 2272
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Employed :
## Df Sum Sq Mean Sq F value Pr(>F)
## Population 1 170.6 170.6 546.45 2.2e-11 ***
## GNP 1 10.5 10.5 33.60 8.5e-05 ***
## Population:GNP 1 0.1 0.1 0.41 0.54
## Residuals 12 3.7 0.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# display type III SS
fit1 <- manova ( Y ~ Population * GNP )
fit2 <- manova ( Y ~ GNP * Population )
# type III GNP effect
summary ( fit1 )
## Df Pillai approx F num Df den Df Pr(>F)
## Population 1 0.997 1074 3 10 7.7e-13 ***
## GNP 1 0.888 26 3 10 4.6e-05 ***
## Population:GNP 1 0.870 22 3 10 9.4e-05 ***
## Residuals 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# type III Population effect
summary ( fit2 )
## Df Pillai approx F num Df den Df Pr(>F)
## GNP 1 0.997 1088 3 10 7.2e-13 ***
## Population 1 0.786 12 3 10 0.0011 **
## GNP:Population 1 0.870 22 3 10 9.4e-05 ***
## Residuals 12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Multiple comparisons
TukeyHSD
and plot
do work with a MANOVA fit. Run each dependent variable separately to obtain them… or proceed with ANOVA on each dependent variable.
Going further
Package lme4
has excellent facilities for fitting linear and generalized linear mixed-effects models.
(M)ANOVA Assumptions
Outliers
Outliers can severely affect normality and homogeneity of variance.
# dataset
head ( mtcars , 3 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.62 16.5 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.88 17.0 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# detect outliers
# ordered squared robust Mahalanobis distances of the observations against the empirical distribution function of the MD2i
library ( mvoutlier )
outliers <-
aq.plot ( mtcars [ c ( "mpg" , "disp" , "hp" , "drat" , "wt" , "qsec" )])
## Projection to the first and second robust principal components.
## Proportion of total variation (explained variance): 0.974
outliers # show list of outliers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 ## $outliers
## Mazda RX4 Mazda RX4 Wag Datsun 710
## FALSE FALSE FALSE
## Hornet 4 Drive Hornet Sportabout Valiant
## FALSE FALSE FALSE
## Duster 360 Merc 240D Merc 230
## TRUE TRUE TRUE
## Merc 280 Merc 280C Merc 450SE
## FALSE FALSE FALSE
## Merc 450SL Merc 450SLC Cadillac Fleetwood
## FALSE FALSE FALSE
## Lincoln Continental Chrysler Imperial Fiat 128
## FALSE FALSE TRUE
## Honda Civic Toyota Corolla Toyota Corona
## FALSE FALSE FALSE
## Dodge Challenger AMC Javelin Camaro Z28
## FALSE FALSE TRUE
## Pontiac Firebird Fiat X1-9 Porsche 914-2
## FALSE FALSE FALSE
## Lotus Europa Ford Pantera L Ferrari Dino
## FALSE TRUE FALSE
## Maserati Bora Volvo 142E
## TRUE FALSE
Univariate normality
# Q-Q plot
qqnorm ( mtcars $ mpg )
qqline ( mtcars $ mpg )
# Shapiro-Wilk test of normality
shapiro.test ( mtcars $ mpg )
##
## Shapiro-Wilk normality test
##
## data: mtcars$mpg
## W = 0.9, p-value = 0.1
library ( nortest )
# Anderson-Darling test for normality
ad.test ( mtcars $ mpg )
##
## Anderson-Darling normality test
##
## data: mtcars$mpg
## A = 0.6, p-value = 0.1
# Cramer-von Mises test for normality
cvm.test ( mtcars $ mpg )
##
## Cramer-von Mises normality test
##
## data: mtcars$mpg
## W = 0.09, p-value = 0.2
# Lilliefors (Kolmogorov-Smirnov) test for normality
lillie.test ( mtcars $ mpg )
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: mtcars$mpg
## D = 0.1, p-value = 0.2
# Pearson chi-square test for normality
pearson.test ( mtcars $ mpg )
##
## Pearson chi-square normality test
##
## data: mtcars$mpg
## P = 8, p-value = 0.2
# Shapiro-Francia test for normality
sf.test ( mtcars $ mpg )
##
## Shapiro-Francia normality test
##
## data: mtcars$mpg
## W = 1, p-value = 0.1
Multivariate normality
## DAX SMI CAC FTSE
## [1,] 1629 1678 1773 2444
## [2,] 1614 1688 1750 2460
## [3,] 1607 1679 1718 2448
library ( mvnormtest )
# Shapiro-Wilk test for multivariate normality of numeric matrix
mshapiro.test ( t ( EuStockMarkets ))
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.9, p-value <2e-16
With a p x 1 multivariate normal random vector x vector, the squared Mahalanobis distance between x and ?? is going to be chi-square distributed with p degrees of freedom.
# graphical assessment of multivariate normality
x <- as.matrix ( EuStockMarkets ) # n x p numeric matrix
center <- colMeans ( x ) # centroid
n <- nrow ( x ); p <- ncol ( x )
cov <- cov ( x )
d <- mahalanobis ( x , center , cov ) # distances
qqplot ( qchisq ( ppoints ( n ), df = p ), d , main = "QQ Plot Assessing Multivariate Normality" , ylab = "Mahalanobis D2" )
abline ( a = 0 , b = 1 )
Heteroscedasticity
Nonconstant error variance or non-homogeneity of variances.
# dataset
head ( mtcars , 3 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.62 16.5 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.88 17.0 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# y is a numeric variable and G is the grouping variable
# Bartlett parametric test of homogeneity of variances
bartlett.test ( mpg ~ cyl , data = mtcars )
##
## Bartlett test of homogeneity of variances
##
## data: mpg by cyl
## Bartlett's K-squared = 8, df = 2, p-value = 0.02
# Figner-Killeen non-parametric test of homogeneity of variances
fligner.test ( mpg ~ cyl , data = mtcars )
##
## Fligner-Killeen test of homogeneity of variances
##
## data: mpg by cyl
## Fligner-Killeen:med chi-squared = 7, df = 2, p-value = 0.03
library ( HH )
# y is numeric and G is a grouping factor
# G must be of type factor
hov ( mpg ~ factor ( cyl ), data = mtcars )
##
## hov: Brown-Forsyth
##
## data: mpg
## F = 6, df:factor(cyl) = 2, df:Residuals = 30, p-value = 0.009
## alternative hypothesis: variances are not identical
# homogeneity of variance plot
# graphic test of homogeneity of variances based on Brown-Forsyth
hovPlot ( mpg ~ factor ( cyl ), data = mtcars )
Non-homogeneity of covariance matrices
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## ---
## biotools version 3.0
# Box's M test
# very sensitive to violations of normality, leading to rejection in most typical cases
boxM ( iris [, -5 ], iris [, 5 ])
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: iris[, -5]
## Chi-Sq (approx.) = 100, df = 20, p-value <2e-16
Resampling Statistics
Independent k-sample location tests
# dataset
head ( mtcars , 3 )
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.62 16.5 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.88 17.0 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# exact Wilcoxon Mann Whitney rank sum test
# re-randomization or permutation based statistical tests
# where y is numeric and A is a binary factor
library ( coin )
wilcox_test ( mpg ~ factor ( am ), data = mtcars , distribution = "exact" )
##
## Exact Wilcoxon-Mann-Whitney Test
##
## data: mpg by factor(am) (0, 1)
## Z = -3, p-value = 0.001
## alternative hypothesis: true mu is not equal to 0
# lower case letters represent numerical variables and upper case letters represent categorical factors
Monte-Carlo simulations are available for all tests. Exact tests are available for 2 group procedures.
These tests do not assume random sampling from well-defined populations. They can be a reasonable alternative to classical procedures when test assumptions can not be met.
# one-way permutation test based on 9999 Monte-Carlo
# resamplings
# y is numeric and A is a categorical factor
library ( coin )
oneway_test ( mpg ~ factor ( am ), data = mtcars , distribution = approximate ( B = 9999 ))
##
## Approximative Two-Sample Fisher-Pitman Permutation Test
##
## data: mpg by factor(am) (0, 1)
## Z = -3, p-value = 5e-04
## alternative hypothesis: true mu is not equal to 0
Symmetry of a response for repeated measurements
# exact Wilcoxon signed rank test
# where y1 and y2 are repeated measures
library ( coin )
wilcoxsign_test ( mpg ~ factor ( am ), data = mtcars , distribution = "exact" )
##
## Exact Wilcoxon-Pratt Signed-Rank Test
##
## data: y by x (pos, neg)
## stratified by block
## Z = 5, p-value = 5e-10
## alternative hypothesis: true mu is not equal to 0
# Freidman Test based on 9999 Monte-Carlo resamplings.
# y is numeric, A is a grouping factor, and B is a
# blocking factor
friedman_test ( y ~ A | B , data = mydata , distribution = approximate ( B = 9999 ))
Independence of two numeric variables
# Spearman Test of independence based on 9999 Monte-Carlo
# resamplings. x and y are numeric variables
library ( coin )
spearman_test ( mpg ~ am , data = mtcars , distribution = approximate ( B = 9999 ))
##
## Approximative Spearman Correlation Test
##
## data: mpg by am
## Z = 3, p-value = 0.001
## alternative hypothesis: true rho is not equal to 0
Independence in contingency tables
## Grouped Data: uptake ~ conc | Plant
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
cases <- c ( 4 , 2 , 3 , 1 , 59 )
n <- sum ( cases )
cochran <- data.frame (
diphtheria = factor (
unlist ( rep ( list ( c ( 1 , 1 , 1 , 1 ),
c ( 1 , 1 , 0 , 1 ),
c ( 0 , 1 , 1 , 1 ),
c ( 0 , 1 , 0 , 1 ),
c ( 0 , 0 , 0 , 0 )),
cases ))
),
media = factor ( rep ( LETTERS [ 1 : 4 ], n )),
case = factor ( rep ( seq_len ( n ), each = 4 ))
)
head ( cochran )
## diphtheria media case
## 1 1 A 1
## 2 1 B 1
## 3 1 C 1
## 4 1 D 1
## 5 1 A 2
## 6 1 B 2
# independence in 2-way contingency table based on
# 9999 Monte-Carlo resamplings. A and B are factors
library ( coin )
chisq_test ( Plant ~ Type , data = CO2 , distribution = approximate ( B = 9999 ))
##
## Approximative Linear-by-Linear Association Test
##
## data: Plant (ordered) by Type (Quebec, Mississippi)
## Z = -8, p-value <2e-16
## alternative hypothesis: two.sided
# Cochran-Mantel-Haenzsel Test of 3-way Contingency Table
# based on 9999 Monte-Carlo resamplings. A, B, are factors
# and C is a stratefying factor
mh_test ( diphtheria ~ media | case , data = cochran , distribution = approximate ( B = 9999 ))
##
## Approximative Marginal Homogeneity Test
##
## data: diphtheria by
## media (A, B, C, D)
## stratified by case
## chi-squared = 8, p-value = 0.05
# linear by linear association test based on 9999
# Monte-Carlo resamplings
# A and B are ordered factors
library ( coin )
lbl_test ( Plant ~ Type , data = CO2 , distribution = approximate ( B = 9999 ))
##
## Approximative Linear-by-Linear Association Test
##
## data: Plant (ordered) by Type (Quebec, Mississippi)
## Z = -8, p-value <2e-16
## alternative hypothesis: two.sided
Many other univariate and multivariate tests are possible using the functions in the coin
package.