Please download the following files:

Stats sesstion

We need ‘rms’, ‘ggplot2’, and ‘car’ packages If you haven’t already installed these:

Load the packages

library(rms)

## Loading required package: Hmisc

## Loading required package: grid

## Loading required package: lattice

## Loading required package: survival

## Loading required package: splines

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

library(ggplot2)
library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:rms':
## 
##     vif

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

The data should be merged and ready to go from day 1. If not, here’s the code for it:

data1 <- read.csv("~/Downloads/messy_demographic.csv")  # put in the location of the downloaded data file 1
data2 <- read.csv("~/Downloads/messy_cognitive.csv")  # put in the location of the downloaded data file 2
data3 <- read.csv("~/Downloads/messy_genotype.csv")

data1[data1==""] <- NA
data1[data1=="missing"] <- NA
data1[data1=="9999"] <- NA

data1$age <- as.numeric(as.character(data1$age))
data1$ethnicity <- factor(data1$ethnicity,levels=c("Cauc","AA","As","In","Other"))
data1$sex <- factor(data1$sex, levels=c(0,1), labels=c("Male","Female"))
data1$dx <- factor(data1$dx, levels=c(0,1), labels=c("Control","Case"))

data2[data2==""] <- NA
data2[data2=="missing"] <- NA
data2[data2=="9999"] <- NA
data2$cog1 <- as.numeric(as.character(data2$cog1))
data2$cog2 <- as.numeric(as.character(data2$cog2))
data2$cog3 <- as.numeric(as.character(data2$cog3))
data2$subID <- gsub(data2$subID,pattern="subject",replacement="SUB_")

data3[data3==""] <- NA
data3[data3=="missing"] <- NA
data3[data3=="9999"] <- NA
data3$genotype <- factor(data3$genotype, levels=c(0,1,2), labels=c("AA","AG","GG"))
data3$subID <- gsub(data3$subID,pattern="subject",replacement="SUB_")

alldata <- merge(data1,data2,by.x="subject_ID",by.y="subID")
alldata <- merge(alldata,data3,by.x="subject_ID",by.y="subID")

HERE WE GO WITH Statistics!

RESEARCH AIM 4: Linear Regression

total_behaviour_score ~ age

Calculate a composite variable by combining multiple variables Note: new variables can be made easily (using dplyr’s mutate verb)

alldata$totalcog <- (alldata$cog1 + alldata$cog3) / alldata$cog2

Simple linear regression (two ways: base package and rms)

lm.base <- lm(data=alldata, totalcog ~ age)
lm.rms <- ols(data=alldata, totalcog ~ age)

Let’s compare the output’s

lm.base

## 
## Call:
## lm(formula = totalcog ~ age, data = alldata)
## 
## Coefficients:
## (Intercept)          age  
##     1.78587     -0.01219

summary(lm.base)

## 
## Call:
## lm(formula = totalcog ~ age, data = alldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0529 -0.5054 -0.1940  0.4352  2.3720 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.785875   0.172793  10.335  < 2e-16 ***
## age         -0.012192   0.003346  -3.644 0.000313 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6966 on 322 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.03961,    Adjusted R-squared:  0.03663 
## F-statistic: 13.28 on 1 and 322 DF,  p-value: 0.0003125

anova(lm.base)

## Analysis of Variance Table
## 
## Response: totalcog
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## age         1   6.445  6.4446   13.28 0.0003125 ***
## Residuals 322 156.259  0.4853                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note: to make the most out of rms package functionality, we need to store summary stats using the datadist() function. That way, when we call summary() on an ols() object (we just made one called “lm.rms”) it will give us useful info.

dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")

lm.rms

## 
## Linear Regression Model
## 
## ols(formula = totalcog ~ age, data = alldata)
## 
## Frequencies of Missing Values Due to Each Variable
## totalcog      age 
##       18        8 
## 
## 
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
## Obs      324    LR chi2     13.09    R2       0.040    
## sigma 0.6966    d.f.            1    R2 adj   0.037    
## d.f.     322    Pr(> chi2) 0.0003    g        0.160    
## 
## Residuals
## 
##     Min      1Q  Median      3Q     Max 
## -1.0529 -0.5054 -0.1940  0.4352  2.3720 
## 
##           Coef    S.E.   t     Pr(>|t|)
## Intercept  1.7859 0.1728 10.34 <0.0001 
## age       -0.0122 0.0033 -3.64 0.0003

summary(lm.rms)

##              Effects              Response : totalcog 
## 
##  Factor Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
##  age    43  58   15    -0.18  0.05 -0.28      -0.08

anova(lm.rms)

##                 Analysis of Variance          Response: totalcog 
## 
##  Factor     d.f. Partial SS MS        F     P    
##  age          1    6.444638 6.4446382 13.28 3e-04
##  REGRESSION   1    6.444638 6.4446382 13.28 3e-04
##  ERROR      322  156.258677 0.4852754

Visualize results using ggplot2

ggplot(data=alldata, aes(y=totalcog, x=age)) + 
    geom_point() + 
    geom_smooth(method=lm)

## Warning: Removed 26 rows containing missing values (stat_smooth).

## Warning: Removed 26 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

Challenge 1: add a title and change the axis labels

Visualize predicted results using rms

plot(Predict(lm.rms))

Check regression assumption of normal residuals

hist(resid(lm.rms))

They are not normal! We can look at this formally also:

shapiro.test(resid(lm.rms))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(lm.rms)
## W = 0.9158, p-value = 1.677e-12

Let’s look at and normalize our outcome…

two ways of looking at it (base and ggplot2):

ggplot(data=alldata, aes(x=totalcog)) + geom_histogram()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Let’s try transforming our variable using log and sqrt transformations to see if it helps:

alldata$totalcog_log <- log(alldata$totalcog)
alldata$totalcog_sqrt <- sqrt(alldata$totalcog)

Here’s a super neat way to optimize power transformations:

The powerTransform function will calculate the best transform for your data. It saves the best exponent as the value “lambda”

# calculate the best exponent using powerTransform:
pT <- powerTransform(alldata$totalcog)
# apply the power transform and save the result to a new variable
alldata$totalcog_pT <- alldata$totalcog^pT$lambda ## note ^ is exponent in r

let’s try our regression again with the transformed outcome (using rms):

Note: if we want to use summary() on our ols() object we will have to redefine datadist because we created new variables that were not in the original datadist object

dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")

Challenge 2: run a new regression (using ols) with the your new transformed cognitive score as the dependant variable

Challenge 3: use ggplot to make a new plot of this effect

RESEARCH AIM 5: total_behaviour_score ~ age + sex + genotype (multiple linear regression)

This is were we start to add covariates and do multiple regression

lm3 <- ols(data=alldata, totalcog_pT ~ age + genotype + sex)
anova(lm3)

##                 Analysis of Variance          Response: totalcog_pT 
## 
##  Factor     d.f. Partial SS   MS           F     P     
##  age          1  8.449950e-04 8.449950e-04 14.68 0.0002
##  genotype     2  6.737907e-05 3.368954e-05  0.59 0.5576
##  sex          1  2.106451e-04 2.106451e-04  3.66 0.0567
##  REGRESSION   4  1.115625e-03 2.789063e-04  4.85 0.0008
##  ERROR      309  1.778648e-02 5.756143e-05

summary(lm3)

##              Effects              Response : totalcog_pT 
## 
##  Factor            Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
##  age               43  58   15    0      0    0          0         
##  genotype - AA:AG   2   1   NA    0      0    0          0         
##  genotype - GG:AG   2   3   NA    0      0    0          0         
##  sex - Male:Female  2   1   NA    0      0    0          0

We can easily visualise the effects of each variable on the outcome using rms, but this feature does not give us plot points and is not very flexible:

plot(Predict(lm3))

Now let’s say we want to recode out genotype variable so that we have only 2 groups: those who “carry” the G allele, and those who do not carry it:

alldata$riskcarrier[alldata$genotype=="AG" | alldata$genotype=="GG"] <- "carrier"
alldata$riskcarrier[alldata$genotype=="AA"] <- "non-carrier"
alldata$riskcarrier <- as.factor(alldata$riskcarrier)

No we can re-run the model:

dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")
lm3 <- ols(data=alldata, totalcog_pT ~ age + sex + riskcarrier)
lm3

## 
## Linear Regression Model
## 
## ols(formula = totalcog_pT ~ age + sex + riskcarrier, data = alldata)
## 
## Frequencies of Missing Values Due to Each Variable
## totalcog_pT         age         sex riskcarrier 
##          18           8           4           8 
## 
## 
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
## Obs      314    LR chi2     17.93    R2       0.055    
## sigma 0.0076    d.f.            3    R2 adj   0.046    
## d.f.     310    Pr(> chi2) 0.0005    g        0.002    
## 
## Residuals
## 
##        Min         1Q     Median         3Q        Max 
## -1.761e-02 -5.765e-03 -7.513e-05  6.148e-03  1.767e-02 
## 
##                         Coef    S.E.   t      Pr(>|t|)
## Intercept                1.0080 0.0020 500.39 <0.0001 
## age                     -0.0001 0.0000  -3.83 0.0002  
## sex=Female              -0.0017 0.0009  -1.93 0.0548  
## riskcarrier=non-carrier  0.0001 0.0010   0.12 0.9074

anova(lm3)

##                 Analysis of Variance          Response: totalcog_pT 
## 
##  Factor      d.f. Partial SS   MS           F     P     
##  age           1  8.468547e-04 8.468547e-04 14.70 0.0002
##  sex           1  2.140203e-04 2.140203e-04  3.72 0.0548
##  riskcarrier   1  7.802082e-07 7.802082e-07  0.01 0.9074
##  REGRESSION    3  1.049026e-03 3.496754e-04  6.07 0.0005
##  ERROR       310  1.785308e-02 5.759059e-05

summary(lm3)

##              Effects              Response : totalcog_pT 
## 
##  Factor                            Low High Diff. Effect S.E. Lower 0.95
##  age                               43  58   15    0      0    0         
##  sex - Male:Female                  2   1   NA    0      0    0         
##  riskcarrier - non-carrier:carrier  1   2   NA    0      0    0         
##  Upper 0.95
##  0         
##  0         
##  0

RESEARCH AIM 5: Interaction!

total_behaviour_score ~ age*riskcarrier + sex

The concept of statistical interaction goes by many names and has many definitions. Simply this is the concept that the effect of one variable changes depending on the value of another variable.

Interaction is indicated in R formula syntax with a “:” or *, depending on if you want to automatically include the main effects of your interacting variables or not. As a general rule, always use *.

lm4 <- lm(totalcog_pT ~ sex + riskcarrier*age, data=alldata)
summary(lm4)

## 
## Call:
## lm(formula = totalcog_pT ~ sex + riskcarrier * age, data = alldata)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0176521 -0.0056722 -0.0004349  0.0057920  0.0181582 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 1.010e+00  2.301e-03 438.920  < 2e-16 ***
## sexFemale                  -1.727e-03  8.935e-04  -1.933   0.0541 .  
## riskcarriernon-carrier     -6.756e-03  4.504e-03  -1.500   0.1346    
## age                        -1.790e-04  4.372e-05  -4.095  5.4e-05 ***
## riskcarriernon-carrier:age  1.312e-04  8.402e-05   1.561   0.1195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007571 on 309 degrees of freedom
##   (36 observations deleted due to missingness)
## Multiple R-squared:  0.06289,    Adjusted R-squared:  0.05076 
## F-statistic: 5.184 on 4 and 309 DF,  p-value: 0.0004697

anova(lm4)

## Analysis of Variance Table
## 
## Response: totalcog_pT
##                  Df    Sum Sq    Mean Sq F value    Pr(>F)    
## sex               1 0.0001862 0.00018620  3.2482 0.0724769 .  
## riskcarrier       1 0.0000160 0.00001597  0.2786 0.5980138    
## age               1 0.0008469 0.00084685 14.7729 0.0001472 ***
## riskcarrier:age   1 0.0001397 0.00013968  2.4367 0.1195489    
## Residuals       309 0.0177134 0.00005732                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

BONUS SECTION - the “right” way to plot….

How to visualize a significant effect from our regression

####…Controlling for the other variables in the model….

To visualize a given effect more informatively, we want to caculate the residuals of the model lacking our co-varitate of interest and plot those residuals as our outcome:

For genotype we want a boxplot of model residuals:

lm3.plot <- ols(data=alldata, totalcog_pT ~ genotype + age)

ggplot(data=alldata, aes(y=resid(lm3.plot), x=sex)) + 
  geom_boxplot()

## Warning: Removed 33 rows containing non-finite values (stat_boxplot).

See it thinks NA is a value! That is because the ols() object stores NA input values as NA residuals, and ggplot2 sees these as another level to plot. Fix by re-running the model to exclude missing observations and plotting the data subset where NAs are excluded:

lm3.plot <- ols(data=subset(alldata,sex!="NA"), totalcog_pT ~ age + genotype)
ggplot(data=subset(alldata,sex!="NA"), aes(y=resid(lm3.plot), x=sex)) + 
  geom_boxplot()

## Warning: Removed 32 rows containing non-finite values (stat_boxplot).

Ror age we want a scatterplot of residuals. same subsetting principle applies:

lm3.plot <- ols(data=subset(alldata,age!="NA"), totalcog_pT ~ genotype + sex)
ggplot(data=subset(alldata,age!="NA"), aes(y=resid(lm3.plot), x=age)) + 
  geom_point() + 
  geom_smooth(method=lm)

## Warning: Removed 28 rows containing missing values (stat_smooth).

## Warning: Removed 28 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

CONCEPT: notice that if we include age in our model and plot age on the x-axis in our residual plot, the effect is lost - we have modeled it out:

lm3.plot <- ols(data=alldata, totalcog_pT ~ genotype + sex + age)
ggplot(data=alldata, aes(y=resid(lm3.plot), x=age)) + 
  geom_point() + 
  geom_smooth(method=lm)

## Warning: Removed 36 rows containing missing values (stat_smooth).

## Warning: Removed 36 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

To plot the interaction we just tell ggplo2 to plot the regression lines for ou

lm4.plot <- ols(data=subset(alldata,age!="NA" & riskcarrier!="NA"), totalcog_pT ~ sex)
ggplot(data=subset(alldata,age!="NA" & riskcarrier!="NA"), aes(y=resid(lm4.plot), x=age, col=riskcarrier)) + 
  geom_point() + 
  geom_smooth(method=lm)

## Warning: Removed 10 rows containing missing values (stat_smooth).

## Warning: Removed 11 rows containing missing values (stat_smooth).

## Warning: Removed 21 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

Stats with R tutorial

Erin W Dickie

September 11, 2016