Return to SCWG Home


Please download the following files:

  1. messy_demographic.csv
  2. messy_cognitive.csv
  3. messy_genotype.csv

We need ‘ggplot2’, ‘dplyr’, ‘tidyr’, and ‘broom’ packages If you haven’t already installed these:

Load the packages

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(broom)
library(knitr)

The data should be merged and ready to go from day 1. If not, here’s the code for it:

data1 <- read.csv("~/Downloads/messy_demographic.csv")  # put in the location of the downloaded data file 1
data2 <- read.csv("~/Downloads/messy_cognitive.csv")  # put in the location of the downloaded data file 2
data3 <- read.csv("~/Downloads/messy_genotype.csv")
data1[data1==""] <- NA
data1[data1=="missing"] <- NA
data1[data1=="9999"] <- NA

data1$age <- as.numeric(as.character(data1$age))
data1$ethnicity <- factor(data1$ethnicity,levels=c("Cauc","AA","As","In","Other"))
data1$sex <- factor(data1$sex, levels=c(0,1), labels=c("Male","Female"))
data1$dx <- factor(data1$dx, levels=c(0,1), labels=c("Control","Case"))

data2[data2==""] <- NA
data2[data2=="missing"] <- NA
data2[data2=="9999"] <- NA
data2$cog1 <- as.numeric(as.character(data2$cog1))
data2$cog2 <- as.numeric(as.character(data2$cog2))
data2$cog3 <- as.numeric(as.character(data2$cog3))
data2$subID <- gsub(data2$subID,pattern="subject",replacement="SUB_")

data3[data3==""] <- NA
data3[data3=="missing"] <- NA
data3[data3=="9999"] <- NA
data3$genotype <- factor(data3$genotype, levels=c(0,1,2), labels=c("AA","AG","GG"))
data3$subID <- gsub(data3$subID,pattern="subject",replacement="SUB_")

alldata <- merge(data1,data2,by.x="subject_ID",by.y="subID")
alldata <- merge(alldata,data3,by.x="subject_ID",by.y="subID")

## Write merged dataframe as new .csv
write.csv(alldata, "alldata.csv", row.names=F)

RESEARCH QUESTION 3:

cog1 ~ dx (t-test)

t.test(data=alldata, cog1 ~ dx)
## 
##  Welch Two Sample t-test
## 
## data:  cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.871388 -1.512676
## sample estimates:
## mean in group Control    mean in group Case 
##              9.940047             12.132079

Wow we have a result!! But one of my favorite things output R is that the statistical output can be saved as an object!!

my.t.result <- t.test(data=alldata, cog1 ~ dx) # saves to output to my.t.result

print(my.t.result)       ## prints to output to the console
## 
##  Welch Two Sample t-test
## 
## data:  cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.871388 -1.512676
## sample estimates:
## mean in group Control    mean in group Case 
##              9.940047             12.132079
my.t.result$statistic    ## gets us the t statistic!
##         t 
## -6.347017
my.t.result$parameter    ## the degrees of freedom
##       df 
## 334.9162
my.t.result$p.value      ## gets us the p-value
## [1] 7.13299e-10
round(my.t.result$statistic,2) ## we can these numbers using the "round" function
##     t 
## -6.35

Let’s put these three together into something we might want to report in our paper

my.t.results.txt = paste0('t(',
                        round(my.t.result$parameter,1),
                        ') = ',
                        round(my.t.result$statistic,2), ', p = ',
                        round(my.t.result$p.value, 7))

my.t.results.txt
## [1] "t(334.9) = -6.35, p = 0"

View this as basic boxplots (two ways: base package and ggplot2)

ggplot(data=alldata, aes(y=cog1,x=dx)) + 
    geom_boxplot()
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

Let’s make it fancier:

First - let’s deal with the NA’s we don’t want to plot - let’s remove them from the plotting dataset

data.toplot <-filter(alldata, !is.na(cog1), !is.na(dx))

ggplot(data.toplot, aes(y=cog1,x=dx)) + 
    geom_boxplot(outlier.shape=NA) + ## removes outliers from boxplot layer
    geom_jitter(alpha=0.5)           ## add dots on top
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page

## even fancier - let's add a title, and annotation and label the axes
ggplot(data.toplot, aes(y=cog1,x=dx)) + 
  geom_boxplot(outlier.shape=NA) + 
  geom_jitter(alpha=0.5) +
  labs(title="Effect of Diagnosis on Cog Score #1",
       y = "Cognitive Test 1",
       x = "Diagnosis") +
  annotate("text", label = my.t.results.txt, x = 1, y = 21) +
  theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page

NOW LET’s save our plot!!!

Note: we can start by using the “Export” button in the plots tab..

ggsave('figure1_ttestresults.pdf', width = 5, height = 5)

Let’s make a diagnosis by cognition table

my.stats.table <- summarise(alldata,
                            "Mean" = mean(cog1, na.rm = T),
                            "St Dev" = sd(cog1, na.rm = T))

my.stats.table <- alldata %>% 
                  group_by(dx,sex) %>%
                  summarise("Mean" = mean(cog1, na.rm = T),
                            "St Dev" = sd(cog1, na.rm = T))

kable(my.stats.table)
dx sex Mean St Dev
Control Male 10.476754 2.7469625
Control Female 9.746893 2.9514609
Control NA 12.254830 1.5465560
Case Male 12.215568 3.5483599
Case Female 12.022392 3.3859637
Case NA 13.036707 0.1736939
NA Male 11.868654 1.0237882
NA Female 10.580708 2.3376551

reshaping data to look at all three cognitive scores

but let’s look at all three subscales….

## let's add a rick carrier
alldata$riskcarrier[alldata$genotype=="AG" | alldata$genotype=="GG"] <- "carrier"
alldata$riskcarrier[alldata$genotype=="AA"] <- "non-carrier"
alldata$riskcarrier <- as.factor(alldata$riskcarrier)

alldata_melted <- gather(alldata, cog_var, cognitive.score, 
                  -subject_ID, -age, -sex, -ethnicity, -dx, -genotype, -riskcarrier)

toplot <- filter(alldata_melted, !is.na(cognitive.score), 
                                 !is.na(dx), 
                                 !is.na(riskcarrier))

ggplot(toplot, aes(y=cognitive.score,x=age,color=riskcarrier)) + 
  geom_point() + 
  geom_smooth(method=lm) + 
  facet_wrap(~cog_var, scales = "free")
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

LAST CHALLENGE: can you use dplyr to present these statistics in a table??

LAST BUT NOT LEAST, running a loop to screen for significance at multiple tests (manhattan plot)

## add stargazer descriptive stats table?
cog_dems_table <- alldata_melted  %>% 
                  filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
                  group_by(dx, cog_var, riskcarrier) %>%
                  summarise("Mean" = mean(cognitive.score, na.rm = T),
                            "St Dev" = sd(cognitive.score, na.rm = T))

kable(cog_dems_table)
dx cog_var riskcarrier Mean St Dev
Control cog1 carrier 9.367999 2.745134
Control cog1 non-carrier 11.545711 2.884425
Control cog2 carrier 31.439535 4.015106
Control cog2 non-carrier 31.845074 3.675566
Control cog3 carrier 24.442639 20.988259
Control cog3 non-carrier 25.761328 24.178563
Case cog1 carrier 11.549411 3.450849
Case cog1 non-carrier 13.418635 3.126957
Case cog2 carrier 31.405908 3.921310
Case cog2 non-carrier 32.569693 3.489469
Case cog3 carrier 26.178213 20.660281
Case cog3 non-carrier 23.402200 20.367174
regress_results <- alldata_melted %>%
                    filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
                    group_by(cog_var) %>%
                    do(tidy(lm(cognitive.score ~ dx*riskcarrier*age, .)))

kable(regress_results)
cog_var term estimate std.error statistic p.value
cog1 (Intercept) 11.9264294 1.1672595 10.2174617 0.0000000
cog1 dxCase 2.8394090 1.7698966 1.6042796 0.1096482
cog1 riskcarriernon-carrier -1.0538028 2.6306132 -0.4005921 0.6889903
cog1 age -0.0520558 0.0226907 -2.2941403 0.0224360
cog1 dxCase:riskcarriernon-carrier -4.6365570 3.6489450 -1.2706569 0.2047825
cog1 dxCase:age -0.0134964 0.0351062 -0.3844435 0.7009075
cog1 riskcarriernon-carrier:age 0.0650805 0.0501192 1.2985151 0.1950542
cog1 dxCase:riskcarriernon-carrier:age 0.0785818 0.0685391 1.1465256 0.2524424
cog2 (Intercept) 19.9055562 1.1676401 17.0476815 0.0000000
cog2 dxCase 0.8053476 1.7868704 0.4507029 0.6525118
cog2 riskcarriernon-carrier 0.7816939 2.6015980 0.3004668 0.7640181
cog2 age 0.2308120 0.0226252 10.2015568 0.0000000
cog2 dxCase:riskcarriernon-carrier 1.1478888 3.6265195 0.3165263 0.7518114
cog2 dxCase:age -0.0128973 0.0353055 -0.3653057 0.7151268
cog2 riskcarriernon-carrier:age -0.0156561 0.0493271 -0.3173933 0.7511541
cog2 dxCase:riskcarriernon-carrier:age -0.0217776 0.0679281 -0.3205975 0.7487267
cog3 (Intercept) 33.6200213 8.2169195 4.0915602 0.0000546
cog3 dxCase -10.1585421 12.4897860 -0.8133480 0.4166368
cog3 riskcarriernon-carrier 22.2201568 18.6311019 1.1926378 0.2339146
cog3 age -0.1834078 0.1600371 -1.1460333 0.2526568
cog3 dxCase:riskcarriernon-carrier -34.6621556 25.8192409 -1.3424932 0.1804090
cog3 dxCase:age 0.2483313 0.2484239 0.9996272 0.3182631
cog3 riskcarriernon-carrier:age -0.4091669 0.3562594 -1.1485084 0.2516357
cog3 dxCase:riskcarriernon-carrier:age 0.5742907 0.4857384 1.1823044 0.2379823
#Let's just look at the carier by age interaction term

age_regress_results <- regress_results %>% filter(term == "age") 
age_regress_results$p.fdr = p.adjust(age_regress_results$p.value, method = "fdr")
kable(age_regress_results)
cog_var term estimate std.error statistic p.value p.fdr
cog1 age -0.0520558 0.0226907 -2.294140 0.0224360 0.0336540
cog2 age 0.2308120 0.0226252 10.201557 0.0000000 0.0000000
cog3 age -0.1834078 0.1600371 -1.146033 0.2526568 0.2526568
age_regress_results$p.bonferonni = p.adjust(age_regress_results$p.value, method = "bonferroni")

kable(age_regress_results)
cog_var term estimate std.error statistic p.value p.fdr p.bonferonni
cog1 age -0.0520558 0.0226907 -2.294140 0.0224360 0.0336540 0.0673081
cog2 age 0.2308120 0.0226252 10.201557 0.0000000 0.0000000 0.0000000
cog3 age -0.1834078 0.1600371 -1.146033 0.2526568 0.2526568 0.7579704