Exploring data with R tutorial

Please download the following files:

We need ‘ggplot2’, ‘dplyr’, ‘tidyr’, and ‘broom’ packages If you haven’t already installed these:

Load the packages

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(broom)
library(knitr)

The data should be merged and ready to go from day 1. If not, here’s the code for it:

data1 <- read.csv("~/Downloads/messy_demographic.csv")  # put in the location of the downloaded data file 1
data2 <- read.csv("~/Downloads/messy_cognitive.csv")  # put in the location of the downloaded data file 2
data3 <- read.csv("~/Downloads/messy_genotype.csv")

data1[data1==""] <- NA
data1[data1=="missing"] <- NA
data1[data1=="9999"] <- NA

data1$age <- as.numeric(as.character(data1$age))
data1$ethnicity <- factor(data1$ethnicity,levels=c("Cauc","AA","As","In","Other"))
data1$sex <- factor(data1$sex, levels=c(0,1), labels=c("Male","Female"))
data1$dx <- factor(data1$dx, levels=c(0,1), labels=c("Control","Case"))

data2[data2==""] <- NA
data2[data2=="missing"] <- NA
data2[data2=="9999"] <- NA
data2$cog1 <- as.numeric(as.character(data2$cog1))
data2$cog2 <- as.numeric(as.character(data2$cog2))
data2$cog3 <- as.numeric(as.character(data2$cog3))
data2$subID <- gsub(data2$subID,pattern="subject",replacement="SUB_")

data3[data3==""] <- NA
data3[data3=="missing"] <- NA
data3[data3=="9999"] <- NA
data3$genotype <- factor(data3$genotype, levels=c(0,1,2), labels=c("AA","AG","GG"))
data3$subID <- gsub(data3$subID,pattern="subject",replacement="SUB_")

alldata <- merge(data1,data2,by.x="subject_ID",by.y="subID")
alldata <- merge(alldata,data3,by.x="subject_ID",by.y="subID")

## Write merged dataframe as new .csv
write.csv(alldata, "alldata.csv", row.names=F)

RESEARCH QUESTION 3:

cog1 ~ dx (t-test)

t.test(data=alldata, cog1 ~ dx)

## 
##  Welch Two Sample t-test
## 
## data:  cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.871388 -1.512676
## sample estimates:
## mean in group Control    mean in group Case 
##              9.940047             12.132079

Wow we have a result!! But one of my favorite things output R is that the statistical output can be saved as an object!!

my.t.result <- t.test(data=alldata, cog1 ~ dx) # saves to output to my.t.result

print(my.t.result)       ## prints to output to the console

## 
##  Welch Two Sample t-test
## 
## data:  cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.871388 -1.512676
## sample estimates:
## mean in group Control    mean in group Case 
##              9.940047             12.132079

my.t.result$statistic    ## gets us the t statistic!

##         t 
## -6.347017

my.t.result$parameter    ## the degrees of freedom

##       df 
## 334.9162

my.t.result$p.value      ## gets us the p-value

## [1] 7.13299e-10

round(my.t.result$statistic,2) ## we can these numbers using the "round" function

##     t 
## -6.35

Let’s put these three together into something we might want to report in our paper

my.t.results.txt = paste0('t(',
                        round(my.t.result$parameter,1),
                        ') = ',
                        round(my.t.result$statistic,2), ', p = ',
                        round(my.t.result$p.value, 7))

my.t.results.txt

## [1] "t(334.9) = -6.35, p = 0"

View this as basic boxplots (two ways: base package and ggplot2)

ggplot(data=alldata, aes(y=cog1,x=dx)) + 
    geom_boxplot()

## Warning: Removed 5 rows containing non-finite values (stat_boxplot).

Let’s make it fancier:

First - let’s deal with the NA’s we don’t want to plot - let’s remove them from the plotting dataset

data.toplot <-filter(alldata, !is.na(cog1), !is.na(dx))

ggplot(data.toplot, aes(y=cog1,x=dx)) + 
    geom_boxplot(outlier.shape=NA) + ## removes outliers from boxplot layer
    geom_jitter(alpha=0.5)           ## add dots on top

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page

## even fancier - let's add a title, and annotation and label the axes
ggplot(data.toplot, aes(y=cog1,x=dx)) + 
  geom_boxplot(outlier.shape=NA) + 
  geom_jitter(alpha=0.5) +
  labs(title="Effect of Diagnosis on Cog Score #1",
       y = "Cognitive Test 1",
       x = "Diagnosis") +
  annotate("text", label = my.t.results.txt, x = 1, y = 21) +
  theme_bw()

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page

NOW LET’s save our plot!!!

Note: we can start by using the “Export” button in the plots tab..

ggsave('figure1_ttestresults.pdf', width = 5, height = 5)

Let’s make a diagnosis by cognition table

my.stats.table <- summarise(alldata,
                            "Mean" = mean(cog1, na.rm = T),
                            "St Dev" = sd(cog1, na.rm = T))

my.stats.table <- alldata %>% 
                  group_by(dx,sex) %>%
                  summarise("Mean" = mean(cog1, na.rm = T),
                            "St Dev" = sd(cog1, na.rm = T))

kable(my.stats.table)

dx	sex	Mean	St Dev
Control	Male	10.476754	2.7469625
Control	Female	9.746893	2.9514609
Control	NA	12.254830	1.5465560
Case	Male	12.215568	3.5483599
Case	Female	12.022392	3.3859637
Case	NA	13.036707	0.1736939
NA	Male	11.868654	1.0237882
NA	Female	10.580708	2.3376551

reshaping data to look at all three cognitive scores

but let’s look at all three subscales….

## let's add a rick carrier
alldata$riskcarrier[alldata$genotype=="AG" | alldata$genotype=="GG"] <- "carrier"
alldata$riskcarrier[alldata$genotype=="AA"] <- "non-carrier"
alldata$riskcarrier <- as.factor(alldata$riskcarrier)

alldata_melted <- gather(alldata, cog_var, cognitive.score, 
                  -subject_ID, -age, -sex, -ethnicity, -dx, -genotype, -riskcarrier)

toplot <- filter(alldata_melted, !is.na(cognitive.score), 
                                 !is.na(dx), 
                                 !is.na(riskcarrier))

ggplot(toplot, aes(y=cognitive.score,x=age,color=riskcarrier)) + 
  geom_point() + 
  geom_smooth(method=lm) + 
  facet_wrap(~cog_var, scales = "free")

## Warning: Removed 5 rows containing missing values (stat_smooth).

## Warning: Removed 2 rows containing missing values (stat_smooth).

## Warning: Removed 5 rows containing missing values (stat_smooth).

## Warning: Removed 2 rows containing missing values (stat_smooth).

## Warning: Removed 5 rows containing missing values (stat_smooth).

## Warning: Removed 2 rows containing missing values (stat_smooth).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_point).

## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page

LAST CHALLENGE: can you use dplyr to present these statistics in a table??

LAST BUT NOT LEAST, running a loop to screen for significance at multiple tests (manhattan plot)

## add stargazer descriptive stats table?
cog_dems_table <- alldata_melted  %>% 
                  filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
                  group_by(dx, cog_var, riskcarrier) %>%
                  summarise("Mean" = mean(cognitive.score, na.rm = T),
                            "St Dev" = sd(cognitive.score, na.rm = T))

kable(cog_dems_table)

dx	cog_var	riskcarrier	Mean	St Dev
Control	cog1	carrier	9.367999	2.745134
Control	cog1	non-carrier	11.545711	2.884425
Control	cog2	carrier	31.439535	4.015106
Control	cog2	non-carrier	31.845074	3.675566
Control	cog3	carrier	24.442639	20.988259
Control	cog3	non-carrier	25.761328	24.178563
Case	cog1	carrier	11.549411	3.450849
Case	cog1	non-carrier	13.418635	3.126957
Case	cog2	carrier	31.405908	3.921310
Case	cog2	non-carrier	32.569693	3.489469
Case	cog3	carrier	26.178213	20.660281
Case	cog3	non-carrier	23.402200	20.367174

regress_results <- alldata_melted %>%
                    filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
                    group_by(cog_var) %>%
                    do(tidy(lm(cognitive.score ~ dx*riskcarrier*age, .)))

kable(regress_results)

cog_var	term	estimate	std.error	statistic	p.value
cog1	(Intercept)	11.9264294	1.1672595	10.2174617	0.0000000
cog1	dxCase	2.8394090	1.7698966	1.6042796	0.1096482
cog1	riskcarriernon-carrier	-1.0538028	2.6306132	-0.4005921	0.6889903
cog1	age	-0.0520558	0.0226907	-2.2941403	0.0224360
cog1	dxCase:riskcarriernon-carrier	-4.6365570	3.6489450	-1.2706569	0.2047825
cog1	dxCase:age	-0.0134964	0.0351062	-0.3844435	0.7009075
cog1	riskcarriernon-carrier:age	0.0650805	0.0501192	1.2985151	0.1950542
cog1	dxCase:riskcarriernon-carrier:age	0.0785818	0.0685391	1.1465256	0.2524424
cog2	(Intercept)	19.9055562	1.1676401	17.0476815	0.0000000
cog2	dxCase	0.8053476	1.7868704	0.4507029	0.6525118
cog2	riskcarriernon-carrier	0.7816939	2.6015980	0.3004668	0.7640181
cog2	age	0.2308120	0.0226252	10.2015568	0.0000000
cog2	dxCase:riskcarriernon-carrier	1.1478888	3.6265195	0.3165263	0.7518114
cog2	dxCase:age	-0.0128973	0.0353055	-0.3653057	0.7151268
cog2	riskcarriernon-carrier:age	-0.0156561	0.0493271	-0.3173933	0.7511541
cog2	dxCase:riskcarriernon-carrier:age	-0.0217776	0.0679281	-0.3205975	0.7487267
cog3	(Intercept)	33.6200213	8.2169195	4.0915602	0.0000546
cog3	dxCase	-10.1585421	12.4897860	-0.8133480	0.4166368
cog3	riskcarriernon-carrier	22.2201568	18.6311019	1.1926378	0.2339146
cog3	age	-0.1834078	0.1600371	-1.1460333	0.2526568
cog3	dxCase:riskcarriernon-carrier	-34.6621556	25.8192409	-1.3424932	0.1804090
cog3	dxCase:age	0.2483313	0.2484239	0.9996272	0.3182631
cog3	riskcarriernon-carrier:age	-0.4091669	0.3562594	-1.1485084	0.2516357
cog3	dxCase:riskcarriernon-carrier:age	0.5742907	0.4857384	1.1823044	0.2379823

#Let's just look at the carier by age interaction term

age_regress_results <- regress_results %>% filter(term == "age") 
age_regress_results$p.fdr = p.adjust(age_regress_results$p.value, method = "fdr")
kable(age_regress_results)

cog_var	term	estimate	std.error	statistic	p.value	p.fdr
cog1	age	-0.0520558	0.0226907	-2.294140	0.0224360	0.0336540
cog2	age	0.2308120	0.0226252	10.201557	0.0000000	0.0000000
cog3	age	-0.1834078	0.1600371	-1.146033	0.2526568	0.2526568

age_regress_results$p.bonferonni = p.adjust(age_regress_results$p.value, method = "bonferroni")

kable(age_regress_results)

cog_var	term	estimate	std.error	statistic	p.value	p.fdr	p.bonferonni
cog1	age	-0.0520558	0.0226907	-2.294140	0.0224360	0.0336540	0.0673081
cog2	age	0.2308120	0.0226252	10.201557	0.0000000	0.0000000	0.0000000
cog3	age	-0.1834078	0.1600371	-1.146033	0.2526568	0.2526568	0.7579704