Welcome to the Tidyverse
Charlie Murtaugh
EIHG 4420B
801-581-5958
murtaugh@genetics.utah.edu
https://twitter.com/mostbiggestdata/status/1033725572462137344
Recommended reading
Outline
What is the tidyverse?
Hadley Wickham
Chief Scientist, RStudio
Tidy data
In tidy data:
Wickham, J. Stat. Software 2014
Usefulness of tidy data
Wickham, J. Stat. Software 2014
Usefulness of tidy data
Wickham, J. Stat. Software 2014
aka “narrow” data
(tidyverse pivot_longer() function)
Tidy data approach
Getting started with Tidyverse functions
%>% “pipe” output from one function to another
pivot_longer convert spreadsheet-type data to tidy format
(aka gather)
separate split up single descriptor variable (e.g. spreadsheet
column head) into multiple variables
group_by organize data according to descriptor variables
summarize extract summary information from grouped data
filter isolate subsets of data
Creating a toy dataset – tibble format
library(tidyverse)
library(cowplot)
# Build a toy 3-row tibble column-by-column; piping into print() both
# assigns and displays the result in one statement.
# NOTE(review): tibble(sample = ..., temp = ...) is the more direct
# constructor for this; bind_cols() on bare vectors also works here.
temp_df <- bind_cols(sample=c(1,2,3), temp=c(-40, 32, 98.6)) %>%
print()
## # A tibble: 3 x 2
## sample temp
## <dbl> <dbl>
## 1 1 -40
## 2 2 32
## 3 3 98.6
Piping your code for easier writing and reading
# same result, different ways to get there
# Step-by-step version: name each intermediate result explicitly.
test <- c(1, 2, 3, 4)
test_sqrt <- sqrt(test)
print(test_sqrt)
# Piped version: %>% feeds each result in as the first argument of the
# next call, so no intermediate variables are needed.
c(1, 2, 3, 4) %>% sqrt() %>% print()
[1] 1.000000 1.414214 1.732051 2.000000
Piping your code for easier writing and reading
# same result, different ways to get there
test <- c(1, 2, 3, 4)
test_sqrt <- sqrt(test)
print(test_sqrt)
# Equivalent one-liner using the magrittr pipe.
c(1, 2, 3, 4) %>% sqrt() %>% print()
[1] 1.000000 1.414214 1.732051 2.000000
Creating new columns or changing existing ones with mutate
# use "mutate" to create new column with temperature in Celsius
# (same computation, written as a multi-line pipeline)
temp_df <- temp_df %>%
  mutate(tempC = (temp - 32) * 5 / 9) %>%
  print()
## # A tibble: 3 x 3
## sample temp tempC
## <dbl> <dbl> <dbl>
## 1 1 -40 -40
## 2 2 32 0
## 3 3 98.6 37
One mutate, multiple operations
# create toy data, again
temp_df <- bind_cols(time=c(1,2,3), temp=c(-40, 32, 98.6))
# now let's add both Celsius and Kelvin temperatures in one command
# Later arguments of a single mutate() can reference columns created by
# earlier ones: tempK is computed from the just-created tempC.
temp_df <- mutate(temp_df, tempC=(temp-32)*5/9,
tempK=tempC+273.15) %>%
rename(tempF = temp) %>%
print()
## # A tibble: 3 x 4
## time tempF tempC tempK
## <dbl> <dbl> <dbl> <dbl>
## 1 1 -40 -40 233.
## 2 2 32 0 273.
## 3 3 98.6 37 310.
Summarizing data with summarize – a toy example
# Load the built-in fuel-economy dataset (attached with the tidyverse's
# ggplot2); the printout below shows it is a 234 x 11 tibble.
data(mpg)
print(mpg) # just to check what the data look like
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
Tidyverse_lecture_proliferation_code_2022.R
group_by: organize data based on descriptor variable
## # A tibble: 234 x 11
## # Groups: cyl [4]
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
# group_by() adds grouping metadata (here: 4 cyl groups, per the printout
# above) without changing any rows; later verbs then operate per group.
mpg_by_cyl <- group_by(mpg, cyl) %>% print()
summarize: perform functions on groups within dataset
## # A tibble: 4 x 6
## cyl n hwy_mean hwy_sd displ_mean displ_sd
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 4 81 28.8 4.52 2.15 0.315
## 2 5 4 28.8 0.5 2.5 0
## 3 6 79 22.8 3.69 3.41 0.472
## 4 8 70 17.6 3.26 5.13 0.589
# One summary row per cyl group: group size (n) plus mean/sd of highway
# mileage and engine displacement.
mpg_cyl_summarize <- summarize(mpg_by_cyl, n=n(),
hwy_mean=mean(hwy),
hwy_sd=sd(hwy),
displ_mean=mean(displ),
displ_sd=sd(displ))
print(mpg_cyl_summarize)
filter to look at specific subsets of data
# Keep automatic-transmission cars (trans strings containing "auto"),
# then the top quartile by highway mileage, then count models per maker.
mpg_auto <- filter(mpg, str_detect(trans, 'auto'))
mpg_auto_best <- filter(mpg_auto, hwy > quantile(hwy, 0.75))
mpg_auto_best_who <- count(mpg_auto_best, manufacturer) %>% print()
## # A tibble: 9 x 2
## manufacturer n
## <chr> <int>
## 1 audi 4
## 2 chevrolet 3
## 3 honda 4
## 4 hyundai 3
## 5 nissan 2
## 6 pontiac 2
## 7 subaru 1
## 8 toyota 9
## 9 volkswagen 7
filter to look at specific subsets of data
# Subset automatics, keep the top hwy quartile, count models per maker
# (shown again for reference).
mpg_auto <- filter(mpg, str_detect(trans, 'auto'))
mpg_auto_best <- filter(mpg_auto, hwy > quantile(hwy, 0.75))
mpg_auto_best_who <- count(mpg_auto_best, manufacturer) %>% print()
## # A tibble: 9 x 2
## manufacturer n
## <chr> <int>
## 1 audi 4
## 2 chevrolet 3
## 3 honda 4
## 4 hyundai 3
## 5 nissan 2
## 6 pontiac 2
## 7 subaru 1
## 8 toyota 9
## 9 volkswagen 7
Making it simpler with the pipe
# Pipeline version: filter automatics, keep the top-quartile hwy
# performers, tally models per manufacturer, and bar-plot the counts.
# geom_col() is the direct equivalent of geom_bar(stat='identity');
# labs() combines the xlab()/ylab() calls.
mpg %>%
  filter(str_detect(trans, 'auto')) %>%
  filter(hwy > quantile(hwy, 0.75)) %>%
  count(manufacturer) %>%
  ggplot(aes(x=manufacturer, y=n)) +
  geom_col() +
  labs(x='manufacturer', y='number of models') +
  theme(axis.text.x=element_text(angle = 45, hjust=1))
Making it simpler with the pipe
# Same pipeline, but sorted: arrange by descending count, then lock that
# order in by re-leveling manufacturer as a factor (ggplot otherwise
# orders the x axis alphabetically).
# NOTE(review): forcats::fct_reorder(manufacturer, n, .desc = TRUE) is
# the idiomatic alternative to the arrange + factor(levels=) trick.
filter(mpg, str_detect(trans, 'auto')) %>%
filter(hwy > quantile(hwy, 0.75)) %>%
count(manufacturer) %>%
arrange(desc(n)) %>%
mutate(manufacturer=factor(manufacturer, levels=manufacturer)) %>%
ggplot(aes(x=manufacturer, y=n)) +
geom_bar(stat='identity') +
xlab('manufacturer') +
ylab('number of models') +
theme(axis.text.x=element_text(angle = 45, hjust=1))
ggplot2 package – the “grammar of graphics”
Analyzing proliferation data – wrangling, transforming and visualizing
“3T3 assay” – measuring cumulative population growth over time
Todaro and Green, J Cell Biol 1963
Original data in Excel spreadsheet
# cells plated at start of first passage (x105)
# cells present in dish at end of first passage (3 days later) (x105)
Load data into R and take a quick look
# Read the raw spreadsheet and show it.
# NOTE(review): read_excel() is from readxl, which library(tidyverse)
# does NOT attach -- confirm library(readxl) is loaded elsewhere.
pdac <- read_excel('PDAC_3T3_data_simplified.xlsx') %>% print()
# Missing treatment entries mean "untreated"; make that explicit.
pdac <- mutate(pdac, treatment = replace_na(treatment, 'untreated'))
# Set factor level order so 'untreated'/'EGFP' are the reference levels
# (controls plotting order and later model contrasts).
pdac <- mutate(pdac, treatment=factor(treatment, levels = c('untreated', 'dox')),
virus=factor(virus, levels = c('EGFP', 'Ptf1a')))
convert data from wide format to narrow/tidy with pivot_longer*
# Stack every column whose name contains 'plating_' or 'harvest_' into
# two long columns: 'observation' holds the old column name, 'cell_num'
# holds the value.
pdac_tidy <- pivot_longer(pdac,
contains(c('plating_', 'harvest_')),
names_to='observation',
values_to='cell_num') %>% print()
* function formerly known as gather
convert data from wide format to narrow/tidy with pivot_longer*
# pivot_longer, shown again: old column names go to 'observation',
# values go to 'cell_num'.
pdac_tidy <- pivot_longer(pdac,
contains(c('plating_', 'harvest_')),
names_to='observation',
values_to='cell_num') %>% print()
* function formerly known as gather
# Drop columns not needed downstream, then discard rows with no count.
# get rid of unnecessary variables
pdac_tidy <- select(pdac_tidy, -plating, -sample)
# remove any missing elements
pdac_tidy <- filter(pdac_tidy, !is.na(cell_num)) %>% print()
Split observation variable into multiple variables with separate
# Split e.g. "harvest_3" into observation ("harvest") and passage_num (3).
# convert=TRUE coerces passage_num from character to integer.
# (fix: spelled out TRUE instead of T -- T is an ordinary variable that
# can be reassigned, so bare T/F is unsafe in R code)
pdac_tidy <- separate(pdac_tidy, observation,
                      into=c('observation', 'passage_num'),
                      convert=TRUE) %>% print()
Let’s make a graph (Figure 1)
# nice palette for graphing GFP (green) vs Ptf1a (orange)
# (fix: the closing quote on 'orangered' was a typographic right quote,
# which is a syntax error in R)
palette <- c('green3', 'orangered')
# Plot harvest cell numbers per passage: one line per
# experiment/virus/treatment combination, one panel per cell line.
filter(pdac_tidy, observation=='harvest') %>%
  ggplot(aes(x=passage_num, y=cell_num,
             group=interaction(experiment, virus, treatment),
             col=virus, lty=treatment, pch=treatment)) +
  geom_line(size=0.75) +
  geom_point(alpha=0.4) +
  scale_color_manual(values=palette) +
  scale_shape_manual(values=c(1,16)) +
  scale_linetype_manual(values=c(3,1)) +
  facet_wrap(~line, scales='free') +
  theme_bw()
Let’s make a graph (Figure 1)
Let’s convert from absolute cell number to fold-increase (relative to # plated)
# let's make the data wider again, temporarily
pdac_fold <- pivot_wider(pdac_tidy, names_from=observation, values_from=cell_num) %>% print()
# Fold-change per passage; .keep='unused' drops the harvest/plating
# columns once they have been consumed.
# (fix: the closing quote on 'unused' was a typographic right quote,
# which is a syntax error in R)
pdac_fold <- mutate(pdac_fold, fold_change=harvest/plating, .keep='unused')
# let's convert fold-change to population doublings, via log2
pdac_fold <- mutate(pdac_fold, doublings=log2(fold_change)) %>% print()
Let’s make a graph (Figure 2)
# Plot per-passage population doublings (log2 fold-change).
# NOTE(review): scale_shape_manual() has no effect here -- unlike
# Figure 1, no shape (pch) aesthetic is mapped in aes().
ggplot(pdac_fold, aes(x=passage_num, y=doublings,
group=interaction(experiment, virus, treatment),
col=virus, lty=treatment)) +
geom_line(size=0.75) +
geom_point(alpha=0.4) +
scale_color_manual(values=palette) +
scale_shape_manual(values=c(1,16)) +
scale_linetype_manual(values=c(3,1)) +
facet_wrap(~line, scales='free') +
theme_bw()
Let’s generate an actual growth curve by calculating the cumulative sum of population doublings (Figure 3)
# Cumulative doublings within each growth curve: group by every variable
# identifying a single curve, cumsum along passages, then ungroup so the
# grouping does not leak into later operations.
# NOTE(review): cumsum() assumes rows are already sorted by passage_num
# within each curve -- confirm, or arrange() first.
pdac_fold <- group_by(pdac_fold, line, experiment, virus, treatment) %>%
mutate(cuml_doublings=cumsum(doublings)) %>% ungroup() %>% print()
# how does this look when plotted?
ggplot(pdac_fold, aes(x=passage_num, y=cuml_doublings,
group=interaction(experiment, virus, treatment),
col=virus, lty=treatment)) +
geom_line(size=0.75) +
geom_point(alpha=0.4) +
scale_color_manual(values=palette) +
scale_shape_manual(values=c(1,16)) +
scale_linetype_manual(values=c(3,1)) +
facet_wrap(~line, scales='free') +
theme_bw()
Let’s generate an actual growth curve by calculating the cumulative sum of population doublings (Figure 3)
Instead of plotting individual lines for each experiment, let’s plot means of independent experiments
# now let's calculate the mean cumulative doublings (and std deviation) for each cell line, across experiments
# .groups='drop' returns an ungrouped tibble and silences dplyr's
# regrouping message.
pdac_mean <- group_by(pdac_fold, line, virus, treatment, passage_num) %>%
summarize(cuml_mean=mean(cuml_doublings),
cuml_sd=sd(cuml_doublings),
.groups='drop') %>%
print()
Now: plot the mean population growth as line, with error bars indicating SDs (Figure 4)
# Mean growth curves with +/- 1 SD error bars, faceted by cell line.
ggplot(pdac_mean, aes(x=passage_num, y=cuml_mean,
group=interaction(virus, treatment),
col=virus, lty=treatment)) +
geom_line(size=0.75) +
geom_point(alpha=0.4) +
geom_errorbar(aes(ymin=cuml_mean-cuml_sd, ymax=cuml_mean+cuml_sd), width=0.1) +
scale_color_manual(values=palette) +
scale_shape_manual(values=c(1,16)) +
scale_linetype_manual(values=c(3,1)) +
facet_wrap(~line, scales='free') +
theme_bw()
Now: plot the mean population growth as line, with error bars indicating SDs (Figure 4)
Problem: our individual point values and our mean calculations are in different data tables
pdac_mean
pdac_fold
ggplot can combine elements with coordinates specified by multiple data sources
# Layers can draw from different data frames: the lines use the
# ggplot() default (pdac_mean), while geom_point() overrides data= to
# overlay the per-experiment points from pdac_fold.
ggplot(pdac_mean, aes(x=passage_num, y=cuml_mean,
group=interaction(virus, treatment),
col=virus, lty=treatment)) +
geom_line(size=0.75) +
geom_point(data=pdac_fold, aes(x=passage_num, y=cuml_doublings), alpha=0.4) +
scale_color_manual(values=palette) +
scale_shape_manual(values=c(1,16)) +
scale_linetype_manual(values=c(3,1)) +
facet_wrap(~line, scales='free') +
theme_bw()
Figure 4 – mean growth curves together with individual data points
Endpoint analysis: analyze interaction between cell line, virus and treatment at last timepoint
# Keep only each cell line's final passage (max(passage_num) is computed
# within each 'line' group), then drop the grouping.
pdac_end <- group_by(pdac_fold, line) %>% filter(passage_num==max(passage_num)) %>% ungroup() %>% print()
Create nested tibble with each cell line’s data separated out
# One row per cell line; each line's endpoint rows become a nested
# tibble stored in the list-column 'data'.
# NOTE(review): despite the name, this nests pdac_end (endpoint rows),
# not the full pdac_fold table.
pdac_fold_nest <- group_by(pdac_end, line) %>% nest() %>%
ungroup() %>% print()
# look inside the first one (Panc1)
print(pdac_fold_nest$data[[1]])
Analyze each dataset via ANOVA followed by TukeyHSD, using map function
# for simplicity, create function for ANOVA modeling each data set, and returning Tukey HSD results (cleaned up with "tidy")
# NOTE(review): tidy() is from broom, which library(tidyverse) does not
# attach -- confirm library(broom) is loaded elsewhere.
pd_aov <- function(df) {
# One-way ANOVA over the virus-x-treatment combinations, then all
# pairwise Tukey comparisons, returned as a tidy tibble.
aov(cuml_doublings ~ interaction(virus, treatment), data=df) %>% TukeyHSD() %>% tidy()
}
# call "pd_aov" on each cell line's dataset, using "map" function
# map() applies pd_aov to each nested tibble; the per-line results land
# in a new list-column 'aov_tukey'.
pdac_fold_nest <- mutate(pdac_fold_nest,
aov_tukey=map(data, pd_aov)) %>% print()
What do the results look like?
# let's look at the first one (Panc1)
# Extract the first cell line's Tukey table from the list-column.
pdac_fold_nest$aov_tukey[[1]]
What do the results look like?
# what are the p-values for Ptf1a + DOX vs. EGFP + DOX?
# unnest() expands the aov_tukey list-column back into regular rows,
# then we keep just the contrast of interest.
pdac_anova_results <- unnest(pdac_fold_nest, cols=aov_tukey) %>%
filter(contrast=='Ptf1a.dox-EGFP.dox') %>%
print()
p=0.986
p=0.0485
p=0.0021
p=0.0093
What do the results look like?
# correct for multiple comparisons (4 cell lines)
# Tukey's HSD already adjusts within each ANOVA; this Bonferroni step
# additionally corrects across the four cell-line tests.
mutate(pdac_anova_results, p.corrected=p.adjust(adj.p.value, method='bonferroni'))
p=1.0
p=0.194
p= 0.00839
p=0.0372
# what are the p-values for Ptf1a + DOX vs. EGFP + DOX?
# (repeat of the unnest-and-filter step, shown again for reference)
pdac_anova_results <- unnest(pdac_fold_nest, cols=aov_tukey) %>%
filter(contrast=='Ptf1a.dox-EGFP.dox') %>%
print()
What do the results look like?
# correct for multiple comparisons (4 cell lines)
# (repeat: Bonferroni across the four cell-line tests)
mutate(pdac_anova_results, p.corrected=p.adjust(adj.p.value, method='bonferroni'))
p=1.0
p=0.194
p= 0.00839
p=0.0372
# what are the p-values for Ptf1a + DOX vs. EGFP + DOX?
# (repeat of the unnest-and-filter step, shown again for reference)
pdac_anova_results <- unnest(pdac_fold_nest, cols=aov_tukey) %>%
filter(contrast=='Ptf1a.dox-EGFP.dox') %>%
print()
Is there a better statistical method to analyze data like this?