Proteomics assay data
Proteomics assay data
Assays for high-throughput proteomics have recently become available, allowing the screening of thousands of proteins in large number of samples. Two technologies are currently widely encountered: aptamers and antibodies (ref). Both provide data in a tabular format with the protein abundance estimates for every sample.
Libraries
We will need the following libraries. If they are not availble on your
system, use the ìnstall_packages
command to install them.
library(janitor) # A package to clean dirty data
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyr) # A package to tidy data
library(dplyr) # A package to manipulate data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(parallel) # A package to run tasks in parallel
library(glue) # A package to glue variables in strings
library(stringr) # A package to manipulate strings
library(ggplot2) # A package to plot data
library(scico) # A package of scientific color palettes
library(conflicted) # A package to manage namespace conflicts
# Resolve conflicting function names
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package
# Set the theme for plots
theme_set(theme_bw(base_size = 13))
Load data
:pencil2: Open the data set available at these links: - link1 - link2 - link3
files_folder <- "/home/marc/Projects/tutorials/data"
protein_values <- read.table(
file = file.path(files_folder, "soma_example.gz"),
header = T,
sep = "\t",
stringsAsFactors = F
) %>%
clean_names() # This function makes the formatting of the column names uniform, note that it transforms the identifiers of the proteins
# write.table(
# x = protein_values,
# col.names = T,
# row.names = F,
# sep = "\t",
# file = file.path(files_folder, "soma_example.gz")
# )
samples_details <- read.table(
file = file.path(files_folder, "soma_example_2.gz"),
header = T,
sep = "\t",
stringsAsFactors = F
) %>%
clean_names()
protein_details <- read.table(
file = file.path(files_folder, "soma_example_3.gz"),
header = T,
sep = "\t",
stringsAsFactors = F,
comment.char = ""
) %>%
clean_names() %>%
mutate(
protein_id = paste0("x", str_replace_all(string = seq_id, pattern = "-", replacement = "_")) # Note that we create a column to match identifiers with the column names of the protein_values
)
:speech_balloon: What is shown in row/columns in each file? How many proteins were measured in how many samples?
Abundance estimates standardization
:pencil2: Select a sample and make a histogram of the protein abundance estimates.
selected_sample <- sample(samples_details$id, 1)
selected_sample_values <- protein_values %>%
filter(
sample_id == selected_sample
) %>%
select(
starts_with("x")
) %>%
pivot_longer(
cols = starts_with("x")
)
ggplot() +
geom_histogram(
data = selected_sample_values,
mapping = aes(
x = value
),
bins = 50,
col = "darkblue",
fill = "darkblue",
alpha = 0.4
) +
scale_x_continuous(
name = glue("Abundance estimate for sample {selected_sample}")
) +
scale_y_continuous(
name = glue("Number of proteins"),
expand = expansion(
mult = c(0, 0.05)
)
)
:speech_balloon: Is this what you would expect when looking at the proteome of a sample?
:pencil2: Select a protein and look at the distribution of abundance estimates.
selected_protein <- sample(protein_details$protein_id, 1)
ggplot() +
geom_histogram(
data = protein_values,
mapping = aes(
x = !!sym(selected_protein)
),
bins = 50,
col = "darkgreen",
fill = "darkgreen",
alpha = 0.4
) +
scale_x_continuous(
name = glue("Abundance estimate of {selected_protein}")
) +
scale_y_continuous(
name = glue("Number of samples"),
expand = expansion(
mult = c(0, 0.05)
)
)
:speech_balloon: How does the distribution looks like? How will it influence downstream analyses?
:pencil2: Log, center, and scale the abundance of each protein.
for (protein in protein_details$protein_id) {
log_values <- log10(protein_values[[protein]][protein_values$sample_id %in% samples_details$id & !is.na(protein_values[[protein]]) & protein_values[[protein]] > 0])
if (length(log_values) < 10) {
stop(glue("Too few values for protein {protein}"))
}
mean_log <- mean(log_values, na.rm = T)
sd_log <- sd(log_values, na.rm = T)
new_column <- paste0(protein, "_log_std")
protein_values[[new_column]] <- ifelse(protein_values$sample_id %in% samples_details$id & !is.na(protein_values[[protein]]) & protein_values[[protein]] > 0, (log10(protein_values[[protein]]) - mean_log) / sd_log, NA)
}
:pencil2: Plot again the distribution for the selected protein.
selected_protein_log_std <- paste0(selected_protein, "_log_std")
ggplot() +
geom_histogram(
data = protein_values,
mapping = aes(
x = !!sym(selected_protein_log_std)
),
bins = 50,
col = "darkgreen",
fill = "darkgreen",
alpha = 0.4
) +
scale_x_continuous(
name = glue("Standardized abundance estimate of {selected_protein}")
) +
scale_y_continuous(
name = glue("Number of samples"),
expand = expansion(
mult = c(0, 0.05)
)
)
## Warning: Removed 3 rows containing non-finite values (stat_bin).
:pencil2: Plot the standardized vs non-standardized values.
ggplot() +
geom_point(
data = protein_values,
mapping = aes(
x = !!sym(selected_protein),
y = !!sym(selected_protein_log_std)
),
col = "darkgreen",
alpha = 0.1
) +
scale_x_continuous(
name = glue("Abundance estimate of {selected_protein}")
) +
scale_y_continuous(
name = glue("Standardized abundance estimate of {selected_protein}")
)
## Warning: Removed 3 rows containing missing values (geom_point).
:pencil2: Plot again the distribution for the selected sample.
selected_sample_std_values <- protein_values %>%
filter(
sample_id == selected_sample
) %>%
select(
ends_with("_log_std")
) %>%
pivot_longer(
cols = ends_with("_log_std")
)
ggplot() +
geom_histogram(
data = selected_sample_std_values,
mapping = aes(
x = value
),
bins = 50,
col = "darkblue",
fill = "darkblue",
alpha = 0.4
) +
scale_x_continuous(
name = glue("Standardized abundance estimate for sample {selected_sample}")
) +
scale_y_continuous(
name = glue("Number of proteins"),
expand = expansion(
mult = c(0, 0.05)
)
)
:pencil2: Plot the standardized vs non-standardized values.
plot_values <- merge(selected_sample_values, selected_sample_std_values %>% rename(std_value = value) %>% mutate(name = str_remove(name, "_log_std")), by = "name")
ggplot() +
geom_point(
data = plot_values,
mapping = aes(
x = value,
y = std_value
),
col = "darkblue",
alpha = 0.1
) +
scale_x_continuous(
name = glue("Protein abundance estimate for {selected_sample}")
) +
scale_y_continuous(
name = glue("Standardized abundance estimate for {selected_sample}")
)
:speech_balloon: How do you interpret this plot?
Principal component analysis
:pencil2: Run a PCA on the standardized values.
pca_input <- protein_values %>%
filter(
sample_id %in% samples_details$id
) %>%
select(
sample_id, ends_with("_log_std")
)
pca_samples <- pca_input$sample_id
pca_input <- t(pca_input[, -1])
colnames(pca_input) <- pca_samples
pca <- prcomp(pca_input)
The quantitative information is now projected in a base of dimensions ordered by decreasing amount of variance, the principal components. We can plot the dimensions capturing most of the variance from the sdev attribute of the pca result.
:pencil2: Plot the variance explained by the first ten components
eigen_values <- pca$sdev ^ 2
total_eigen_value <- sum(eigen_values)
pc_contribution <- 100 * eigen_values / total_eigen_value
ggplot() +
geom_col(
mapping = aes(
x = 1:10,
y = pc_contribution[1:10]
)
) +
scale_x_continuous(
name = "PC"
) +
scale_y_continuous(
name = glue("Proportion of Variance [%]"),
expand = expansion(
mult = c(0, 0.05)
)
)
:speech_balloon: How do you interpret this plot?
:pencil2: Plot the samples in the plane made by PC1 and PC2
pc1 <- pca$rotation[,1]
pc2 <- pca$rotation[,2]
names <- dimnames(pca$rotation)[[1]]
plot_data <- data.frame(
pc1 = pc1,
pc2 = pc2,
id = names
)
contribution1 <- round(100 * eigen_values[1] / total_eigen_value)
contribution2 <- round(100 * eigen_values[2] / total_eigen_value)
ggplot() +
geom_point(
data = plot_data,
mapping = aes(
x = pc1,
y = pc2
),
alpha = 0.2
) +
xlab(glue("PC1 [{contribution1}%]")) +
ylab(glue("PC2 [{contribution2}%]"))
:pencil2: Color the points using the sample details.
plot_data <- plot_data %>%
left_join(
samples_details %>%
mutate(
id = as.character(id)
),
by = "id"
)
ggplot() +
geom_point(
data = plot_data,
mapping = aes(
x = pc1,
y = pc2,
col = age
),
alpha = 0.2
) +
xlab(glue("PC1 [{contribution1}%]")) +
ylab(glue("PC2 [{contribution2}%]")) +
scale_color_scico(
name = "Age",
palette = "berlin"
)
ggplot() +
geom_point(
data = plot_data,
mapping = aes(
x = pc1,
y = pc2,
col = bmi
),
alpha = 0.2
) +
xlab(glue("PC1 [{contribution1}%]")) +
ylab(glue("PC2 [{contribution2}%]")) +
scale_color_scico(
name = "BMI",
palette = "berlin"
)
ggplot() +
geom_point(
data = plot_data,
mapping = aes(
x = pc1,
y = pc2,
col = sex
),
alpha = 0.2
) +
xlab(glue("PC1 [{contribution1}%]")) +
ylab(glue("PC2 [{contribution2}%]")) +
scale_color_scico(
name = "Sex",
palette = "berlin"
)
Association analysis
:pencil2: Compute the association of the selected protein with one of the disease classes
class <- "Class_1"
samples_tested <- samples_details %>%
filter(
group == "Healthy" | group == class
) %>%
mutate(
group_level = ifelse(group == "Healthy", 0, 1)
)
current_protein_values <- protein_values %>%
select(
id = sample_id,
abundance = !!sym(selected_protein_log_std)
)
association_data <- samples_tested %>%
left_join(
current_protein_values,
by = "id"
) %>%
filter(
!is.na(abundance)
)
lm_results <- lm(
formula = "abundance ~ group_level + age + bmi + sex",
data = association_data
)
summary(lm_results)
##
## Call:
## lm(formula = "abundance ~ group_level + age + bmi + sex", data = association_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3736 -0.3283 -0.1226 0.2698 6.4926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.153188 0.320226 -0.478 0.6327
## group_level 0.093163 0.097110 0.959 0.3381
## age 0.009273 0.004196 2.210 0.0278 *
## bmi -0.010061 0.010184 -0.988 0.3240
## sex -0.039089 0.084630 -0.462 0.6445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7013 on 309 degrees of freedom
## Multiple R-squared: 0.02357, Adjusted R-squared: 0.01093
## F-statistic: 1.865 on 4 and 309 DF, p-value: 0.1165
:pencil2: Plot the association with all classes.
classes <- c("Class_1", "Class_2a", "Class_2b", "Class_L")
plot_data <- NULL
for (class in classes) {
samples_tested <- samples_details %>%
filter(
group == "Healthy" | group == class
) %>%
mutate(
group_level = ifelse(group == "Healthy", 0, 1)
)
current_protein_values <- protein_values %>%
select(
id = sample_id,
abundance = !!sym(selected_protein_log_std)
)
association_data <- samples_tested %>%
left_join(
current_protein_values,
by = "id"
) %>%
filter(
!is.na(abundance)
)
association_data$class <- class
plot_data <- rbind(plot_data, association_data)
}
ggplot() +
geom_hline(
yintercept = 0
) +
geom_point(
data = plot_data,
mapping = aes(
x = group_level,
y = abundance,
col = class
),
alpha = 0.2
) +
geom_smooth(
data = plot_data,
mapping = aes(
x = group_level,
y = abundance,
col = class
),
formula = "y ~ x",
method = "lm"
) +
scale_x_continuous(
name = "Case or Control"
) +
scale_y_continuous(
name = "Protein abundance"
) +
facet_grid(
. ~ class
) +
theme(
legend.position = "none"
)
:pencil2: For every protein, run a linear model of every patient class vs healthy taking age, BMI, and sex as covariate.
classes <- c("Class_1", "Class_2a", "Class_2b", "Class_L")
n <- length(classes) * nrow(protein_details)
results <- data.frame(
class = character(n),
protein_id = character(n),
uniprot_id = character(n),
protein_name = character(n),
beta = numeric(n),
se = numeric(n),
p = numeric(n),
n = numeric(n),
stringsAsFactors = F
)
i <- 1
for (class in classes) {
print(glue("{Sys.time()} - Processing {class}"))
samples_tested <- samples_details %>%
filter(
group == "Healthy" | group == class
) %>%
mutate(
group_level = ifelse(group == "Healthy", 0, 1)
)
for (protein_index in 1:nrow(protein_details)) {
protein_id <- protein_details$protein_id[protein_index]
uniprot_id <- protein_details$uni_prot[protein_index]
protein_name <- protein_details$target_full_name[protein_index]
col_name <- paste0(protein_id, "_log_std")
current_protein_values <- protein_values %>%
select(
id = sample_id,
abundance = !!sym(col_name)
)
association_data <- samples_tested %>%
left_join(
current_protein_values,
by = "id"
) %>%
filter(
!is.na(abundance)
)
lm_results <- lm(
formula = "abundance ~ group_level + age + bmi + sex",
data = association_data
)
lm_summary <- summary(lm_results)
results$class[i] <- class
results$protein_id[i] <- protein_id
results$uniprot_id[i] <- uniprot_id
results$protein_name[i] <- protein_name
results$beta[i] <- lm_summary$coefficients["group_level", 1]
results$se[i] <- lm_summary$coefficients["group_level", 2]
results$p[i] <- lm_summary$coefficients["group_level", 4]
results$n[i] <- nrow(association_data)
i <- i + 1
}
}
## 2022-06-20 17:42:34 - Processing Class_1
## 2022-06-20 17:45:38 - Processing Class_2a
## 2022-06-20 17:48:39 - Processing Class_2b
## 2022-06-20 17:51:34 - Processing Class_L
:pencil2: Plot the significance against the effect size for each comparison.
ggplot() +
geom_point(
data = results,
mapping = aes(
x = beta,
y = -log10(p),
col = class
),
alpha = 0.2
) +
geom_hline(
yintercept = 2,
col = "darkgreen",
linetype = "dashed"
) +
scale_x_continuous(
name = "Effect size"
) +
scale_y_continuous(
name = "p-value [-log10]"
) +
facet_grid(
. ~ class
) +
theme(
legend.position = "none"
)
:pencil2: Plot the observed against expected p-value for each class.
qq_data <- results %>%
group_by(
class
) %>%
mutate(
logP = -log10(p)
) %>%
arrange(
logP
) %>%
mutate(
expectedLogP = sort(-log10(ppoints(n = n())))
) %>%
ungroup()
ggplot() +
geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
geom_point(
data = qq_data,
mapping = aes(
x = expectedLogP,
y = logP,
col = class
),
alpha = 0.2
) +
geom_hline(
yintercept = 2,
col = "darkgreen",
linetype = "dashed"
) +
scale_x_continuous(
name = "Effect size"
) +
scale_y_continuous(
name = "p-value [-log10]"
) +
facet_grid(
. ~ class
) +
theme(
legend.position = "none"
)