Note - Jeff Kinne’s version of this file, with some parts complete.
Users can load an Rdata frame which contains two R objects:
In R or Rstudio, load your data:
setwd("~jkinne/public_html/bd4isu-summer-2019/code/")
load("Metabolon_NCI_60.Rdata")
head(metab[,1:4])
## (p-Hydroxyphenyl)lactic acid 3-phospho-d-glycerate
## META-7889 0.072 0.927
## META-7890 0.072 1.389
## META-7891 0.072 1.695
## META-7892 0.483 1.322
## META-7893 1.000 1.883
## META-7894 0.507 0.118
## 5,6-Dimethylbenzimidazole 5-oxoproline
## META-7889 0.441 1.002
## META-7890 0.542 1.436
## META-7891 0.685 1.249
## META-7892 0.064 0.392
## META-7893 0.064 0.571
## META-7894 0.064 0.476
head(sampannot)
## name cancertype cell_line
## 1 META-7889 Breast BT-549
## 2 META-7890 Breast BT-549
## 3 META-7891 Breast BT-549
## 4 META-7892 Breast HS 578T
## 5 META-7893 Breast HS 578T
## 6 META-7894 Breast HS 578T
QUESTIONS: Using functions such as “dim”, “nrow”,“ncol”, “colnames”, “rownames”, answer the following questions:
# 1.
num_metabs <- ncol(metab)
print(paste("Number of metabolites:", num_metabs))
## [1] "Number of metabolites: 352"
#2.
num_samples <- nrow(metab)
print(paste("Number of samples:", num_samples))
## [1] "Number of samples: 39"
#3.
num_cell_lines <- length(unique(sampannot$cell_line))
print(paste("Number of cell lines:", num_cell_lines))
## [1] "Number of cell lines: 13"
#4.
num_cancer_types <- length(unique(sampannot$cancertype))
print(paste("Number of cancer types:", num_cancer_types))
## [1] "Number of cancer types: 2"
#5.
t <- table(sampannot$cell_line)
print(paste("Number of replicates per cell ranges in:", range(t)))
## [1] "Number of replicates per cell ranges in: 3"
## [2] "Number of replicates per cell ranges in: 3"
** Assess the presence of missing/imputed values. In this dataset, missing values are imputed by the minimum. One quick way to check for missing values, is to count the number of times the minumum abundance value appears for a given metabolite. One could also calculate the standard deviation and perhaps filter out metabolites with low variation. Let’s do the latter here. (If you have time on your hands, do both!) **
sds=as.numeric(apply(metab,2,sd))
# Now plot the distribution of standard deviations:
hist(sds,breaks=1000,main="Distribution of metabolite abundance sds")
Do you notice any outliers? Let’s zoom in a little:
hist(sds,breaks=1000,main="Distribution of metabolite abundance sds",xlim=c(0,10))
Do you notice more outliers? What metabolites could you remove from further analysis? How many are there? What metabolite has a very high standard deviation?
bads=c(which(sds==0),which(sds>15))
length(bads)
## [1] 34
hist(sds[-bads])
colnames(metab)[which(sds>=10)]
## [1] "gluconic acid" "sorbitol"
# Save original and filter out metabolites
metab_o=metab
metab=metab_o[,-bads]
Look at the distsribution of metabolite abundances across each sample using a simple boxplot:
boxplot(as.data.frame(t(metab)),pch=19)
Does the data need to be transformed? Try this:
mycol=c(rep("slategrey",3),rep("lightsteelblue1",3),rep("lightsteelblue3",3),
rep("seagreen4",3),rep("lightskyblue",3),rep("grey",3),rep("skyblue3",3),
rep("lightcoral",3),rep("indianred4",3),rep("lightsalmon3",3),rep("salmon",3),
rep("sandybrown",3),rep("tomato",3))
boxplot(as.data.frame(t(log2(metab))),pch=19,
main="Distribution of Metabolites Per Sample",names=F,col=mycol,
ylab="log2(Normalized metabolite abundances)")
text(x = seq_along(rownames(metab)), y=par("usr")[3] - 0.5, srt = 45, adj = 1,
labels = rownames(metab), xpd = TRUE, cex=0.6)
legend(3,8,legend=c("Blues: breast cancer","Reds: leukemias"))
Apply an unsupervised clustering method to see how samples cluster together.
For example, using PCA:
library(ggplot2) # I assume ggplot2 is installed, if not, see above
mypca=prcomp(log2(metab),center=F,scale=T)
percvar=round((mypca$sdev)^2 / sum(mypca$sdev^2)*100,2)
# Check that order of mypca matrix is same as sample matrix:
all.equal(rownames(mypca$x),rownames(metab))
## [1] TRUE
mydf=data.frame(PC1=mypca$x[,"PC1"],PC2=mypca$x[,"PC2"],
Cell_Line=sampannot$cell_line, Cancer_Type=sampannot$cancertype)
ggplot(mydf,aes(PC1,PC2,color=Cell_Line,shape=Cancer_Type)) +
geom_point(size=7) +
scale_color_manual(values=c("slategrey","lightsteelblue1","lightsteelblue3",
"seagreen4","lightskyblue","grey","skyblue3","lightcoral","indianred4",
"lightsalmon3","salmon","sandybrown","tomato")) +
xlab(paste0("PC1: ",percvar[1],"% variance")) +
ylab(paste0("PC2: ",percvar[2],"% variance")) +
theme_bw() +
ggtitle("Metabolomics PCA Plot \n log2(normalized values)") +
theme(axis.line = element_line(colour = "black"),
axis.title=element_text(size=15,face="bold"),
plot.title=element_text(size=20,face="bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
legend.key=element_blank())
For example, using a heatmap:
library(RColorBrewer)
library(pheatmap)
mydists=as.dist(1-cor(log2(t(metab))))
mymat=as.matrix(mydists)
rownames(mymat)=sampannot$cell_line
colnames(mymat)=sampannot$cancertype
mycols=colorRampPalette(brewer.pal(5,"PiYG")) (255)
pheatmap(mymat,clustering_distance_rows=mydists, clustering_distance_cols=mydists,col=mycols,
labels_col=colnames(mymat),labels_row=rownames(mymat),
main="Metabolomics clustering\ndistance=1-correlation",)
Do scatter plots to assess correlation amon triplicate samples:
# Example for BT-549 and MCF7:
par(mfrow=c(1,2))
pairs(log2(t(metab[which(sampannot$cell_line=="BT-549"),])),pch=19,main="BT-549")
pairs(log2(t(metab[which(sampannot$cell_line=="MCF7"),])),pch=19,main="MCF7")
Compare thee correlations with that of cell lines that did not look as tightly correlated in the PCA plot.
QUESTIONS: Using the PCA, heatmap, and replicate plots, answer the following questions: 1. Do some samples seem to have higher variability between replicates? 2. Do samples cluster by cancer type? 3. Do samples cluster by cell line?
First, do a t-test to compare breast and leukemia groups:
pval=c()
for (i in 1:ncol(metab)) {
temp=t.test(as.numeric(log2(metab[which(sampannot$cancertype=="Leukemia"),i])),
log2(as.numeric(metab[which(sampannot$cancertype=="Breast"),i])))
pval=c(pval,temp$p.value)
}
One quick way to check if you have any significance, is to look at the distribution of the p-values.
hist(pval,breaks=100)
abline(v=0.01,col='red')
Can you tell whether there will be significant altered metabolites? How?
# Adjust p-values
pval.adj=p.adjust(pval,method='fdr')
# Calculate fold changes
fc=c()
for (i in 1:ncol(metab)) {
mean1=mean(as.numeric(metab[which(sampannot$cancertype=="Leukemia"),i]))
mean2=mean(as.numeric(metab[which(sampannot$cancertype=="Breast"),i]))
fc=c(fc,mean1/mean2)
}
# Draw a volcano plot
plot(log2(fc),-log10(pval.adj),pch=19,xlab="log2(FC) - Leukemia/Breast",
main="Volcano Plot\nComparing Leukemia and Breast")
# Draw lines to show adjusted p-value cutoff of 0.05, and fold changes < 0.5 or > 1.5
abline(h=-log10(0.05),col='red')
abline(v=c(log2(0.5),log2(1.5)),col='green')
# Color significan metabolites
mysigs=intersect(c(which(fc>1.5),which(fc<0.5)),which(pval.adj<0.05))
points(log2(fc[mysigs]),-log10(pval.adj[mysigs]),col="salmon",pch=19)
QUESTIONS: