# Jeff Kinne's version of the R activity on May 29 as part of Ewy Mathe's workshop
#
# The setup - we imagine that we have two groups of individuals and measure
#             metabolites in all of them, and apply t test's to see which 
#             metabolites have a "statistically significant" difference between
#             the two groups.  Let's see...

# two groups of this size
groupSize <- 50
mygroup <- c(rep("Grp1", groupSize), rep("Grp2",groupSize))

# and this many metabolites
n_metab <- 100

# we generate fake random values for each metabolite for each individual
# and let them each range from 1 to 25000
mydata <- sample(1:25000, n_metab * (2*groupSize), replace=TRUE)
mymat <- matrix(mydata, nrow=n_metab, ncol=(groupSize*2)) # and put it into a matrix

# do a t test for each metabolite (each row) and save the p values
myp <- c()
for (i in 1:n_metab) {
  onemetab <- mymat[i,] #i'th row, i'th metabolite
  # t test for the two groups
  result <- t.test(onemetab[mygroup == "Grp1"], onemetab[mygroup == "Grp2"])
  myp <- c(myp, result$p.value) #and save the p value
}

# instead of using the for loop, could also do it with apply
myp <- sapply(1:n_metab, function(i) {
  onemetab <- mymat[i,]
  result <- t.test(onemetab[mygroup == "Grp1"], onemetab[mygroup == "Grp2"])
  result$p.value

})

# plot the histogram.  Should look "kind of" flat if the two 
# groups are not really different.  If there is really a difference
# between the two groups expect to see a peak near p=0.
hist(myp)

# and check with adjusted p values (correction for doing multiple comparisons)
myp.adjusted <- p.adjust(myp, method="fdr")
hist(myp.adjusted)

# plot both the myp and myp.adjusted together
par(mfrow=c(1,2))
hist(myp, ylim=c(0,20))
hist(myp.adjusted, ylim=c(0,20))

# And do again but now make some of the group2 metabolites different so 
# we should see those as significant
sigRows <- 1:10
mymat.sig <- mymat
mymat.sig[sigRows,mygroup == "Grp2"] <- 10 * mymat[sigRows,mygroup == "Grp2"]
myp.sig <- sapply(1:n_metab, function(i) {
  onemetab <- mymat.sig[i,]
  result <- t.test(onemetab[mygroup == "Grp1"], onemetab[mygroup == "Grp2"])
  result$p.value
  
})
hist(myp.sig, ylim=c(0,20))
myp.sig.adjusted <- p.adjust(myp.sig, method="fdr")
hist(myp.sig.adjusted, ylim=c(0,20))
which(myp.sig.adjusted < 0.05)


(t-162.1152)/162.1152 * 1e6 = 10 * 162.1152 / 1e6