# Exploratory look at the expression table in GSE69618_data.csv:
# load it, log2-transform the measurement columns, then compare pairs of
# samples (replicates vs. different time points).
#
# Note that lines starting with # are comments.

# Uncomment the following line, and put the directory where you are keeping
# these files.
# setwd("~/Downloads")

# Read from the csv file and save into a variable named data.
data <- read.csv("GSE69618_data.csv")

# Print a summary - max, min, mean, etc.
summary(data)

# Number of rows and columns. Note that #rows in this file is the number of
# gene transcripts that were measured, and #cols is the number of different
# samples that were measured.
dim(data)

# View the data graphically, similar to how you would view it in Excel.
# Only attempted in an interactive session, since there is no viewer to open
# when the script is run in batch mode.
if (interactive()) {
  View(data)
}

# Which columns actually have numbers we're interested in (the first 2
# don't). Check the column count first: with fewer than 3 columns, 3:ncol(data)
# would count backwards (3, 2) instead of being empty.
stopifnot(ncol(data) >= 3L)
numberColumns <- 3:ncol(data)

# Note that the range on the data is huge - from 0 up to over 1 million.
# That is bad for plotting, and in general strange to think about.
# We want to take the log of the values to make the range smaller, but
# log2(0) is -Inf, so only the non-zero entries are transformed.
# Zeros are left as 0 (which is also the log2 of a value of 1).
data.log2 <- data
measurements <- data[, numberColumns, drop = FALSE]
# A missing value would make `!= 0` return NA, and an NA in the mask cannot
# be used to select cells for assignment, so missing cells are excluded
# explicitly (they simply stay NA in data.log2).
nonzero <- !is.na(measurements) & measurements != 0
data.log2[, numberColumns][nonzero] <- log2(measurements[nonzero])

# Summarise the log2 data; it should have a smaller range now.
summary(data.log2)

# And a box plot of the different samples; should look okay since we log2'd
# it.
boxplot(data.log2[, numberColumns])

# Graphically view the log2 data (interactive sessions only, as above).
if (interactive()) {
  View(data.log2)
}

# Take a summary of two of the samples that are replicates of the same
# treatment, in particular the two replicates of wild-type at day 0.
summary(data.log2$H1.WT.day0.Rep2 - data.log2$H1.WT.day0.Rep1)

# Another view: histogram of the same difference.
hist(data.log2$H1.WT.day0.Rep2 - data.log2$H1.WT.day0.Rep1)

# It looks like a difference of about 2 between the replicates could be just
# due to random chance.

# Now compare two samples that are not from the same treatment, in
# particular a wild-type day 2 sample versus a wild-type day 0 sample.
summary(data.log2$H1.WT.day2.Rep2 - data.log2$H1.WT.day0.Rep2)
hist(data.log2$H1.WT.day2.Rep2 - data.log2$H1.WT.day0.Rep2)