# Note that lines starting with # are comments

# uncomment the following line, and put the directory where you are keeping these files
# setwd("~/Downloads")

# Read the expression table from the CSV file into a data frame named `data`.
# The file name is resolved relative to the current working directory.
data <- read.csv("GSE69618_data.csv")

# Print a per-column summary - max, min, mean, etc.
# print() is explicit so the output also appears when the script is run with
# source(), which does not auto-print top-level results by default.
print(summary(data))

# Number of rows and columns.  Note that #rows in this file is the number of
#  gene transcripts that were measured, and #cols is the number of different
#  samples that were measured.
print(dim(data))

# View the data graphically, similar to how you would view it in Excel.
# Only attempted in an interactive session, because a data viewer may not be
# available when the script runs in batch mode (e.g. via Rscript).
if (interactive()) {
  View(data)
}

# Which columns actually have numbers we're interested in: everything from
#  the third column onward (the first 2 don't hold those numbers).
# Fail early with a clear message, because 3:ncol(data) would count
#  backwards (3, 2) if the table had fewer than 3 columns.
if (ncol(data) < 3) {
  stop("expected at least 3 columns in 'data', found ", ncol(data),
       call. = FALSE)
}
numberColumns <- 3:ncol(data)

# Note that the range of the data is huge - from 0 up to over 1 million.
#  That is bad for plotting, and in general strange to think about.
# We want to take the log of the values to make the range smaller, but
#  log2(0) is -Inf, so zeros must be left out of the transformation.
# The commands below build data.log2: a copy of data in which every nonzero
#  measurement is replaced by its log2 and every zero is left as 0.
# Keep in mind that log2(1) is also 0, so a raw value of 0 and a raw value
#  of 1 both show up as 0 in data.log2.
data.log2 <- data
measured <- data[, numberColumns]
# Missing values are excluded explicitly: NA != 0 evaluates to NA, and an NA
#  in the mask would make the subset-assignment below fail.  Missing
#  measurements therefore simply stay NA in data.log2.
nonzero <- !is.na(measured) & measured != 0
data.log2[, numberColumns][nonzero] <- log2(measured[nonzero])

# Summarize the log2 data; the range should be much smaller now.
# print() is explicit so the output also appears when the script is run with
# source(), which does not auto-print top-level results by default.
print(summary(data.log2))

# Box plot of the different samples (one box per measurement column); it
# should look okay since we log2'd the data.
boxplot(data.log2[, numberColumns])

# Graphically view the log2 data.
# Only attempted in an interactive session, because a data viewer may not be
# available when the script runs in batch mode (e.g. via Rscript).
if (interactive()) {
  View(data.log2)
}

# Compare two samples that are replicates of the same treatment, in
#  particular the two replicates of wild-type at day 0.  The per-transcript
#  difference of the log2 values is computed once and reused below.
day0RepDiff <- data.log2$H1.WT.day0.Rep2 - data.log2$H1.WT.day0.Rep1

# Numeric summary of the differences.  print() is explicit so the output
#  also appears when the script is run with source().
print(summary(day0RepDiff))
# Another view, histogram (explicit title/axis label instead of the long
#  default built from the expression text).
hist(day0RepDiff,
     main = "WT day0: Rep2 - Rep1 (log2)",
     xlab = "log2 difference")
# It looks like a difference of about 2 between the replicates could be just
#  due to random chance.

# Now compare two samples that are not from the same treatment, in
#  particular a wild-type day 2 sample versus a wild-type day 0 sample.
day2VsDay0Diff <- data.log2$H1.WT.day2.Rep2 - data.log2$H1.WT.day0.Rep2
print(summary(day2VsDay0Diff))
hist(day2VsDay0Diff,
     main = "WT Rep2: day2 - day0 (log2)",
     xlab = "log2 difference")