# Goal of this file 
# - take original csv file and clean it up so it is ready for further processing
# - make values log2 instead of counts
# - column names something easier to read
# - just keep the columns we're interested in
# - add in gene symbol as identifier for the rows
# And output GSE69618_data.cleaned.csv

# uncomment the following line, and put the directory where you are keeping these files
setwd("~/Documents/bd4isu")

# read from csv file and save into variable named data
data <- read.csv("GSE69618_data.csv")

# print a summary - max, min, mean, etc.
#summary(data)

# Note - refseqid is read as factor, store as character instead
class(data$RefSeqID)
data$RefSeqID <- as.character(data$RefSeqID)

# number of rows and columns.  
#dim(data)

# view the data graphically, similar to how you would view it in excel
#View(data)

# which columns actually have numbers we're interested in (since the first 2 don't)
numberColumns <- 3:ncol(data)

# note that the range on the data is huge - from 0 up to over 1 million
#  that is bad for plotting, and in general strange to think about
# we want to take log of the values to make the range smaller, but we 
#  also need to not have 0's in the data if we're taking log's
# the following command makes log2 data for us
data.log2 <- data
nonzero <- data[,numberColumns] != 0
data.log2[,numberColumns][nonzero] <- log2(data[,numberColumns][nonzero])

# summary the log2 data, should be smaller range now
#summary(data.log2)

# and a box plot of the different samples, should look okay since we log2'd it
#boxplot(data.log2[,numberColumns])

# graphically view the log2 data
#View(data.log2)

# let's take out just the columns of interest - id, WT samples
WT.cols <- grep("WT", colnames(data.log2))
id.col <- grep("RefSeqID", colnames(data.log2))
data.log2.WT <- data.log2[ ,c(id.col, WT.cols)]
#View(data.log2.WT)

# and let's get rid of the "H1." from the column names
colnames(data.log2.WT) <- gsub("H1.","",colnames(data.log2.WT))

# also, the paper said that wt.day10.rep1 and wt.day10.rep2 were not good, so remove those
colnames(data.log2.WT)
data.log2.WT <- data.log2.WT[,-c(grep("WT.day10.Rep1",colnames(data.log2.WT)),
                                 grep("WT.day10.Rep2",colnames(data.log2.WT)))]

# check to make sure it looks good now
View(data.log2.WT)
summary(data.log2.WT)

# check that all id's are unique
length(data.log2.WT$RefSeqID) == length(unique(data.log2.WT$RefSeqID))

# last thing, find gene symbols for the refseq id's.  we'll use a library...

# try to install the library
#install.packages("org.Hs.eg.db")
# it gave some error, so maybe it's a bioconductor package.  search the web for org.Hs.eg.db R

#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
#BiocManager::install("org.Hs.eg.db")

library(org.Hs.eg.db)
View(columns(org.Hs.eg.db))

id.annotation <- select(org.Hs.eg.db, 
                        keys=data.log2.WT$RefSeqID,
                        columns=c("REFSEQ","SYMBOL","GENENAME"), 
                        keytype=c("REFSEQ")
                        )
#View(id.annotation)
#dim(id.annotation)
#dim(data.log2.WT)
data.log2.WT.annotated <- cbind(id.annotation[,1:3], data.log2.WT[,-1])


# now save it
write.csv(data.log2.WT.annotated, "GSE69618_data.cleaned.csv", row.names=FALSE)