# Goal of this file # - take original csv file and clean it up so it is ready for further processing # - make values log2 instead of counts # - column names something easier to read # - just keep the columns we're interested in # - add in gene symbol as identifier for the rows # And output GSE69618_data.cleaned.csv # uncomment the following line, and put the directory where you are keeping these files setwd("~/Documents/bd4isu") # read from csv file and save into variable named data data <- read.csv("GSE69618_data.csv") # print a summary - max, min, mean, etc. #summary(data) # Note - refseqid is read as factor, store as character instead class(data$RefSeqID) data$RefSeqID <- as.character(data$RefSeqID) # number of rows and columns. #dim(data) # view the data graphically, similar to how you would view it in excel #View(data) # which columns actually have numbers we're interested in (since the first 2 don't) numberColumns <- 3:ncol(data) # note that the range on the data is huge - from 0 up to over 1 million # that is bad for plotting, and in general strange to think about # we want to take log of the values to make the range smaller, but we # also need to not have 0's in the data if we're taking log's # the following command makes log2 data for us data.log2 <- data nonzero <- data[,numberColumns] != 0 data.log2[,numberColumns][nonzero] <- log2(data[,numberColumns][nonzero]) # summary the log2 data, should be smaller range now #summary(data.log2) # and a box plot of the different samples, should look okay since we log2'd it #boxplot(data.log2[,numberColumns]) # graphically view the log2 data #View(data.log2) # let's take out just the columns of interest - id, WT samples WT.cols <- grep("WT", colnames(data.log2)) id.col <- grep("RefSeqID", colnames(data.log2)) data.log2.WT <- data.log2[ ,c(id.col, WT.cols)] #View(data.log2.WT) # and let's get rid of the "H1." from the column names colnames(data.log2.WT) <- gsub("H1.","",colnames(data.log2.WT)) # also, the paper said that wt.day10.rep1 and wt.day10.rep2 were not good, so remove those colnames(data.log2.WT) data.log2.WT <- data.log2.WT[,-c(grep("WT.day10.Rep1",colnames(data.log2.WT)), grep("WT.day10.Rep2",colnames(data.log2.WT)))] # check to make sure it looks good now View(data.log2.WT) summary(data.log2.WT) # check that all id's are unique length(data.log2.WT$RefSeqID) == length(unique(data.log2.WT$RefSeqID)) # last thing, find gene symbols for the refseq id's. we'll use a library... # try to install the library #install.packages("org.Hs.eg.db") # it gave some error, so maybe it's a bioconductor package. search the web for org.Hs.eg.db R #if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("org.Hs.eg.db") library(org.Hs.eg.db) View(columns(org.Hs.eg.db)) id.annotation <- select(org.Hs.eg.db, keys=data.log2.WT$RefSeqID, columns=c("REFSEQ","SYMBOL","GENENAME"), keytype=c("REFSEQ") ) #View(id.annotation) #dim(id.annotation) #dim(data.log2.WT) data.log2.WT.annotated <- cbind(id.annotation[,1:3], data.log2.WT[,-1]) # now save it write.csv(data.log2.WT.annotated, "GSE69618_data.cleaned.csv", row.names=FALSE)