# Created by Jeff Kinne, jkinne@cs.indstate.edu
# Weather data is from https://mrcc.illinois.edu/CLIMATE/welcome.jsp

# Step 1...
# First, get your working directory set properly so you'll be able to import
# files without having to put the full path into your commands.

# On your CS computers, uncomment the following if you store your csv files in your home directory
#setwd("~/")

# On a windows computer, something like the following (depends where you keep your files)
#setwd("C:/Users/jkinne/Documents/cs459")

# On my macbook, I am using this - 
#setwd("~/Documents")

# When I am working on my office computer, I am using this - 
setwd("~/public_html/cs459-bd4isu-s2019/code")


# Step 2...
# Import the csv file into a data frame.
# header=TRUE because there is a header in the csv file.  na.strings so the numbers will properly be read as numbers 
#  (and M or T as missing data)
# Try using the following file as well - Champaign-Weather-Station-USC00118740-1950-2018.csv
data_indy <- read.csv("Indianapolis-Weather-Station-USW00093819-1950-2018.csv", 
                 header = TRUE, na.strings=c("M", "T"))
data_champ <- read.csv("Champaign-Weather-Station-USC00118740-1950-2018.csv", 
                  header = TRUE, na.strings=c("M", "T"))

processWeather <- function(data) {
  # Fix a few things with the data.  This is a process of looking at it to make sure 
  #  things are right, and fixing anything that is wrong.  Note - after working through issues
  #  with weather data downloaded for Terre Haute, I decided it wasn't good enough so I switched
  #  to data from the Indy airport.

  # Add a "Year" column so we can do averages and such based on that
  data$Year <- as.numeric(substr(data$Date, 1, 4))

  # Summary of all of the columns.  Check out the output
  #print(summary(data))

  # Check to make sure things look right...
  #View(data)
  return(data)
}
data_indy <- processWeather(data_indy)
data_champ <- processWeather(data_champ)

# Step 3...
# Some analysis and/or plotting

# Note, if we want the average PRCP for 1950, we could do ...
#  summary(data$PRCP[1:365])
#  mean(data$PRCP[1:365], na.rm=TRUE)

makeYearSummaries <- function(data) {
  # Data frame that will have one row for each year of our data
  # Note - unique does what it sounds like
  yearSummaries <- data.frame(year=unique(data$Year))
  
  # Add a column with a count of how many entries there are for each year (hopefully 365 or 366)
  yearSummaries$count <- table(as.factor(data$Year))
  
  #plot(yearSummaries$year, yearSummaries$count)
  
  # Add some other yearly data - average max, average min, average mean, total prcp, total snow
  yearSummaries$tmax <- tapply(data$TMAX, data$Year, mean, na.rm=TRUE)
  yearSummaries$tmin <- tapply(data$TMIN, data$Year, mean, na.rm=TRUE)
  yearSummaries$mean <- tapply(data$MEAN, data$Year, mean, na.rm=TRUE)
  
  yearSummaries$prcp <- tapply(data$PRCP, data$Year, sum, na.rm=TRUE)
  yearSummaries$snow <- tapply(data$SNOW, data$Year, sum, na.rm=TRUE)
  
  # Last, only keep years with a full set of data
  yearSummaries <- yearSummaries[yearSummaries$count >= 365,]
  # And take a look at the summary
  print(summary(yearSummaries))
  return(yearSummaries)
}
yearSummaries_indy <- makeYearSummaries(data_indy)
yearSummaries_champ <- makeYearSummaries(data_champ)

# Now for some plotting
# Plot of all of the years, with a point for the average max, average min, average mean
plot(yearSummaries_indy$year, yearSummaries_indy$tmax, # x coords, y coords
     col="red", ylim=c(30,80), pch=1, xlab="Year", ylab="Temperature (F)")

points(yearSummaries_indy$year, yearSummaries_indy$mean, col="green", pch=2)
points(yearSummaries_indy$year, yearSummaries_indy$tmin, col="blue", pch=3)
legend("top", legend=c("avg daily max temp for the year", "avg mean", "avg min"), 
       col=c("red","green","blue"), pch=c(1,2,3), cex=.7)

points(yearSummaries_champ$year, yearSummaries_champ$mean, col="orange", pch=4)

# And we also compute a linear fit for the max temperature.  Does it look like it's trending up?
fit_max <- lm(tmax ~ year, data=yearSummaries_indy)
#plot(fit_max)
# Use the fit to draw a line of what it would predict
predicted <- fit_max$coefficients[1] + fit_max$coefficients[2] * yearSummaries_indy$year
# would do the same thing, 
#  predicted <- predict(fit_max)
# And add that to the plot
points(yearSummaries_indy$year, predicted, col="black", type="l")


fit_min <- lm(tmin ~ year, data=yearSummaries_indy)
#plot(fit_max)
# Use the fit to draw a line of what it would predict
predicted_min <- fit_min$coefficients[1] + fit_min$coefficients[2] * yearSummaries_indy$year
# And add that to the plot
points(yearSummaries_indy$year, predicted_min, col="black", type="l")

# last 10 years from yearSummaries
n <- nrow(yearSummaries_indy)
print(yearSummaries_indy[(n-9):n,])
print(tail(yearSummaries_indy, n=10)) # another way

# first 10 years
print(yearSummaries_indy[1:10,])
print(head(yearSummaries_indy, n=10)) # another way


# differences between indy and champaign, out of the year summaries
differences <- data.frame(year=yearSummaries_indy$year, 
                          diff=(yearSummaries_indy$mean - yearSummaries_champ$mean))
plot(differences$year, differences$diff)

data_indy$MEAN_DIFF <- data_indy$MEAN - data_champ$MEAN
plot(data_indy$MEAN_DIFF)
hist(data_indy$MEAN_DIFF, breaks=100)