# Created by Jeff Kinne, jkinne@cs.indstate.edu
# Weather data is from https://mrcc.illinois.edu/CLIMATE/welcome.jsp

# Step 1...
# First, get your working directory set properly so you'll be able to import
# files without having to put the full path into your commands.

# On your CS computers, uncomment the following if you store your csv files in your home directory
#setwd("~/")

# On a windows computer, something like the following (depends where you keep your files)
#setwd("C:/Users/jkinne/Documents/cs459")

# On my macbook, I am using this - 
setwd("~/Documents")

# When I am working on my office computer, I am using this - 
#setwd("~/public_html/cs459-bd4isu-s2019/code")


# Step 2...
# Import the csv file into a data frame.
# header=TRUE because there is a header in the csv file.  na.strings so the numbers will properly be read as numbers 
#  (and M or T as missing data)
# Try using the following file as well - Champaign-Weather-Station-USC00118740-1950-2018.csv
data_indy <- read.csv("Indianapolis-Weather-Station-USW00093819-1950-2018.csv", 
                 header = TRUE, na.strings=c("M", "T"))
data_champ <- read.csv("Champaign-Weather-Station-USC00118740-1950-2018.csv", 
                  header = TRUE, na.strings=c("M", "T"))

processWeather <- function(data) {
  # Fix a few things with the data.  This is a process of looking at it to make sure 
  #  things are right, and fixing anything that is wrong.  Note - after working through issues
  #  with weather data downloaded for Terre Haute, I decided it wasn't good enough so I switched
  #  to data from the Indy airport.

  # Add a "Year" column so we can do averages and such based on that
  data$Year <- as.numeric(substr(data$Date, 1, 4))
  data$Month <- as.numeric(substr(data$Date, 6,7))
  data$Day <- as.numeric(substr(data$Date, 9,10))
  data$Day365 <- as.numeric(format.Date(data$Date, "%j"))
    
  # Summary of all of the columns.  Check out the output
  #print(summary(data))

  # Check to make sure things look right...
  #View(data)
  return(data)
}
data_indy <- processWeather(data_indy)
data_champ <- processWeather(data_champ)

# Step 3...
# Some analysis and/or plotting

# Note, if we want the average PRCP for 1950, we could do ...
#  summary(data$PRCP[1:365])
#  mean(data$PRCP[1:365], na.rm=TRUE)

makeYearSummaries <- function(data) {
  # Data frame that will have one row for each year of our data
  # Note - unique does what it sounds like
  yearSummaries <- data.frame(year=unique(data$Year))
  
  # Add a column with a count of how many entries there are for each year (hopefully 365 or 366)
  yearSummaries$count <- table(as.factor(data$Year))
  
  #plot(yearSummaries$year, yearSummaries$count)
  
  # Add some other yearly data - average max, average min, average mean, total prcp, total snow
  yearSummaries$tmax <- tapply(data$TMAX, data$Year, mean, na.rm=TRUE)
  yearSummaries$tmin <- tapply(data$TMIN, data$Year, mean, na.rm=TRUE)
  yearSummaries$mean <- tapply(data$MEAN, data$Year, mean, na.rm=TRUE)
  
  yearSummaries$prcp <- tapply(data$PRCP, data$Year, sum, na.rm=TRUE)
  yearSummaries$snow <- tapply(data$SNOW, data$Year, sum, na.rm=TRUE)
  
  # last day of measurable snowfall 
  yearSummaries$lastSnow <- 
    as.numeric(
      lapply(unique(data$Year), function(y) {
      max(data[data$SNOW > 0 & data$Year == y & data$Day365 < 180,"Day365"],na.rm=TRUE)
    })
    )
  
  # first day of measurable snowfall 
  yearSummaries$firstSnow <- 
    as.numeric(
      lapply(unique(data$Year), function(y) {
        min(data[data$SNOW > 0 & data$Year == y & data$Day365 > 180,"Day365"],na.rm=TRUE)
      })
    )
  
  # Last, only keep years with a full set of data
  yearSummaries <- yearSummaries[yearSummaries$count >= 365,]
  # And take a look at the summary
  print(summary(yearSummaries))
  return(yearSummaries)
}
yearSummaries_indy <- makeYearSummaries(data_indy)
yearSummaries_champ <- makeYearSummaries(data_champ)

# convert day-of-year number to approximate month-day
monthDay <- function(x) {
  x <- as.integer(x)
  substring(as.Date(x, origin="2001-01-01"),6)
}

# average first/last days of snowfall in indy
first <- max(yearSummaries_indy$firstSnow)
last <- max(yearSummaries_indy$lastSnow)
print(paste("Average first/last day of snowfall in indy, roughly (mm-dd)",
            monthDay(first),"and",monthDay(last)))

# Now for some plotting
# Plot of all of the years, with a point for the average max, average min, average mean
plot(yearSummaries_indy$year, yearSummaries_indy$tmax, # x coords, y coords
     col="red", ylim=c(30,80), pch=1, xlab="Year", ylab="Temperature (F)")
points(yearSummaries_indy$year, yearSummaries_indy$mean, col="green", pch=2)
points(yearSummaries_indy$year, yearSummaries_indy$tmin, col="blue", pch=3)
points(yearSummaries_champ$year, yearSummaries_champ$mean, col="orange", pch=4)
legend("top", legend=c("avg daily max temp for the year", "avg mean", "avg min", "avg mean Champ."), 
       col=c("red","green","blue", "orange"), pch=c(1,2,3,4), cex=.7)

# And we also compute a linear fit for the max temperature.  Does it look like it's trending up?
fit_max <- lm(tmax ~ year, data=yearSummaries_indy)
#plot(fit_max)
# Use the fit to draw a line of what it would predict
predicted <- fit_max$coefficients[1] + fit_max$coefficients[2] * yearSummaries_indy$year
# would do the same thing, 
#  predicted <- predict(fit_max)
# And add that to the plot
points(yearSummaries_indy$year, predicted, col="black", type="l")


fit_min <- lm(tmin ~ year, data=yearSummaries_indy)
#plot(fit_max)
# Use the fit to draw a line of what it would predict
predicted_min <- fit_min$coefficients[1] + fit_min$coefficients[2] * yearSummaries_indy$year
# And add that to the plot
points(yearSummaries_indy$year, predicted_min, col="black", type="l")


# save the figure
dev.copy(pdf, "yearly_temperatures.pdf"); dev.off()



# last 10 years from yearSummaries
n <- nrow(yearSummaries_indy)
print(yearSummaries_indy[(n-9):n,])
print(tail(yearSummaries_indy, n=10)) # another way

# first 10 years
print(yearSummaries_indy[1:10,])
print(head(yearSummaries_indy, n=10)) # another way


# differences between indy and champaign, out of the year summaries
differences <- data.frame(year=yearSummaries_indy$year, 
                          diff=(yearSummaries_indy$mean - yearSummaries_champ$mean))
plot(differences$year, differences$diff)

data_indy$MEAN_DIFF <- data_indy$MEAN - data_champ$MEAN
plot(data_indy$MEAN_DIFF)
hist(data_indy$MEAN_DIFF, breaks=100, xlab="difference in degF", ylab="how many days", 
     main="Daily temperature difference Indy-Champ.")
points(c(0,0),c(0,2000), type="l", col="green")
dev.copy(pdf, "daily_temperature_difference.pdf"); dev.off()



# average daily temperatures, boxplots
plot(data_indy$Day365, data_indy$MEAN)
boxplot(data_indy$MEAN ~ data_indy$Day365, xlab="day of the year", ylab="degF",
        main="Mean daily temperature, Indy")
dev.copy(pdf,"daily-temperature-indy.pdf"); dev.off()

# plotting the mean
plot(1:366, tapply(data_indy$MEAN, data_indy$Day365, mean), type="l", col="green",
     main="Average daily temperature, Indy", xlab="day of the year", ylab="degF",
     ylim=c(0,90))
points(1:366, tapply(data_indy$TMAX, data_indy$Day365, mean), type="l", col="red")
points(1:366, tapply(data_indy$TMIN, data_indy$Day365, mean), type="l", col="blue")
legend("bottom", legend=c("max", "mean", "min"), text.col=c("red","green","blue"))
dev.copy(pdf,"daily-temperature-indy2.pdf"); dev.off()

# record high and low
plot(1:366, tapply(data_indy$TMAX, data_indy$Day365, max), type="l", col="red",
     main="Record daily temperature, Indy", xlab="day of the year", ylab="degF",
     ylim=c(-30,115))
points(1:366, tapply(data_indy$TMIN, data_indy$Day365, min), type="l", col="blue")
legend("bottom", legend=c("record high", "record low"), text.col=c("red", "blue"))
dev.copy(pdf,"daily-temperature-records.pdf"); dev.off()


# highest temperature of the year, lowest temperature of the year
plot(unique(data_indy$Year), tapply(data_indy$TMAX, data_indy$Year, max), col="red",
     main="Highest/lowest temp of the year, Indy", xlab="year", ylab="degF",
     ylim=c(-30,115), type="l")
points(unique(data_indy$Year), tapply(data_indy$TMIN, data_indy$Year, min), col="blue", type="l")
legend("center", legend=c("highest of the year", "lowest of the year"), text.col=c("red", "blue"))
dev.copy(pdf,"annual-temperature-extremes.pdf"); dev.off()


# temperature variation over time
plot(unique(data_indy$Year), tapply(data_indy$TMAX-data_indy$TMIN, data_indy$Year, mean),
     main="Average high-low temp by year", xlab="year", ylab="degF", type="l",
     ylim=c(15,25))
dev.copy(pdf, "daily-temperature-variation.pdf"); dev.off()


# last day of measureable snowfall