# Created by Jeff Kinne, jkinne@cs.indstate.edu
# Weather data is from https://mrcc.illinois.edu/CLIMATE/welcome.jsp
# Step 1...
# First, get your working directory set properly so you'll be able to import
# files without having to put the full path into your commands.
# On your CS computers, uncomment the following if you store your csv files in your home directory
#setwd("~/")
# On a Windows computer, something like the following (depends on where you keep your files)
#setwd("C:/Users/jkinne/Documents/cs459")
# On my macbook, I am using this -
#setwd("~/Documents")
# When I am working on my office computer, I am using this -
setwd(dir = "~/public_html/cs459-bd4isu-s2019/code")
# Step 2...
# Read each station's csv file into its own data frame.
# header = TRUE because the csv files start with a header row. na.strings
# lists the codes ("M" and "T") that should be read as missing data, so the
# measurement columns come in as numbers rather than text.
data_indy <- read.csv(
  file = "Indianapolis-Weather-Station-USW00093819-1950-2018.csv",
  header = TRUE,
  na.strings = c("M", "T")
)
data_champ <- read.csv(
  file = "Champaign-Weather-Station-USC00118740-1950-2018.csv",
  header = TRUE,
  na.strings = c("M", "T")
)
processWeather <- function(data) {
  # Clean-up step for a freshly imported station data frame: look the data
  # over, fix whatever is wrong, and hand back the corrected data frame.
  # (History: the Terre Haute download was not good enough even after working
  # through its issues, which is why the Indy airport data is used instead.)
  #
  # data:    data frame with a Date column whose first four characters are
  #          the year.
  # returns: the same data frame with a numeric Year column added.

  # The year is what the later averages and totals are grouped by.
  year_text <- substr(data$Date, 1, 4)
  data$Year <- as.numeric(year_text)

  # Uncomment either line to inspect the result by hand.
  # print(summary(data))
  # View(data)
  data
}
# Run the clean-up on both stations, replacing each data frame with its
# processed version (which now carries the Year column).
data_indy <- processWeather(data = data_indy)
data_champ <- processWeather(data = data_champ)
# Step 3...
# Some analysis and/or plotting
# Note, if we want the average PRCP for 1950 (the first 365 rows), we could do ...
# summary(data_indy$PRCP[1:365])
# mean(data_indy$PRCP[1:365], na.rm=TRUE)
makeYearSummaries <- function(data) {
  # Build a data frame with one row per year of daily weather data.
  #
  # data:    data frame with a numeric Year column plus TMAX, TMIN, MEAN,
  #          PRCP and SNOW columns (one row per day).
  # returns: data frame with columns year, count (number of daily rows),
  #          tmax / tmin / mean (yearly averages) and prcp / snow (yearly
  #          totals), restricted to years with at least 365 rows.
  # Also prints summary() of the result as a quick sanity check.

  # table() and tapply() report their groups in sorted factor-level order and
  # ignore NA, so the year column has to follow exactly the same order.
  # sort() drops NA by default. Using one shared factor for every grouping
  # guarantees that all columns line up row for row.
  years <- sort(unique(data$Year))
  year_groups <- factor(data$Year, levels = years)

  # Apply fun to one column within each year; as.vector() turns the 1-d
  # array that tapply() returns into a plain vector column.
  yearly <- function(column, fun) {
    as.vector(tapply(column, year_groups, fun, na.rm = TRUE))
  }

  yearSummaries <- data.frame(year = years)
  # How many entries there are for each year (hopefully 365 or 366)
  yearSummaries$count <- as.vector(table(year_groups))
  #plot(yearSummaries$year, yearSummaries$count)

  # Yearly averages of the daily temperatures
  yearSummaries$tmax <- yearly(data$TMAX, mean)
  yearSummaries$tmin <- yearly(data$TMIN, mean)
  yearSummaries$mean <- yearly(data$MEAN, mean)
  # Yearly totals of precipitation and snow
  yearSummaries$prcp <- yearly(data$PRCP, sum)
  yearSummaries$snow <- yearly(data$SNOW, sum)

  # Last, only keep years with a full set of data
  yearSummaries <- yearSummaries[yearSummaries$count >= 365, ]

  # And take a look at the summary
  print(summary(yearSummaries))
  yearSummaries
}
# One yearly summary table per station (each call also prints its summary).
yearSummaries_indy <- makeYearSummaries(data = data_indy)
yearSummaries_champ <- makeYearSummaries(data = data_champ)
# Now for some plotting
# Plot of all of the years for Indianapolis, with a point for the average
# max (red), average mean (green) and average min (blue) of each year.
plot(yearSummaries_indy$year, yearSummaries_indy$tmax, # x coords, y coords
     col = "red", ylim = c(30, 80), pch = 1,
     xlab = "Year", ylab = "Temperature (F)")
points(yearSummaries_indy$year, yearSummaries_indy$mean, col = "green", pch = 2)
points(yearSummaries_indy$year, yearSummaries_indy$tmin, col = "blue", pch = 3)
# Champaign's yearly average mean on the same axes, for comparison
points(yearSummaries_champ$year, yearSummaries_champ$mean, col = "orange", pch = 4)

# And we also compute a linear fit for the max temperature. Does it look like it's trending up?
fit_max <- lm(tmax ~ year, data = yearSummaries_indy)
#plot(fit_max)
# Use the fit to draw a line of what it would predict: intercept + slope * year
predicted <- fit_max$coefficients[1] + fit_max$coefficients[2] * yearSummaries_indy$year
# would do the same thing,
# predicted <- predict(fit_max)
# And add that to the plot
lines(yearSummaries_indy$year, predicted, col = "black")

# Same again for the min temperature
fit_min <- lm(tmin ~ year, data = yearSummaries_indy)
#plot(fit_min)
predicted_min <- fit_min$coefficients[1] + fit_min$coefficients[2] * yearSummaries_indy$year
lines(yearSummaries_indy$year, predicted_min, col = "black")

# Legend last, so that it describes everything drawn above. pch = NA / lty = NA
# switch off the symbol or the line for entries that do not use one.
legend("top",
       legend = c("avg daily max temp for the year", "avg mean", "avg min",
                  "avg mean (Champaign)", "linear fit (max, min)"),
       col = c("red", "green", "blue", "orange", "black"),
       pch = c(1, 2, 3, 4, NA),
       lty = c(NA, NA, NA, NA, 1),
       cex = .7)
# last 10 years from yearSummaries (all of them if there are fewer than 10)
n <- nrow(yearSummaries_indy)
# Start at row n-9 but never before row 1; length.out keeps the index empty
# when there are no rows at all.
print(yearSummaries_indy[seq.int(max(1, n - 9), length.out = min(10, n)), ])
print(tail(yearSummaries_indy, n = 10)) # another way
# first 10 years (again capped at the number of rows available)
print(yearSummaries_indy[seq_len(min(10, n)), ])
print(head(yearSummaries_indy, n = 10)) # another way
# Differences between indy and champaign, out of the year summaries.
# The two summaries only keep complete years, so they need not contain the
# same years. Look up each Indianapolis year in the Champaign table with
# match() instead of subtracting by row position; a year with no Champaign
# counterpart gets NA (and plot() simply skips it).
differences <- data.frame(
  year = yearSummaries_indy$year,
  diff = as.vector(yearSummaries_indy$mean) -
    as.vector(yearSummaries_champ$mean)[
      match(yearSummaries_indy$year, yearSummaries_champ$year)
    ]
)
plot(differences$year, differences$diff)

# Daily difference in MEAN between the stations, paired by Date for the same
# reason: a day missing from one file would otherwise shift every later row.
data_indy$MEAN_DIFF <- data_indy$MEAN -
  data_champ$MEAN[match(data_indy$Date, data_champ$Date)]
plot(data_indy$MEAN_DIFF)
hist(data_indy$MEAN_DIFF, breaks = 100)