####Lab 5: Generalized Additive Models, Prophet, Gradient Boosting####

########################################
######Generalized Additive Models#######
########################################

##1. Quarterly international arrivals (in thousands) to Australia from Japan,
##   New Zealand, the UK and the US, 1981Q1 - 2012Q3

library(fpp2)
?arrivals
autoplot(arrivals)
autoplot(arrivals[,c(1,2)])

Japan <- arrivals[,1]
NZ    <- arrivals[,2]
UK    <- arrivals[,3]
US    <- arrivals[,4]

######linear regression
tt <- 1:length(NZ)
seas <- factor(c(rep(1:4, length(NZ)/4), 1:3)) #####1:3 because three observations are 'left over' (the series ends in Q3)
mod2 <- lm(NZ ~ tt + seas)
summary(mod2)
AIC(mod2)

###linear model with no seasonality
mod2a <- lm(NZ ~ tt)
summary(mod2a)

#######we now try a GAM
library(gam)
?s #values for df should be greater than 1, with df=1 implying a linear fit; the default is df=4
g1 <- gam(NZ ~ s(tt) + seas)
summary(g1)
par(mfrow=c(1,2))
plot(g1, se=T)
AIC(g1)
#######time has a nonlinear effect

g1a <- gam(NZ ~ s(tt))
par(mfrow=c(1,2))
plot(g1a, se=T)
summary(g1a)

####a purely linear model can also be fitted with library(gam)
g0 <- gam(NZ ~ tt + seas)
summary(g0)
par(mfrow=c(1,2))
plot(g0, se=T)
AIC(g0)
####the GAM with splines performs better###

####try another option with loess (lo)
g2 <- gam(NZ ~ lo(tt) + seas)
summary(g2)
par(mfrow=c(1,2))
plot(g2, se=T)
AIC(g2)

#######analysis of residuals
tsdisplay(residuals(g1))
aar1 <- auto.arima(residuals(g1))
plot(as.numeric(NZ), type="l")
lines(fitted(g1), col=3)
lines(fitted(aar1) + fitted(g1), col=4)

###Exercise: try also with the other variables (a sketch for the Japan series
###follows the Prophet section below)

#######################################################################
####Prophet####
###############

library(prophet)

##2. iPhone sales
##data
iphone <- read.csv("iphone.csv", sep=";")
str(iphone)

##some preliminary analysis of the data
iphone$quarter <- as.Date(iphone$quarter, format="%d/%m/%Y")
str(iphone)
colnames(iphone) <- c("y","ds","cap")
summary(iphone$y) ##explains why cap is set to 100

#plot
plot(iphone$y, type="l", x=iphone$ds, ylab="", xlab="Year")
#nonlinear trend
#seasonality

#Prophet model
#
#model with no seasonality
# m2 <- prophet(iphone, yearly.seasonality=F, daily.seasonality=F,
#               weekly.seasonality=F, growth="logistic", n.changepoints=15)

#model with automatic selection of seasonality (default is "auto"):
#here just yearly seasonality; if daily or weekly seasonality is needed,
#it must be specified explicitly
##changepoints not specified, so they are placed automatically
m2 <- prophet(iphone, growth="logistic") #the default number of changepoints is 25
summary(m2)

##create a future 'window' for prediction
future2 <- make_future_dataframe(m2, periods=20, freq="quarter", include_history=T)
tail(future2)
future2$cap <- 100
forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])

#prediction plot
plot(m2, forecast2)
#dynamic plot
dyplot.prophet(m2, forecast2)
#plot with changepoints
plot(m2, forecast2) + add_changepoints_to_plot(m2, threshold=0)
#dates corresponding to the changepoints
m2$changepoints

##Prophet with no changepoints and multiplicative seasonality
m2 <- prophet(iphone, growth="logistic", n.changepoints=0,
              seasonality.mode='multiplicative')
#summary(m2)
#m2$seasonalities
future2 <- make_future_dataframe(m2, periods=20, freq="quarter", include_history=T)
tail(future2)
future2$cap <- 100
forecast2 <- predict(m2, future2)
tail(forecast2[c("ds", "yhat", "yhat_lower", "yhat_upper")])
#prediction plot
plot(m2, forecast2)
dyplot.prophet(m2, forecast2)
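##Optional extra (not part of the original lab): prophet also ships a built-in
##time-series cross-validation routine, which refits the model over a rolling
##series of cutoffs. The initial/period/horizon values below are illustrative
##choices only (expressed in days, since 'units' must be a difftime unit), and
##this assumes your prophet version carries the 'cap' column through the
##logistic-growth refits.
df.cv <- cross_validation(m2, initial=365*5, period=180, horizon=365, units="days")
head(df.cv)
performance_metrics(df.cv)
plot_cross_validation_metric(df.cv, metric="rmse")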
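##One possible sketch for the arrivals exercise above (not in the original lab):
##the same spline-plus-seasonality GAM applied to the Japan series. The object
##names ttJ, seasJ and gJ are our own.
ttJ <- 1:length(Japan)
seasJ <- factor(c(rep(1:4, length(Japan)/4), 1:3)) #same quarterly pattern, 3 observations left over
gJ <- gam(Japan ~ s(ttJ) + seasJ)
summary(gJ)
par(mfrow=c(1,2))
plot(gJ, se=T)
AIC(gJ)
tsdisplay(residuals(gJ))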
###############################################################
######Generalized Additive Models#######
###############################################################

##3. 'Movies' dataset

##preliminary analysis
data <- read.csv("movies.csv", stringsAsFactors=TRUE)
str(data)

#remove the columns of indicator variables
data <- data[,-c(1,2)]
#convert the variable release_date to Date format
data$release_date <- as.Date(data$release_date, "%d/%m/%Y")
str(data)

# Response variable
summary(data$vote_average)
#boxplot(data$vote_average, col="orange", ylim=c(0,10), main="Movies", ylab="Rating")
#
hist(data$vote_average, col="orange", main="Movies", xlab="Rating")
#
#explanatory variables
summary(data)
summary(data[,c(1,2,5,6)])
par(mfrow=c(2,2))
for(i in c(1,2,5,6)){
  hist(data[,i], col="orange", main=paste(colnames(data)[i]), xlab="")
}

#transform the quantitative variables to the log scale
data$budget <- log(data$budget)
data$popularity <- log(data$popularity)
data$revenue <- log(data$revenue)
summary(data[,c(1,2,5,6)])
par(mfrow=c(2,2))
for(i in c(1,2,5,6)){
  hist(data[,i], col="orange", main=paste(colnames(data)[i]), xlab="")
}
#go back to the original panel layout
par(mfrow=c(1,1))

#convert release_date to numeric (days since 1970-01-01)
data$release_date <- as.numeric(data$release_date)
summary(data$release_date)
#as.numeric(as.Date("1970-01-01"))

# Set train and test
set.seed(1)
train <- sample(1:nrow(data), 0.7*nrow(data))
data.train <- data[train,]
data.test <- data[-train,]

# convert the indicator variables to factors
data.train[,c(3,7,10:24)] <- lapply(data.train[,c(3,7,10:24)], factor)
data.test[,c(3,7,10:24)] <- lapply(data.test[,c(3,7,10:24)], factor)
str(data.train)

####Linear Model######################
m1 <- lm(vote_average ~ . - vote_classes, data=data.train)
summary(m1)

# Stepwise Regression
m2 <- step(m1, direction="both")
summary(m2)

#Prediction
p.lm <- predict(m2, newdata=data.test)
dev.lm <- sum((p.lm - data.test$vote_average)^2)
dev.lm
AIC(m2)

#############################
##GAM########################
library(gam)

###################
##Stepwise GAM
#start with a linear model (df=1)
g3 <- gam(vote_average ~ . - vote_classes, data=data.train)
#show the linear effects
par(mfrow=c(3,5))
plot(g3, se=T)

#perform stepwise selection using gam scope
#values for df should be greater than 1, with df=1 implying a linear fit
sc <- gam.scope(data.train[,-8], response=8, arg=c("df=2","df=3","df=4"))
g4 <- step.Gam(g3, scope=sc, trace=T)
summary(g4)
AIC(g4)
par(mfrow=c(3,5))
plot(g4, se=T)

#to inspect the individual plots more closely
par(mfrow=c(1,1))
plot(g4, se=T, ask=T)

#Prediction
data.test[,c(3,7,10:24)] <- lapply(data.test[,c(3,7,10:24)], factor) #repeats the conversion above; harmless
p.gam <- predict(g4, newdata=data.test)
dev.gam <- sum((p.gam - data.test$vote_average)^2)
dev.gam
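##Optional aside (not in the original lab): the same additive structure can be
##fitted with mgcv, which selects the smoothness of each spline by REML rather
##than searching over a fixed df grid as step.Gam does. This is only a reduced
##sketch on the four quantitative predictors. Note that gam and mgcv conflict
##(both define gam() and s()), so run this in a fresh session or after
##detach("package:gam").
g.mgcv <- mgcv::gam(vote_average ~ s(budget) + s(popularity) + s(revenue) + s(release_date),
                    data=data.train, method="REML")
summary(g.mgcv)
p.mgcv <- predict(g.mgcv, newdata=data.test)
sum((p.mgcv - data.test$vote_average)^2) #on the same scale as dev.gam and dev.lm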
################################
###### Gradient Boosting #######
################################

#install.packages("gbm") #run once if gbm is not installed
library(gbm)
?gbm

# 1 Boosting
boost.movies <- gbm(vote_average ~ . - vote_classes, data=data.train,
                    distribution="gaussian", n.trees=5000, interaction.depth=1)

#for the plot
par(mfrow=c(1,1))

#plot of the training error
plot(boost.movies$train.error, type="l", ylab="training error")
#always decreasing as the number of trees grows

#relative influence plot
summary(boost.movies)

#let us modify the graphical parameters to obtain a better plot:
#more space on the left
#default vector of margin parameters
mai.old <- par()$mai
mai.old
#new vector
mai.new <- mai.old
#new space on the left
mai.new[2] <- 2.5
mai.new
#modify the graphical parameters
par(mai=mai.new)
summary(boost.movies, las=1) #las=1: horizontal names on the y axis
summary(boost.movies, las=1, cBar=10) #cBar sets how many variables are shown
#back to the original window
par(mai=mai.old)

# test set prediction for every iteration (1:5000)
yhat.boost <- predict(boost.movies, newdata=data.test, n.trees=1:5000)

# calculate the test error for each iteration
#use 'apply' to avoid an explicit 'for' cycle:
#the first argument is the matrix we want to use, 2 means 'by column',
#and the third argument is the function we want to apply
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")

# error comparison (train and test)
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
#minimum error in the test set
best <- which.min(err)
abline(v=best, lty=2, col=4)
min(err) #minimum test error

# 2 Boosting - deeper trees
boost.movies <- gbm(vote_average ~ . - vote_classes, data=data.train,
                    distribution="gaussian", n.trees=5000, interaction.depth=4)
plot(boost.movies$train.error, type="l")
#par(mai=mai.new)
summary(boost.movies, las=1, cBar=10)
#par(mai=mai.old)
yhat.boost <- predict(boost.movies, newdata=data.test, n.trees=1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best <- which.min(err)
abline(v=best, lty=2, col=4)
min(err)

# 3 Boosting - smaller learning rate
boost.movies <- gbm(vote_average ~ . - vote_classes, data=data.train,
                    distribution="gaussian", n.trees=5000, interaction.depth=1,
                    shrinkage=0.01)
plot(boost.movies$train.error, type="l")
par(mai=mai.new)
summary(boost.movies, las=1, cBar=10)
par(mai=mai.old)
yhat.boost <- predict(boost.movies, newdata=data.test, n.trees=1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best <- which.min(err)
abline(v=best, lty=2, col=4)
min(err)

# 4 Boosting - combination of the previous settings
boost.movies <- gbm(vote_average ~ . - vote_classes, data=data.train,
                    distribution="gaussian", n.trees=5000, interaction.depth=4,
                    shrinkage=0.01)
plot(boost.movies$train.error, type="l")
#par(mai=mai.new)
summary(boost.movies, las=1, cBar=10)
par(mai=mai.old)
yhat.boost <- predict(boost.movies, newdata=data.test, n.trees=1:5000)
err <- apply(yhat.boost, 2, function(pred) mean((data.test$vote_average - pred)^2))
plot(err, type="l")
plot(boost.movies$train.error, type="l")
lines(err, type="l", col=2)
best <- which.min(err)
abline(v=best, lty=2, col=4)
err.boost <- min(err)

##comparison of the models in terms of residual deviance on the test set
dev.gbm <- sum((yhat.boost[,best] - data.test$vote_average)^2)
dev.gbm
dev.gam
dev.lm

boost.movies

# partial dependence plots
plot(boost.movies, i.var=1, n.trees=best)
plot(boost.movies, i.var=2, n.trees=best)
plot(boost.movies, i.var=5, n.trees=best)
plot(boost.movies, i.var=c(1,5), n.trees=best) #bivariate (library(viridis) may be necessary)
#
plot(boost.movies, i.var=3, n.trees=best) # categorical
plot(boost.movies, i.var=6, n.trees=best)
plot(boost.movies, i=23, n.trees=best) # categorical
plot(boost.movies, i=17, n.trees=best) # no effect
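##Optional extension (not part of the original lab): instead of choosing the
##number of trees on the test set, gbm can cross-validate on the training data
##(cv.folds); gbm.perf() then returns the iteration minimising the CV error and
##plots the training and CV error curves. The settings mirror model 4 above.
set.seed(1)
boost.cv <- gbm(vote_average ~ . - vote_classes, data=data.train,
                distribution="gaussian", n.trees=5000,
                interaction.depth=4, shrinkage=0.01, cv.folds=5)
best.cv <- gbm.perf(boost.cv, method="cv")
best.cv
#test error at the CV-selected iteration
yhat.cv <- predict(boost.cv, newdata=data.test, n.trees=best.cv)
mean((data.test$vote_average - yhat.cv)^2)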