library(feather)
library(psych)
library(plotly)
library(dplyr)
library(tidyr)
library(lme4)
library(infotheo)
# PALETTE
PALETTE <- c('#1380A1', '#FAAB18', '#990000', '#588300', '#bf55fb', '#1ce4c1')
FONT <- list(family='Helvetica')
# Load DS
train <- read_feather('train_set')
test <- read_feather('test_set')
manhattan <- read.csv('manhattan.csv')
TAT <- read.csv('Loc_Hour_TAT.csv')
head(train)
train <- train %>%
left_join(manhattan,
by=c('PULocationID'='LocationID')) %>%
replace_na(list('Area'='Other')) %>%
rename(PUArea=Area) %>%
left_join(manhattan,
by=c('DOLocationID'='LocationID')) %>%
replace_na(list('Area'='Other')) %>%
rename(DOArea=Area) %>%
filter(total_amount > 0) %>%
mutate(PUArea = as.factor(PUArea),
DOArea = as.factor(DOArea),
DOLocationID = as.factor(DOLocationID),
PULocationID = as.factor(PULocationID)) %>%
left_join(TAT %>% mutate(LocationID=as.factor(LocationID)),
by=c('PULocationID'='LocationID',
'pickup_hour'='Hour')) %>% drop_na()
test <- test %>%
left_join(manhattan,
by=c('PULocationID'='LocationID')) %>%
replace_na(list('Area'='Other')) %>%
rename(PUArea=Area) %>%
left_join(manhattan,
by=c('DOLocationID'='LocationID')) %>%
replace_na(list('Area'='Other')) %>%
rename(DOArea=Area) %>%
filter(total_amount > 0) %>%
mutate(PUArea = as.factor(PUArea),
DOArea = as.factor(DOArea),
DOLocationID = as.factor(DOLocationID),
PULocationID = as.factor(PULocationID)) %>%
left_join(TAT %>% mutate(LocationID=as.factor(LocationID)),
by=c('PULocationID'='LocationID',
'pickup_hour'='Hour')) %>% drop_na()
# Outlier analysis
tip_rate <- train$tip_amount-train$fare_amount
upperbound <- quantile(tip_rate[tip_rate>0], 0.75)+1.5*quantile(tip_rate[tip_rate>0], 0.75)-1.5*quantile(tip_rate[tip_rate>0], 0.25)
outliers <- train[train$tip_amount-train$fare_amount>=upperbound,]
outliers <- rbind(outliers, train[train$total_amount>2.23,])
write.csv(outliers, 'outliers.csv')
train <- train %>% anti_join(outliers)
# Plot pairwise for 100 random points
set.seed(26)
sample <- train %>% sample_n(100)
pairs.panels(sample,
bg=PALETTE[sample$PUArea],
pch=21,
method = "pearson", # correlation method
hist.col = "#FAAB18",
density = FALSE, # show density plot,
ellipses=FALSE,
rug = FALSE,
stars = TRUE,
cex.cor = 2.5
)
# Comparing levels of locations
bins <- cut(train$total_amount, 100000)
nmi <- function(x,y) {
return(2*mutinformation(x,y)/(entropy(x)+entropy(y)))
}
nmi(bins, train$PULocationID)
nmi(bins, train$PUArea)
nmi(bins, train$DOLocationID)
nmi(bins, train$DOArea)
# Test effect of PU Area on Total Amount
# Non-parametric (median-based) ANOVA
kruskal.test(total_amount ~ PUArea, data=sample)
# Plot boxplot to compare data
plot_ly(data=train,
x=~PUArea, y=~total_amount,
type='box', color=~Area, colors=PALETTE) %>% layout(font=FONT, yaxis=list(title='Total Amount'))
# Freq plot (Not used)
ttable <- as.data.frame.matrix(table(train %>% select(Area, pickup_hour)))
ttable$Area <- row.names(ttable)
row.names(ttable) <- NULL
ttable <- ttable %>% gather(key='Hour', value='Freq', `0`:`23`)
ttable$Hour <- as.numeric(ttable$Hour)
plot_ly(
data=ttable,
x=~Hour, y=~Freq, color=~Area, name=~Area, colors=PALETTE,
type='bar') %>% layout(barmode='stack', bargap=0.1,
xaxis=list(type='linear'),
yaxis=list(title='Total number of trips'),
font=FONT)
# Proportion plot (Not used)
ptable <- prop.table(table(train %>% select(Area, pickup_hour)),2)
ptable <- as.data.frame.matrix(ptable)
ptable$Area <- row.names(ptable)
row.names(ptable) <- NULL
ptable <- ptable %>% gather(key='Hour', value='Prop', `0`:`23`)
ptable$Hour <- as.numeric(ptable$Hour)
ptable$Prop <- ptable$Prop*100
plot_ly(
data=ptable,
x=~Hour, y=~Prop, color=~Area, name=~Area, colors=PALETTE,
type='bar') %>% layout(barmode='stack', bargap=0.1,
xaxis=list(type='linear'),
yaxis=list(title='Proportion (%)'),
font=FONT)
# Simple linear model for sample 500 random points
set.seed(26)
sample <- train %>% sample_n(10000)
# Forward feature selection >> Remove "posteriori" features aka dist, fare, tip
m0.sample <- lm(total_amount ~ (trip_duration+TAT+PUArea+DOArea)^2, data=sample)
m0.sample <- step(m0.sample, scope = ~., data=sample)
anova(m0.sample)
m0.sample <- lm(total_amount ~ TAT+(trip_duration+PUArea+DOArea)^2, data=sample)
anova(m0.sample)
# Build models
m1.sample <- lm(total_amount ~ trip_duration, data=sample)
m2.sample <- lm(total_amount ~ trip_duration + PUArea + DOArea, data=sample)
m3.sample <- lm(total_amount ~ trip_duration + PUArea + DOArea + TAT, data=sample)
m4.sample <- lm(total_amount ~ trip_duration + PUArea * DOArea, data=sample)
m5.sample <- lm(total_amount ~ trip_duration * (PUArea + DOArea), data=sample)
m6.sample <- lm(total_amount ~ (trip_duration + PUArea + DOArea)^2, data=sample)
m7.sample <- lmer(total_amount ~ trip_duration + PUArea + DOArea + (1|PUArea) + (1|DOArea), data=sample)
m8.sample <- lmer(total_amount ~ trip_duration + PUArea + DOArea + (trip_duration|PUArea) + (trip_duration|DOArea), data=sample)
m9.sample <- lmer(total_amount ~ trip_duration + PUArea + DOArea + (TAT|PUArea) + (TAT|DOArea), data=sample, control = lmerControl(optimizer ="Nelder_Mead"))
# Model comparison
model_comparison <- AIC(m0.sample, m1.sample, m2.sample, m3.sample, m4.sample,
m5.sample, m6.sample, m7.sample, m8.sample, m9.sample)
rmse <- c()
score <- function(model) {
10^(sqrt(sum((predict(model, test) - test$total_amount)^2)/dim(test)[1]))
}
rmse <- c(rmse, score(m0.sample))
rmse <- c(rmse, score(m1.sample))
rmse <- c(rmse, score(m2.sample))
rmse <- c(rmse, score(m3.sample))
rmse <- c(rmse, score(m4.sample))
rmse <- c(rmse, score(m5.sample))
rmse <- c(rmse, score(m6.sample))
rmse <- c(rmse, score(m7.sample))
rmse <- c(rmse, score(m8.sample))
rmse <- c(rmse, score(m9.sample))
model_comparison$RMSE <- rmse
model_comparison$MLR <- exp(1/2*(model_comparison$AIC[1]-model_comparison$AIC))
# Model comparison using AIC & rmse plot
model_comparison %>%
plot_ly(
type='bar', y=~RMSE, color=~RMSE, colors='viridis') %>%
layout(font=FONT,
yaxis=list(range=c(1.18,1.5)),
xaxis=list(title='Model', dtick=1))
# Model comparison using AIC & rmse
model_comparison %>%
plot_ly(
type='bar', y=~AIC, color=~AIC, colors='viridis') %>%
layout(font=FONT,
yaxis=list(range=c(-25500,-18500)),
xaxis=list(title='Model', dtick=1))
write.csv(model_comparison, 'model_comparison.csv')
# Diagnostics
par(mfrow = c(2, 2))
plot(m1.sample)
par(mfrow = c(2, 2))
plot(m0.sample)
# Plot predictions
sample_visual <- list()
for (area in unique(test$PUArea)) {
set.seed(7)
sample_visual[[area]] <- test %>% filter(PUArea==area) %>%
sample_n(20) %>%
select(trip_duration, total_amount, PUArea, DOArea) %>%
distinct(trip_duration, PUArea, DOArea, .keep_all = TRUE) %>%
mutate(pred.m1=predict(m1.sample, .),
pred.m4=predict(m2.sample, .),
pred.m7=predict(m7.sample, .))
}
sample_visual <- do.call(rbind, sample_visual) %>% arrange(trip_duration)
sample_visual %>%
plot_ly() %>%
add_trace(x=~trip_duration,
y=~total_amount,
type='scatter', mode='markers', color=~PUArea, colors=PALETTE, alpha=0.5, marker=list(symbol='circle-open')) %>%
add_trace(x=~trip_duration,
y=~pred.m1,
type='scatter', mode='lines', name='Baseline', colors='black') %>%
add_trace(x=~trip_duration,
y=~pred.m4,
type='scatter', mode='lines', color=~PUArea, colors=PALETTE, line=list(dash='dot'), alpha=0.5) %>%
add_trace(x=~trip_duration,
y=~pred.m7,
type='scatter', mode='lines', color=~PUArea, colors=PALETTE, line=list(dash='dash'), alpha=0.5) %>%
layout(xaxis=list(type='linear', title='Trip duration'),
yaxis=list(type='linear', title='Total amount'), font=FONT)
# Fit m0 for the whole train set
mfinal <- lm(total_amount ~ TAT+(trip_duration+PUArea+DOArea)^2, data=train)
summary(mfinal)
predicted <- test %>% mutate(pred=predict(mfinal, .)) %>% select(total_amount, pred, PUArea, DOArea)
write.csv(summary(mfinal)$coef, 'finalModel.csv')
remove(mfinal) # Free memory
# Error analysis
predicted %>% #head(100) %>%
mutate(errors=10^(total_amount-pred),
logerrors=total_amount-pred,
Area=paste0(PUArea,':',DOArea)) %>%
plot_ly(x=~Area, y=~errors, type='box', marker=list(opacity=0), color=~Area, colors=PALETTE) %>%
layout(yaxis=list(type='linear', title='Error'),
xaxis=list(title='Area'), font=FONT)