2.7 Data Preprocessing (Python)
2.7.1 Phase 1
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os
def preprocessing(path, month, save_name, sample=False, profiling=False, year=2019):
    """Clean one month of raw trip records and save the result in Feather format.

    The steps run in a fixed order and each one prints a numbered progress
    line: select features, drop non-positive distances, parse timestamps and
    drop trips outside the requested month, derive ``trip_duration`` (minutes)
    and ``pickup_hour``, drop trips of 12 hours or more (or non-positive
    duration), drop location IDs 264/265, fill missing ``payment_type`` with 0,
    and drop duplicate rows.

    Two files are written: ``save_name`` (all payment types) and
    ``save_name + '_creditCard'`` (rows with ``payment_type == 1``, with the
    ``payment_type`` column removed).

    Args:
        path: Path of the CSV file to read.
        month: Month number (1-12) that the file is expected to cover.
        save_name: Output path for the Feather file.
        sample: If True, the first CSV column is read as the index and then
            discarded.
        profiling: If True, also write ``save_name + '_profiling.html'`` via
            ``ProfileReport``.
        year: Calendar year that the file is expected to cover.

    Returns:
        None. The results are written to disk.
    """
    print('FILENAME:',path)
    if sample:
        df = pd.read_csv(path, index_col=0).reset_index(drop=True)
    else:
        df = pd.read_csv(path)

    n_rows = df.shape[0]  # Original number of rows
    print('[0] Number of trips in the raw dataset:', n_rows)

    # Feature selection
    extraction = ['trip_distance', 'DOLocationID',
                  'PULocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
                  'payment_type', 'fare_amount', 'tip_amount', 'total_amount']
    df = df[extraction]
    print('[1] Select features')

    # Remove trips with non-positive distance
    zero = df[df['trip_distance']<=0].shape[0]
    print('[2] Remove trips with non-positive distance:', zero)
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the previous one.
    df = df[df['trip_distance']>0].copy()

    # Convert to datetime
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

    # Check datetime range: keep trips inside the half-open interval [d1, d2)
    d1 = datetime(year, month, 1)
    if month==12:
        d2 = datetime(year+1, 1, 1)
    else:
        d2 = datetime(year, month+1, 1)
    # Check wrong date / month
    wrongdate = df[(df['tpep_pickup_datetime']<d1) | (df['tpep_dropoff_datetime']>=d2)].shape[0]
    df = df[(df['tpep_pickup_datetime']>=d1) & (df['tpep_dropoff_datetime']<d2)].copy()
    print('[3] Convert to date/time & drop wrong dates:', wrongdate)

    # Feature engineer 'trip_duration' in minutes (rounded to whole minutes)
    elapsed = df['tpep_dropoff_datetime']-df['tpep_pickup_datetime']
    df['trip_duration'] = elapsed.dt.total_seconds().div(60).round().astype(int)
    # Feature engineer 'pickup_hour' as integer
    df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour.astype(int)
    # Remove `tpep_dropoff_datetime`
    df = df.drop('tpep_dropoff_datetime',axis=1)
    print('[4] Feature engineer `trip_duration` in minutes and `pickup_hour` as nearest integer')

    # Remove trips more than 12 hours (likely forgot to turn off meter)
    duration = df[(df['trip_duration']>=720) | (df['trip_duration']<=0)].shape[0]
    df = df[(df['trip_duration']<720) & (df['trip_duration']>0)].copy()
    print('[5] Remove trips more than 12 hours or non-positive:', duration)

    # Recast `PULocationID` as integer and drop the IDs 264 / 265
    df['PULocationID'] = df['PULocationID'].astype(int)
    pu = df[(df['PULocationID']==264) | (df['PULocationID']==265)].shape[0]
    df = df[(df['PULocationID']!=265) & (df['PULocationID']!=264)].copy()
    print('[6] Recast `PULocationID` as integer & remove unknown IDs:', pu)

    # Recast `DOLocationID` as integer and drop the IDs 264 / 265
    df['DOLocationID'] = df['DOLocationID'].astype(int)
    do = df[(df['DOLocationID']==264) | (df['DOLocationID']==265)].shape[0]
    df = df[(df['DOLocationID']!=265) & (df['DOLocationID']!=264)].copy()
    print('[7] Recast `DOLocationID` as integer & remove unknown IDs:', do)

    # Impute and categorise missing 'payment_type' with 0
    df['payment_type'] = df['payment_type'].fillna(0).astype(int)
    print('[8] Impute and categorise missing `payment_type` with 0')

    # Drop duplication; the reset index is required by to_feather below
    duplicate = df.duplicated().sum()
    df = df.drop_duplicates().reset_index(drop=True)
    print('[9] Drop duplicates & reset index:', duplicate)

    # Summary
    print()
    print('>> Final DF shape:',df.shape)
    # An empty input file would otherwise raise ZeroDivisionError here.
    reduction = (n_rows-df.shape[0])/n_rows*100 if n_rows else 0.0
    print('>> Reduction size (%):', reduction)
    print('>> Missing values:')
    print(df.isnull().sum())

    # Save
    print()
    df.to_feather(save_name)
    print('>> Saved to feather-format')

    # Panda Profiling
    if profiling:
        # NOTE(review): ProfileReport is not imported anywhere in the visible
        # source; presumably it comes from the pandas-profiling package.
        # Confirm the import exists before calling with profiling=True.
        ProfileReport(df, minimal=True).to_file(output_file=save_name+'_profiling.html')

    # Credit-card subset: payment_type == 1, with the constant column dropped
    df = df[df['payment_type']==1].drop('payment_type',axis=1).reset_index(drop=True)
    df.to_feather(save_name+'_creditCard')
    print('Credit Card only size:',df.shape)
    print('>> Saved to feather-format, credit-card only')
    print()
    print('-----------------------------------------------')
def load_attribute(attr, credit_card=False):
    """Load one column from every monthly Feather file as a single array.

    For each entry ``i`` of the module-level ``MONTHS`` sequence the file
    ``taxi/ETL/<i>19`` (or ``taxi/ETL/<i>19_creditCard`` when ``credit_card``
    is True) is read and its ``attr`` column is appended to the result.

    Args:
        attr: Name of the column to load.
        credit_card: If True, read the ``_creditCard`` variant of each file.

    Returns:
        A one-dimensional NumPy array holding the column values of all months,
        in ``MONTHS`` order.
    """
    suffix = "19_creditCard" if credit_card else "19"
    # The result is seeded with an empty float array so that its dtype is
    # promoted exactly as before (and an empty MONTHS still yields an empty
    # array).
    chunks = [np.array([])]
    for i in MONTHS:
        path = os.path.abspath(os.path.join("taxi", "ETL", i + suffix))
        # Read only the requested column instead of the whole file.
        chunks.append(pd.read_feather(path, columns=[attr])[attr].to_numpy())
    # Concatenate once; growing the array inside the loop re-copies all
    # previously loaded months on every iteration.
    return np.concatenate(chunks)
# Feature engineer
# Build per-pickup-zone averages of `rate_per_trip`, `tip_rate` and
# `trip_duration` from the credit-card arrays and write them to CSV.

# NOTE(review): presumably an average cost per mile, used as the per-mile
# deduction in `rate_per_trip` below; confirm the meaning and the source.
ACPM = 0.58

# Output column -> stem of the '<stem>_Credit.npz' archive it is loaded from.
# NOTE(review): Phase 2 reads '<attr>_credit.npz' (lower-case 'c'); on a
# case-sensitive file system these are different files. Confirm which is right.
_SOURCES = {'location': 'PULocationID',
            'total_amount': 'total_amount',
            'tip_amount': 'tip_amount',
            'trip_distance': 'trip_distance',
            'trip_duration': 'trip_duration'}

columns = {}
for column, stem in _SOURCES.items():
    # The context manager closes the archive once the array has been read.
    with np.load(stem + '_Credit.npz') as archive:
        columns[column] = archive['data']
df = pd.DataFrame(columns)

df['rate_per_trip']=(df['total_amount']-ACPM*df['trip_distance'])/df['trip_duration']
df = df.drop(['trip_distance'],axis=1)

df['tip_rate'] = df['tip_amount']/(df['total_amount']-df['tip_amount'])
df = df.drop(['tip_amount','total_amount'],axis=1)

# A zero denominator yields +/-inf, and a single inf would turn the whole
# zone's mean into inf. Treat those ratios as missing so mean() skips them.
df = df.replace([np.inf, -np.inf], np.nan)

df = df.groupby('location').mean().reset_index()
df['location'] = df['location'].astype(int)
df.to_csv('feature-engineer_by_PULocationID.csv')
# 103 (Statue of Liberty Island) & 110 doesn't have any trips
df.head()
2.7.2 Phase 2
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
import os
# Stage 1 - Sampling
# Draw a reproducible 1,000,000-row sample, split it 80/20 into train and test
# indices, and save the sampled rows of every attribute as compressed archives.

# Set seed
random.seed(26)
# Randomize 1 million indices
# NOTE(review): 59360231 is presumably the length of the '<attr>_credit.npz'
# arrays. A shorter array raises IndexError below; a longer one is silently
# under-sampled. Confirm.
ONEMILLION = random.sample(range(59360231), 1000000)
# 80-20 train test split
TRAIN_INDEX = random.sample(ONEMILLION, 800000)
TEST_INDEX = np.setdiff1d(ONEMILLION, TRAIN_INDEX)

ATTRS = ['trip_distance', 'trip_duration', 'PULocationID', 'DOLocationID', 'pickup_hour']
CREDIT = ['fare_amount', 'tip_amount', 'total_amount']

# Convert the index list once; indexing with the raw Python list would
# rebuild this array for every attribute.
_train_idx = np.asarray(TRAIN_INDEX)

for i in ATTRS+CREDIT:
    print('>> Atribute:',i)
    # The context manager closes the archive once the array has been read.
    with np.load(i+'_credit.npz') as archive:
        df = archive['data']

    train_df = df[_train_idx]
    print('Train Size (MB):', train_df.nbytes/1000000)
    np.savez_compressed(i+'_train.npz', data=train_df)
    del train_df

    test_df = df[TEST_INDEX]
    print('Test Size (MB):', test_df.nbytes/1000000)
    np.savez_compressed(i+'_test.npz', data=test_df)
    del test_df

    print('-------------------------------')
    # Release the full column before loading the next one.
    del df
from scipy.stats import ks_2samp
# K-S Test for checking sampling integrity
# For every attribute, compare the saved test and train samples with a
# two-sample Kolmogorov-Smirnov test and report the verdict at the 5% level.
for i in ATTRS+CREDIT:
    print('>> Atribute:',i)
    # The context managers close each archive once its array has been read.
    with np.load(i+'_test.npz') as archive:
        test_df = archive['data']
    with np.load(i+'_train.npz') as archive:
        train_df = archive['data']

    p = ks_2samp(test_df, train_df)[1]
    if p<0.05:
        print('The test set and train set have different distribution')
        print('p-value for KS Test is:',p)
    elif p>=0.05:
        # elif rather than else: a NaN p-value reports neither verdict.
        print('The test set and train set have similar distribution')
        print('p-value for KS Test is:',p)
    print('-------------------------------')
    del test_df
    del train_df
# Stage 2 - Dataset Creation
# Assemble the per-attribute train/test arrays into two DataFrames, keep only
# rows whose numerical columns are all non-negative, log-transform those
# columns and save both frames in Feather format.
_train_cols = {}
_test_cols = {}
for i in ATTRS+CREDIT:
    # The context managers close each archive once its array has been read.
    with np.load(i+'_train.npz') as archive:
        _train_cols[i] = archive['data']
    with np.load(i+'_test.npz') as archive:
        _test_cols[i] = archive['data']
train = pd.DataFrame(_train_cols)
test = pd.DataFrame(_test_cols)
del _train_cols, _test_cols

NUM_COLS = ['trip_distance', 'trip_duration', 'fare_amount', 'tip_amount', 'total_amount']
# Keep only rows where every numerical column is non-negative.
# .copy() so the log transform below writes to an independent frame rather
# than to a slice of the unfiltered one.
train = train[(train[NUM_COLS]>=0).all(axis=1)].copy()
test = test[(test[NUM_COLS]>=0).all(axis=1)].copy()

# Log transform the numerical cols (+0.001 to handle log(0))
for col in NUM_COLS:
    train[col] = np.log10(train[col]+0.001)
    test[col] = np.log10(test[col]+0.001)

print("Train set shape:", train.shape)
print("Test set shape:", test.shape)
# Save to feather format (to_feather requires a default index, hence the reset)
train.reset_index(drop=True).to_feather('train_set')
test.reset_index(drop=True).to_feather('test_set')
# Stage 3: Feature Scaling
from matplotlib import pyplot as plt
import seaborn as sns
# The stylesheet path is machine-specific; fall back to the current style
# when the file is not present instead of aborting the whole script.
_STYLESHEET = 'D:/DS/0_ASS1/stylesheet.mplstyle'
if os.path.exists(_STYLESHEET):
    plt.style.use(_STYLESHEET)
else:
    print('Stylesheet not found, using the current matplotlib style:', _STYLESHEET)

### LOG-SCALE DISTRIBUTIONS (TRIP DISTANCE, TOTAL AMOUNT, TIP AMOUNT, TRIP DURATION)
# One entry per figure: (column, histogram colour, KDE line colour,
# x-axis label, figure title).
_LOG_DISTRIBUTIONS = [
    ('trip_distance', '#86bfd0', '#147f9f', 'Log(Trip distance)',
     "Log Distribution of Trip distance (miles) in Train Set"),
    ('total_amount', '#fed38f', '#f9ab17', 'Log(Total amount)',
     "Log Distribution of Total amount ($) in Train Set"),
    ('tip_amount', '#c7827b', '#a80000', 'Log(Tip amount)',
     "Log Distribution of Tip amount ($) in Train Set"),
    ('trip_duration', '#abc27e', '#5a8303', 'Log(Trip duration)',
     "Log Distribution of Trip duration (min) in Train Set"),
]

for _column, _hist_color, _kde_color, _x_label, _title in _LOG_DISTRIBUTIONS:
    # NOTE: sns.distplot and the 'bw' KDE keyword are deprecated in recent
    # seaborn releases; kept so the figures match the original output.
    sns.distplot(train[_column], hist=True, kde=True,
                 bins=50, color=_hist_color,
                 kde_kws={'linewidth': 1, 'bw':0.05, 'color':_kde_color})
    plt.xlabel(_x_label)
    plt.ylabel('Density')
    plt.suptitle(_title, ha = 'right')
    plt.show()
### PICK UP HOURS
def _plot_trip_counts(values, x_label, title, **bar_kwargs):
    """Draw a bar chart of how often each distinct value occurs in `values`.

    Extra keyword arguments are forwarded to ``plt.bar``. The figure is left
    open so the caller can adjust it before calling ``plt.show()``.
    Returns the ``(unique, counts)`` arrays that were plotted.
    """
    unique, counts = np.unique(values, return_counts=True)
    plt.bar(unique, counts, width=0.9, **bar_kwargs)
    plt.xlabel(x_label)
    plt.ylabel('Number of trips')
    plt.suptitle(title, ha = 'right')
    return unique, counts


# Trips per pick-up hour, with ticks labelled '<hour>:00'
pickup_hour = train['pickup_hour']
(unique, counts) = _plot_trip_counts(
    pickup_hour, 'Time of day (hour)',
    "Number of trips by time of day in Train Set",
    tick_label=[str(int(x))+':00' for x in np.unique(pickup_hour)])
plt.xticks(rotation=70)
plt.show()

### PICK-UP ZONES
pickup_zone = train['PULocationID']
(unique, counts) = _plot_trip_counts(
    pickup_zone, 'Pick-up Zone ID',
    "Number of trips by Pick-up Zone in Train Set",
    color='#e5a5ff')
plt.show()

### DROP-OFF ZONES
dropoff_zone = train['DOLocationID']
(unique, counts) = _plot_trip_counts(
    dropoff_zone, 'Drop-off Zone ID',
    "Number of trips by Drop-off Zone in Train Set",
    color='#8ef1df')
plt.show()
# Omnibus test of normality & Shapiro-Wilk test of normality
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html
# H0 := sample comes from normal dist
# For every numerical column, run the omnibus normality test on the train and
# test sets and report whether H0 is rejected at the 5% level.
# NOTE: `shapiro` is imported but only `normaltest` is applied below.
from scipy.stats import normaltest, shapiro
for col in NUM_COLS:
    print('>> Attribute:', col)
    p_train = normaltest(train[col])[1]
    p_test = normaltest(test[col])[1]

    # elif rather than else: a NaN p-value reports neither verdict.
    if p_train<0.05:
        print('The train set is NOT approximately normal')
    elif p_train>=0.05:
        print('The train set is approximately normal')

    if p_test<0.05:
        print('The test set is NOT approximately normal')
    elif p_test>=0.05:
        print('The test set is approximately normal')
    print('-------------------------------')