import math
import warnings
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import random
random.seed(131)
''' Global Variables '''
model_vars = ['population_log', "population_group", "burglary_cube_root", "larceny_theft_cube_root"]
def read_data(only_florida=False):
florida = pd.read_excel('./florida_2017.xls')
headers = ['city', 'population', 'violent_crime', 'murder', 'rape', 'robbery',
'assault', 'property_crime', 'burglary', 'larceny_theft',
'motor_vehicle_theft', 'arson']
florida.columns = headers
florida.set_index('city', inplace=True)
florida = florida[florida.population < florida.population.quantile(0.9)]
if only_florida:
return florida
# read in other states for validation purposes
ohio = pd.read_excel('./ohio_2017.xls')
ohio.columns = headers
ohio.set_index('city', inplace=True)
michigan = pd.read_excel('./michigan_2017.xls')
michigan.columns = headers
michigan.set_index('city', inplace=True)
north_carolina = pd.read_excel('./north-carolina_2017.xls')
north_carolina.columns = headers
north_carolina.set_index('city', inplace=True)
crime_cols = ['violent_crime', 'murder', 'rape', 'robbery',
'assault', 'property_crime', 'burglary', 'larceny_theft',
'motor_vehicle_theft', 'arson']
# so we will want to remove some outliers
michigan = michigan[michigan.population < michigan.population.quantile(0.9)]
north_carolina = north_carolina[north_carolina.population < north_carolina.population.quantile(0.9)]
ohio = ohio[ohio.population < ohio.population.quantile(0.9)]
return florida, michigan, north_carolina, ohio
def transform_data(df):
# add a log odds variable (do not include property crimes though)
cols = ['violent_crime', 'murder', 'rape', 'robbery',
'assault', 'burglary', 'larceny_theft',
'motor_vehicle_theft', 'arson']
df["log_odds"] = np.log1p(df.population / df[cols].sum(axis=1))
# log population
df["population_log"] = np.log(df.population)
# log1p first adds 1 to x then logs the result
df["property_crime_log"] = np.log1p(df.property_crime)
# create a population_medium indicator variable
# these are going to be relative to the population in each state
# the medium group is the interquartile range on population (between 1st and 3rd quantiles)
# population thresholds. simply uses the first 3 quantiles
population_low = df.population.quantile(0.25)
popullation_high = df.population.quantile(0.75)
df["population_medium"] = (df.population.between(population_low, popullation_high)).astype("int")
# create n population groups
df["population_group"] = pd.cut(df.population, 5, labels=list(range(1, 6)))
# create robbery dummy var
df["has_robbery"] = np.where(df.robbery > 0, 1, 0)
# because box-cox transforms require x>0, when property_crime is 0 we add 1, else we leave it alone
df["property_crime_2"] = df["property_crime"].apply(lambda x: x + 1 if x == 0 else x)
# burglary_cube_root
df["burglary_cube_root"] = df.burglary ** (1 / 3)
# assault_log
df["assault_log"] = np.log1p(df.assault)
# larceny_theft_cube_root
df["larceny_theft_cube_root"] = df.larceny_theft ** (1 / 3)
# robbery_log
df["robbery_log"] = np.log1p(df.robbery)
# motor_vehicle_theft_log
df["motor_vehicle_theft_log"] = np.log1p(df.motor_vehicle_theft)
return df
def boxcox_transform(df):
bc = boxcox(df["property_crime_2"])
df["property_crime_bc"] = bc[0]
return df, bc[1]
def split(df):
df_train = df.sample(frac=0.7, random_state=41)
df_test_cities = list(set(df.index).difference(set(df_train.index)))
df_test = df.loc[df_test_cities, :]
return df_train, df_test
def build_model(train, test, bc_lambda):
# Now lets see how well we can predict the boxcox transform of property_crime
formula = "property_crime_bc ~ " + ' + '.join(model_vars)
lm1 = smf.ols(formula=formula, data=train).fit()
print(lm1.summary())
pred_train = inv_boxcox(lm1.fittedvalues, bc_lambda)
pred_test = inv_boxcox(lm1.predict(test), bc_lambda)
resids_train = train["property_crime"] - pred_train
resids_test = test["property_crime"] - pred_test
#print("RMSE: {:.4f}".format(resids_train.std()))
#print("------\nTrain\n------")
r2_test_bc = r2_score(test["property_crime_bc"], lm1.predict(test))
r2_test = r2_score(test["property_crime"], pred_test)
print("R-squared (Property Crime Box-Cox): {}".format(r2_test_bc))
print("R-squared (Property Crime): {}".format(r2_test))
#print("------\nTest\n------")
build_evaluate_model(train, test, bc_lambda, model_vars)
return lm1, pred_train, pred_test, resids_train, resids_test
def build_evaluate_model(train, test, bc_lambda, model_vars):
regr = linear_model.LinearRegression()
regr.fit(train[model_vars], train["property_crime_bc"])
y_pred = inv_boxcox(cross_val_predict(regr, test[model_vars], test["property_crime_bc"]), bc_lambda)
print("R-squared (after reverse property crime Box-Cox): {:.4f}".format(r2_score(test["property_crime"], y_pred)))
resids_train = evaluate_model(regr, train, bc_lambda, "Train", model_vars)
resids_test = evaluate_model(regr, test, bc_lambda, "Test", model_vars)
return regr, resids_train, resids_test
''' Call this function once for the train set and once for the test set '''
def evaluate_model(model, df, bc_lambda, name, model_vars):
y_pred_bc = cross_val_predict(model, df[model_vars], df["property_crime_bc"])
cv = cross_val_score(model, df[model_vars], df["property_crime_bc"], cv=5)
print("{}\n-------------------------------------------------------------\n".format(name))
print(cv)
print("cv average is = {:.2f}%".format(cv.mean() * 100))
''' return residuals '''
print("\nBEFORE reversing predicted Box-Cox transform of property crime")
residuals_bc = df["property_crime_bc"] - y_pred_bc.reshape(y_pred_bc.shape[0],)
print("Mean Residual: {:.5f}".format(residuals_bc.mean()))
print("RMSE: {:.2f}".format(residuals_bc.std()))
print("\nAFTER reversing predicted Box-Cox transform of property crime")
y_pred = inv_boxcox(y_pred_bc, bc_lambda).reshape(y_pred_bc.shape[0],)
residuals = df["property_crime"] - y_pred
print("Mean Residual: {}\nRMSE: {}".format(residuals.mean(), residuals.std()))
print("Average Error: {}\n".format(np.abs(residuals).sum() / residuals.shape[0]))
return residuals
def build_evaluate_pls_model(train, test, n_components, bc_lambda, model_vars):
# Fit a linear model using Partial Least Squares Regression.
# Reduce feature space to 3 dimensions.
pls1 = PLSRegression(n_components=n_components)
# Reduce X to R(X) and regress on y.
pls1.fit(train[model_vars], train["property_crime_bc"])
# Save predicted values.
print('R-squared PLSR (Train):', pls1.score(train[model_vars], train["property_crime_bc"]))
resids_train = evaluate_model(pls1, train, bc_lambda, "Train", model_vars)
print('R-squared PLSR (Test):', pls1.score(test[model_vars], test["property_crime_bc"]))
resids_test = evaluate_model(pls1, test, bc_lambda, "Test", model_vars)
return pls1, resids_train, resids_test