1. Project Plan
1. Executive Summary
2. Data Acquisition
3. Exploration
4. Clustering
5. Modeling
6. Conclusions
Using 2017 properties and prediction data from our Zillow database for single unit/single family homes, we were tasked with improving the log error (Zestimate). To accomplish this, the team used clustering methodologies to find patterns in which features had the greatest effect on log error. Using the features that produced the best clusters, we then created a model to predict logerror.
We will be using the KMeans clustering algorithm to find clusters within the data to improve our estimate of the log error. Once those clusters have been identified, we will use regression models to test the efficacy of the clustering work.
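At a high level, the plan looks roughly like the sketch below: fit KMeans on a few scaled features, attach the cluster labels as an extra feature, then fit a regressor and compare its RMSE against a mean-logerror baseline. The function and column names here are placeholders for illustration only, not part of the actual pipeline.
# Rough sketch of the cluster-then-model plan (illustrative names only)
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def cluster_then_model(df, features, target='logerror', k=5):
    """Fit KMeans on the features, use the labels as an extra feature, and compare RMSE to a mean baseline."""
    df = df.copy()
    df['cluster'] = KMeans(n_clusters=k, random_state=121).fit_predict(df[features])
    rf = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=121)
    rf.fit(df[features + ['cluster']], df[target])
    rmse_model = np.sqrt(mean_squared_error(df[target], rf.predict(df[features + ['cluster']])))
    rmse_baseline = np.sqrt(mean_squared_error(df[target], np.full(len(df), df[target].mean())))
    print(f'model RMSE = {rmse_model:.4f} | baseline RMSE = {rmse_baseline:.4f}')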
The deliverables for this project are the following data assets:
Image(filename="clustering.png", width=800)
conclusion
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
# general
import pandas as pd
import numpy as np
# explore/ stat
import scipy.stats as stats
# visuals
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import Image
# support modules
import acquire
import wrangle
import prepare
import split_scale
import cluster
import model
# modeling
from sklearn.cluster import KMeans, DBSCAN
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.feature_selection import RFE
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
# view full DF
pd.set_option('display.max_columns', None)
# some handy functions to use along widgets
from IPython.display import display, Markdown, clear_output
# widget packages
import ipywidgets as widgets
# our conclusion button
button = widgets.Button(description='Our Conclusion')
out = widgets.Output()
def on_button_clicked(_):
    # "linking function with output"
    with out:
        # what happens when we press the button
        clear_output()
        print('Our hypothesis and conclusion remain unclear.')
        print('Our derived variables proved useful, but not significantly.')
        print('The "best" performing model was a random forest regressor.')
# linking button and function together using a button's method
button.on_click(on_button_clicked)
conclusion = widgets.VBox([button,out])
import os
from IPython.display import Image
from ipywidgets import interact, interact_manual
Retrieve from the Codeup Zillow_db:
To create this dataframe, use the get_zillow_data function from acquire.py
df = acquire.get_zillow_data()
df.head()
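The internals of get_zillow_data live in acquire.py and are not shown in this notebook; a minimal sketch of what such a function typically looks like follows. The SQL and the get_db_url helper are assumptions, not the actual implementation.
# Hypothetical sketch of acquire.get_zillow_data -- the real query lives in acquire.py
import pandas as pd
from env import get_db_url   # assumed helper that builds the Codeup connection string

def get_zillow_data():
    """Pull 2017 single unit/single family properties joined to their predictions."""
    query = """
        SELECT prop.*, pred.logerror
        FROM properties_2017 AS prop
        JOIN predictions_2017 AS pred USING (parcelid)
    """
    return pd.read_sql(query, get_db_url('zillow'))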
- ensure there are no nulls
- remove all outliers
- create derived features
- assign intuitive names to each feature
df.shape
df.isnull().sum()
df = wrangle.handle_nulls(df)
df.isnull().sum()
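wrangle.handle_nulls is defined in the support module; a plausible sketch of the usual approach (drop mostly-null columns, then drop any remaining rows with nulls) is below. The 50% column threshold is an assumption, not the actual value used.
# Sketch of a typical handle_nulls helper -- the threshold here is an assumption
def handle_nulls(df, col_threshold=0.5):
    """Drop columns that are mostly null, then drop any rows still containing nulls."""
    df = df.dropna(axis=1, thresh=int(col_threshold * len(df)))  # keep columns with enough non-null values
    return df.dropna()                                           # drop remaining rows with any null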
Create synthetic features
df = wrangle.prepare_zillow(df)
df = prepare.tax_rate(df)
# remove outliers in these columns
col_out = ["bathroomcnt", "bedroomcnt", "tax_rate", "calculatedfinishedsquarefeet",
"lotsizesquarefeet", "structuretaxvaluedollarcnt", "taxvaluedollarcnt", "landtaxvaluedollarcnt"]
df = prepare.remove_outliers_iqr(df, col_out)
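prepare.remove_outliers_iqr isn't shown here; a minimal sketch of the standard IQR rule (drop rows outside 1.5 * IQR of the 25th/75th percentiles) follows. The 1.5 multiplier is the conventional default, not a confirmed project choice.
# Sketch of an IQR-based outlier filter -- the 1.5 multiplier is the conventional default
def remove_outliers_iqr(df, columns, k=1.5):
    """Drop rows falling outside [Q1 - k*IQR, Q3 + k*IQR] for each listed column."""
    for col in columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        df = df[(df[col] >= q1 - k * iqr) & (df[col] <= q3 + k * iqr)]
    return df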
# Additional outlier removal
df = df[((df.bathroomcnt <= 7) & (df.bedroomcnt <= 7) &
(df.bathroomcnt > 0) &
(df.bedroomcnt > 0) &
(df.calculatedfinishedsquarefeet < 7000) &
(df.tax_rate < .05)
)]
df = prepare.bed_bath_ratio(df)
df = prepare.better_names(df)
df.head()
df.shape
parcelDensity30000 = cluster.get_pde(df,30000)
plt.figure(figsize=(14,14))
plt.axis("off")
plt.title("The Area We're Focusing On", fontsize=18)
plt.scatter(df['longitude'].values, df['latitude'].values, c=parcelDensity30000, cmap='inferno', s=1, edgecolors='none')
plt.show()
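cluster.get_pde returns a per-parcel density estimate that is only used to color the scatter above. Its internals aren't shown, so the sketch below is just one plausible way to compute such values with a Gaussian kernel density estimate; the meaning of the 30000 argument is not assumed here.
# Plausible sketch of a parcel density estimate -- not the actual get_pde implementation
def get_pde(df):
    """Estimate point density at each property from its latitude/longitude."""
    coords = np.vstack([df['longitude'].values, df['latitude'].values])
    kde = stats.gaussian_kde(coords)   # scipy.stats is already imported as stats
    return kde(coords)                 # density value for every parcel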
train, test = split_scale.train_test(df)
print('train:', train.shape)
print('test:', test.shape)
train.head()
scaler, train_scaled, test_scaled = split_scale.min_max_scaler(train, test)
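split_scale.train_test and split_scale.min_max_scaler are thin wrappers around sklearn; a minimal sketch of what they likely do (a train/test split and a MinMaxScaler fit on train only) is below. The 80/20 split ratio is an assumption.
# Sketch of the split/scale helpers -- the 80/20 split ratio is an assumption
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def train_test(df, train_size=0.8, seed=121):
    return train_test_split(df, train_size=train_size, random_state=seed)

def min_max_scaler(train, test):
    scaler = MinMaxScaler().fit(train)   # fit on train only to avoid leakage
    train_scaled = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return scaler, train_scaled, test_scaled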
# code to regulate the size of plots for the rest of notebook
plt.rc('figure', figsize=(13, 10))
plt.rc('font', size=13)
# overall logerror distribution
sns.distplot(df.logerror)
plt.title('Log Error Distribution', fontsize=20)
plt.xlabel('Log Error')
plt.show()
# absolute value of logerror
sns.kdeplot(df.logerror.abs(), shade=True)
plt.title('Absolute Value of Log Error Distribution', fontsize=20)
plt.xlabel('Log Error')
plt.show()
# Heatmap to display correlation of the features and logerror.
plt.figure(figsize=(14, 10))
sns.heatmap(df.corr(), cmap='Greens', annot=True)
plt.title('Correlation Heatmap of All Features Including the Target', fontsize=20)
plt.show()
plt.figure(figsize=(14,8))
with sns.color_palette('Blues'):
sns.barplot(x='bathroomcnt', y='logerror', data=train)
plt.xlabel('Bathroom Count')
plt.ylabel('Log Error')
plt.title('Does bathroom count impact log error?')
plt.show()
plt.figure(figsize=(14,8))
with sns.color_palette("Blues"):
sns.barplot(x='bedroomcnt', y='logerror', data=train)
plt.xlabel('Bedroom Count')
plt.ylabel('Log Error')
plt.title('Does bedroom count impact log error?')
plt.show()
plt.figure(figsize=(14,8))
sns.scatterplot(x='tax_rate', y='logerror', data=train, alpha=.4)
plt.xlabel('Tax Rate')
plt.ylabel('Log Error')
plt.title('Does tax rate impact log error?')
plt.show()
plt.figure(figsize=(14,8))
sns.scatterplot(x='square_footage', y='logerror', data=train, alpha=.4)
plt.xlabel('Finished Square Feet')
plt.ylabel('Log Error')
plt.title('Does the finished square feet of a home impact log error?')
plt.show()
In this section, we identified patterns in the data that may help isolate the features most useful for predicting log error.
x = train_scaled.bedroomcnt
y = train_scaled.bathroomcnt
alternative_hypothesis = 'bedroom count is related to bathroom count'
alpha = .05
corr, p = stats.pearsonr(x, y)
corr, p
if p < alpha:
    print("We reject the null hypothesis")
    print("We can say that we have confidence that", alternative_hypothesis)
else:
    print("We fail to reject the null")
x = train_scaled.house_value
y = train_scaled.logerror
alternative_hypothesis = 'house value is related to logerror'
alpha = .05
corr, p = stats.pearsonr(x, y)
corr, p
if p < alpha:
    print("We reject the null hypothesis")
    print("We can say that we have confidence that", alternative_hypothesis)
else:
    print("We fail to reject the null")
p
plt.scatter(train.logerror,train.house_value)
plt.xlabel('Logerror')
plt.ylabel('House Value')
plt.title('Logerror vs House Value')
- Physical position
- Age of home
- Value of home
- Home features
We constructed several clusters using unique combinations of these features and evaluated them with t-tests to determine the significance of the relationship between each group within a cluster and the target variable, logerror.
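In rough terms, that test compares the log error of each cluster group against all other observations; a minimal sketch of the idea (a two-sample t-test per group at alpha = .05) is below. The actual cluster.test_sig used later may differ in detail.
# Sketch of a per-cluster significance test -- the helper in cluster.py may differ in detail
def test_sig(cluster_labels, train, alpha=.05):
    """Two-sample t-test of logerror for each cluster group vs. the rest."""
    for group in sorted(cluster_labels.unique()):
        in_group = train.logerror[cluster_labels == group]
        out_group = train.logerror[cluster_labels != group]
        t, p = stats.ttest_ind(in_group, out_group, equal_var=False)
        verdict = 'significant' if p < alpha else 'not significant'
        print(f'cluster {group}: t = {t:.3f}, p = {p:.4f} ({verdict})')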
Our two best-performing clusters are displayed below:
- Latitude
- Longitude
- Lot_size
- Square_footage
cluster_vars = train_scaled[['latitude', 'longitude', 'lot_size', 'square_footage']]
cluster_col_name = 'location_size'
centroid_col_names = ['centroid_' + i for i in cluster_vars]
#determine what the best k (number of groups) is
optimal_k = cluster.elbow_method(cluster_vars)
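cluster.elbow_method presumably fits KMeans over a range of k values and plots inertia so the elbow can be read off (the project helper also appears to return a suggested k, which isn't reproduced here). A minimal sketch, with the 1-10 range of k being an assumption:
# Sketch of an elbow-method helper -- the range of k values tried is an assumption
def elbow_method(X, k_range=range(1, 11)):
    """Plot KMeans inertia for each k so the elbow can be eyeballed."""
    inertias = [KMeans(n_clusters=k, random_state=121).fit(X).inertia_ for k in k_range]
    plt.plot(list(k_range), inertias, marker='o')
    plt.xlabel('k (number of clusters)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method')
    plt.show()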
# Function to obtain:
# The train clusters with their observations,
# test clusters and their observations
# and a df of the number of observations per cluster on train
kmeans, train_clusters, test_clusters, cluster_counts = cluster.get_clusters_and_counts(
    5, ['latitude', 'longitude', 'lot_size', 'square_footage'],
    'location_size', train_scaled, test_scaled)
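cluster.get_clusters_and_counts isn't shown either; roughly, it should fit KMeans on the named columns of train, label both train and test, and count observations per cluster on train. A sketch under those assumptions:
# Sketch of get_clusters_and_counts -- assumes it fits on train only and labels both sets
def get_clusters_and_counts(k, cols, cluster_name, train, test):
    kmeans = KMeans(n_clusters=k, random_state=121).fit(train[cols])
    train_clusters = pd.Series(kmeans.predict(train[cols]), index=train.index, name=cluster_name)
    test_clusters = pd.Series(kmeans.predict(test[cols]), index=test.index, name=cluster_name)
    cluster_counts = train_clusters.value_counts().to_frame('n_observations')
    return kmeans, train_clusters, test_clusters, cluster_counts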
X_train_scaled = train_scaled[["latitude", "longitude", "square_footage", "lot_size", "full_value", "age"]]
X_test_scaled = test_scaled[["latitude", "longitude", "square_footage", "lot_size", "full_value", "age"]]
# Function to obtain:
# dataframe of the train clusters with their observations,
# test clusters and their observations
# and a df of the number of observations per cluster on train.
X_train_scaled, train_scaled, X_test_scaled, test_scaled, centroids = cluster.append_clusters_and_centroids(
X_train_scaled, train_scaled, train_clusters,
X_test_scaled, test_scaled, test_clusters,
cluster_col_name, centroid_col_names, kmeans)
X_train_scaled.head()
- Lot_size
- Age
- Full_value
# Define variables
cluster_vars = train_scaled[['lot_size', 'age', 'full_value']]
cluster_col_name = 'size_age_value'
centroid_col_names = ['centroid_' + i for i in cluster_vars]
#determine what the best k (number of groups) is
optimal_k = cluster.elbow_method(cluster_vars)
# Function to obtain:
# The train clusters with their observations,
# test clusters and their observations
# and a df of the number of observations per cluster on train
kmeans, train_clusters, test_clusters, cluster_counts = cluster.get_clusters_and_counts(
    5, ['age', 'lot_size', 'full_value'],
    'size_age_value', train_scaled, test_scaled)
# Function to obtain:
# dataframe of the train clusters with their observations,
# test clusters and their observations
# and a df of the number of observations per cluster on train.
X_train_scaled, train_scaled, X_test_scaled, test_scaled, centroids = cluster.append_clusters_and_centroids(
X_train_scaled, train_scaled, train_clusters,
X_test_scaled, test_scaled, test_clusters,
cluster_col_name, centroid_col_names, kmeans)
X_train_scaled.head()
cluster.test_sig(X_train_scaled.location_size, train_scaled)
plt.scatter(train_scaled.square_footage, train_scaled.lot_size, c=X_train_scaled.location_size)
plt.xlabel('Square Footage')
plt.ylabel('Lot Size')
plt.title('Square Footage vs. Lot Size colored by Cluster')
cluster.test_sig(X_train_scaled.size_age_value, train_scaled)
plt.scatter(train_scaled.age, train_scaled.full_value, c=X_train_scaled.size_age_value)
plt.xlabel('Age of House')
plt.ylabel('Full Value of House')
plt.title('Age vs. Full Value colored by Cluster')
Use the selected features or clustered data to predict the target, logerror.
Models evaluated: linear regression, decision tree regressor, and random forest regressor.
Data for modeling:
all features selected via the clustering work, plus the cluster labels generated from those features
The random forest regressor performed best, and better than the baseline
# Splitting out X and Y variables for modeling
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
X_test = test_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_test = test_scaled[['logerror']]
# Creating a dataframe of the actual log errors for comparison throughout the modeling process
predictions = y_train[['logerror']]
predictions = predictions.rename(columns={'logerror': 'actual'})
predictions.head()
# Creating a baseline model
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
y_train['mean_logerror'] = y_train.logerror.mean()
rmse_baseline = np.sqrt(mean_squared_error(y_train.logerror, y_train.mean_logerror))
r2_baseline = r2_score(y_train.logerror, y_train.mean_logerror)
predictions['rmse_baseline'] = ('{:.4f}'.format(rmse_baseline))
print('This is the baseline dataset model performance')
print(f'RMSE = {rmse_baseline:.4f}')
print(f'R2 score = {r2_baseline:.4f}')
# Create decision tree regressor
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
regressor = DecisionTreeRegressor(max_depth = 8, random_state=121)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_dt_train = np.sqrt(mean_squared_error(y_train, y_pred))
r2_dt_train = r2_score(y_train, y_pred)
predictions['rmse_dt'] = ('{:.4f}'.format(rmse_dt_train))
print('This is the train dataset model performance')
print(f'RMSE = {rmse_dt_train:.4f}')
print(f'R2 score = {r2_dt_train:.4f}')
# Create random forest regressor
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
regressor = RandomForestRegressor(max_depth = 2, random_state=121, n_estimators=100)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_rf_train = np.sqrt(mean_squared_error(y_train, y_pred))
r2_rf_train = r2_score(y_train, y_pred)
predictions['rmse_rf'] = ('{:.4f}'.format(rmse_rf_train))
print('This is the train dataset model performance')
print(f'RMSE = {rmse_rf_train:.4f}')
print(f'R2 score = {r2_rf_train:.4f}')
# Create linear model
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
lm = LinearRegression()
# Use recursive feature elimination (RFE) to find the top 3 features
rfe = RFE(lm, n_features_to_select=3)
X_rfe = rfe.fit_transform(X_train, y_train)
# Fitting the data to model
lm.fit(X_rfe, y_train)
y_pred = lm.predict(X_rfe)
rmse_lm_train = np.sqrt(mean_squared_error(y_train, y_pred))
r2_lm_train = r2_score(y_train, y_pred)
predictions['rmse_lm'] = ('{:.4f}'.format(rmse_lm_train))
print('This is the train dataset model performance')
print(f'RMSE = {rmse_lm_train:.4f}')
print(f'R2 score = {r2_lm_train:.4f}')
# Our first look at the predictions dataframe
predictions
# Make variables out of the clusters
X_train_scaled['lot_cluster'] = X_train_scaled['size_age_value'] == 2
X_train_scaled['lot_cluster'] = X_train_scaled['lot_cluster'].astype(int)
X_train_scaled['loc_cluster'] = X_train_scaled['location_size'] == 3
X_train_scaled['loc_cluster'] = X_train_scaled['loc_cluster'].astype(int)
# Verify the variables are ready to go
X_train_scaled.head()
# Take a look at the number of observations we'll be using
X_train_scaled.loc_cluster.value_counts()
# Take a look at the number of observations we'll be using
X_train_scaled.lot_cluster.value_counts()
# run the models against the cluster variables
# create decision tree regressor
X_train = X_train_scaled[['loc_cluster']]
y_train = train_scaled[['logerror']]
regressor = DecisionTreeRegressor(max_depth = 8, random_state=121)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_dt_train_loc = np.sqrt(mean_squared_error(y_train, y_pred))
r2_dt_train_loc = r2_score(y_train, y_pred)
predictions['rmse_dt_loc'] = ('{:.4f}'.format(rmse_dt_train_loc))
print('This is the cluster variable dataset using the loc_cluster model performance')
print(f'RMSE = {rmse_dt_train_loc:.15f}')
print(f'R2 score = {r2_dt_train_loc:.4f}')
# create decision tree regressor
X_train = X_train_scaled[['lot_cluster']]
y_train = train_scaled[['logerror']]
regressor = DecisionTreeRegressor(max_depth = 8, random_state=121)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_dt_train_lot = np.sqrt(mean_squared_error(y_train, y_pred))
r2_dt_train_lot = r2_score(y_train, y_pred)
predictions['rmse_dt_lot'] = ('{:.4f}'.format(rmse_dt_train_lot))
print('This is the cluster variable dataset using the lot_cluster model performance')
print(f'RMSE = {rmse_dt_train_lot:.15f}')
print(f'R2 score = {r2_dt_train_lot:.4f}')
# create random forest regressor
X_train = X_train_scaled[['loc_cluster']]
y_train = train_scaled[['logerror']]
regressor = RandomForestRegressor(max_depth = 2, random_state=121, n_estimators=100)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_rf_train_loc = np.sqrt(mean_squared_error(y_train, y_pred))
r2_rf_train_loc = r2_score(y_train, y_pred)
predictions['rmse_rf_loc'] = ('{:.4f}'.format(rmse_rf_train_loc))
print('This is the cluster variable dataset using the loc_cluster model performance')
print(f'RMSE = {rmse_rf_train_loc:.15f}')
print(f'R2 score = {r2_rf_train_loc:.15f}')
# create random forest regressor
X_train = X_train_scaled[['lot_cluster']]
y_train = train_scaled[['logerror']]
regressor = RandomForestRegressor(max_depth = 2, random_state=121, n_estimators=100)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_train)
rmse_rf_train_lot = np.sqrt(mean_squared_error(y_train, y_pred))
r2_rf_train_lot = r2_score(y_train, y_pred)
predictions['rmse_rf_lot'] = ('{:.4f}'.format(rmse_rf_train_lot))
print('This is the cluster variable dataset using the lot_cluster model performance')
print(f'RMSE = {rmse_rf_train_lot:.15f}')
print(f'R2 score = {r2_rf_train_lot:.15f}')
predictions
X_train = train_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_train = train_scaled[['logerror']]
X_test = test_scaled[['lot_size','age','full_value','longitude','latitude','square_footage']]
y_test = test_scaled[['logerror']]
regressor = RandomForestRegressor(max_depth = 2, random_state=121, n_estimators=100)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
rmse_rf_test = np.sqrt(mean_squared_error(y_test, y_pred))
r2_rf_test = r2_score(y_test, y_pred)
predictions['rmse_rf_test'] = ('{:.4f}'.format(rmse_rf_test))
print('This is the test dataset model performance')
print(f'RMSE = {rmse_rf_test:.15f}')
print(f'R2 score = {r2_rf_test:.4f}')
predictions.head(10)
Our hypothesis and conclusion remain unclear. Our derived variables proved useful, but not significantly so.
The main drivers appeared to be the overarching geographic data and the clusters built from the features associated with those data points.
The linear regression model performed quite poorly; however, the random forest regressors did slightly better than baseline.
We observed some statistically significant differences in log error across the clustered feature groups.
It appears that either more time is needed to evaluate the different clustering opportunities within the data, or that clustering is simply not the best approach for this dataset.