In this project, we are going to predict the price of a house using its 80 features. Basically we are solving the Kaggle Competition.
Follow the “House Prices Prediction: Advanced Regression Techniques End to End Project” step by step to get 3 Bonus.
1. Raw Dataset
2. Ready to use Clean Dataset for ML project
3. Full Project in Jupyter Notebook File
House Prices: Advanced Regression Techniques¶
Goal of the Project¶
Predict the price of a house by its features. If you are a buyer or seller of the house but you don’t know the exact price of the house, so supervised machine learning regression algorithms can help you to predict the price of the house just providing features of the target house.
Import essential libraries¶
In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Load Data Set¶
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("Shape of train: ", train.shape)
print("Shape of test: ", test.shape)
In [3]:
train.head(10)
Out[3]:
In [4]:
test.head(10)
Out[4]:
In [5]:
## concat train and test
df = pd.concat((train, test))
temp_df = df
print("Shape of df: ", df.shape)
In [6]:
df.head(6)
Out[6]:
In [7]:
df.tail(6)
Out[7]:
Exploratory Data Analysis (EDA)¶
In [8]:
# To show the all columns
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 85)
In [9]:
df.head(6)
Out[9]:
In [10]:
df.tail(6)
Out[10]:
In [11]:
df.info()
In [12]:
df.describe()
Out[12]:
In [13]:
df.select_dtypes(include=['int64', 'float64']).columns
Out[13]:
In [14]:
df.select_dtypes(include=['object']).columns
Out[14]:
In [15]:
# Set index as Id column
df = df.set_index("Id")
In [16]:
df.head(6)
Out[16]:
In [17]:
# Show the null values using heatmap
plt.figure(figsize=(16,9))
sns.heatmap(df.isnull())
Out[17]:
In [18]:
# Get the percentages of null value
null_percent = df.isnull().sum()/df.shape[0]*100
null_percent
Out[18]:
In [19]:
col_for_drop = null_percent[null_percent > 20].keys() # if the null value % 20 or > 20 so need to drop it
In [20]:
# drop columns
df = df.drop(col_for_drop, "columns")
df.shape
Out[20]:
In [21]:
# find the unique value count
for i in df.columns:
print(i + "\t" + str(len(df[i].unique())))
In [22]:
# find unique values of each column
for i in df.columns:
print("Unique value of:>>> {} ({})\n{}\n".format(i, len(df[i].unique()), df[i].unique()))
In [23]:
# Describe the target
train["SalePrice"].describe()
Out[23]:
In [24]:
# Plot the distplot of target
plt.figure(figsize=(10,8))
bar = sns.distplot(train["SalePrice"])
bar.legend(["Skewness: {:.2f}".format(train['SalePrice'].skew())])
Out[24]:
In [25]:
# correlation heatmap
plt.figure(figsize=(25,25))
ax = sns.heatmap(train.corr(), cmap = "coolwarm", annot=True, linewidth=2)
# to fix the bug "first and last row cut in half of heatmap plot"
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
Out[25]:
In [26]:
# correlation heatmap of higly correlated features with SalePrice
hig_corr = train.corr()
hig_corr_features = hig_corr.index[abs(hig_corr["SalePrice"]) >= 0.5]
hig_corr_features
Out[26]:
In [27]:
plt.figure(figsize=(10,8))
ax = sns.heatmap(train[hig_corr_features].corr(), cmap = "coolwarm", annot=True, linewidth=3)
# to fix the bug "first and last row cut in half of heatmap plot"
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
Out[27]:
In [28]:
# Plot regplot to get the nature of highly correlated data
plt.figure(figsize=(16,9))
for i in range(len(hig_corr_features)):
if i <= 9:
plt.subplot(3,4,i+1)
plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
sns.regplot(data=train, x = hig_corr_features[i], y = 'SalePrice')
Handling Missing Value¶
In [29]:
missing_col = df.columns[df.isnull().any()]
missing_col
Out[29]:
Handling missing value of Bsmt feature¶
In [30]:
bsmt_col = ['BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'TotalBsmtSF']
bsmt_feat = df[bsmt_col]
bsmt_feat
Out[30]:
In [31]:
bsmt_feat.info()
In [32]:
bsmt_feat.isnull().sum()
Out[32]:
In [33]:
bsmt_feat = bsmt_feat[bsmt_feat.isnull().any(axis=1)]
bsmt_feat
Out[33]:
In [34]:
bsmt_feat_all_nan = bsmt_feat[(bsmt_feat.isnull() | bsmt_feat.isin([0])).all(1)]
bsmt_feat_all_nan
Out[34]:
In [35]:
bsmt_feat_all_nan.shape
Out[35]:
In [36]:
qual = list(df.loc[:, df.dtypes == 'object'].columns.values)
qual
Out[36]:
In [37]:
# Fillinf the mising value in bsmt features
for i in bsmt_col:
if i in qual:
bsmt_feat_all_nan[i] = bsmt_feat_all_nan[i].replace(np.nan, 'NA') # replace the NAN value by 'NA'
else:
bsmt_feat_all_nan[i] = bsmt_feat_all_nan[i].replace(np.nan, 0) # replace the NAN value inplace of 0
bsmt_feat.update(bsmt_feat_all_nan) # update bsmt_feat df by bsmt_feat_all_nan
df.update(bsmt_feat_all_nan) # update df by bsmt_feat_all_nan
"""
>>> df = pd.DataFrame({'A': [1, 2, 3],
... 'B': [400, 500, 600]})
>>> new_df = pd.DataFrame({'B': [4, 5, 6],
... 'C': [7, 8, 9]})
>>> df.update(new_df)
>>> df
A B
0 1 4
1 2 5
2 3 6
"""
Out[37]:
In [38]:
bsmt_feat = bsmt_feat[bsmt_feat.isin([np.nan]).any(axis=1)]
bsmt_feat
Out[38]:
In [39]:
bsmt_feat.shape
Out[39]:
In [40]:
print(df['BsmtFinSF2'].max())
print(df['BsmtFinSF2'].min())
In [41]:
pd.cut(range(0,1526),5) # create a bucket
Out[41]:
In [42]:
df_slice = df[(df['BsmtFinSF2'] >= 305) & (df['BsmtFinSF2'] <= 610)]
df_slice
Out[42]:
In [43]:
bsmt_feat.at[333,'BsmtFinType2'] = df_slice['BsmtFinType2'].mode()[0] # replace NAN value of BsmtFinType2 by mode of buet ((305.0, 610.0)
In [44]:
bsmt_feat
Out[44]:
In [ ]:
In [45]:
bsmt_feat['BsmtExposure'] = bsmt_feat['BsmtExposure'].replace(np.nan, df[df['BsmtQual'] =='Gd']['BsmtExposure'].mode()[0])
In [46]:
bsmt_feat['BsmtCond'] = bsmt_feat['BsmtCond'].replace(np.nan, df['BsmtCond'].mode()[0])
bsmt_feat['BsmtQual'] = bsmt_feat['BsmtQual'].replace(np.nan, df['BsmtQual'].mode()[0])
In [47]:
df.update(bsmt_feat)
In [48]:
bsmt_feat.isnull().sum()
Out[48]:
Handling missing value of Garage feature¶
In [49]:
df.columns[df.isnull().any()]
Out[49]:
In [50]:
garage_col = ['GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt',]
garage_feat = df[garage_col]
garage_feat = garage_feat[garage_feat.isnull().any(axis=1)]
garage_feat
Out[50]:
In [51]:
garage_feat.shape
Out[51]:
In [52]:
garage_feat_all_nan = garage_feat[(garage_feat.isnull() | garage_feat.isin([0])).all(1)]
garage_feat_all_nan.shape
Out[52]:
In [53]:
for i in garage_feat:
if i in qual:
garage_feat_all_nan[i] = garage_feat_all_nan[i].replace(np.nan, 'NA')
else:
garage_feat_all_nan[i] = garage_feat_all_nan[i].replace(np.nan, 0)
garage_feat.update(garage_feat_all_nan)
df.update(garage_feat_all_nan)
In [54]:
garage_feat = garage_feat[garage_feat.isnull().any(axis=1)]
garage_feat
Out[54]:
In [55]:
for i in garage_col:
garage_feat[i] = garage_feat[i].replace(np.nan, df[df['GarageType'] == 'Detchd'][i].mode()[0])
In [56]:
garage_feat.isnull().any()
Out[56]:
In [57]:
df.update(garage_feat)
Handling missing value of remain feature¶
In [58]:
df.columns[df.isnull().any()]
Out[58]:
In [59]:
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0])
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['Functional'] = df['Functional'].fillna(df['Functional'].mode()[0])
df['KitchenQual'] = df['KitchenQual'].fillna(df['KitchenQual'].mode()[0])
df['MSZoning'] = df['MSZoning'].fillna(df['MSZoning'].mode()[0])
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])
df['Utilities'] = df['Utilities'].fillna(df['Utilities'].mode()[0])
df['MasVnrType'] = df['MasVnrType'].fillna(df['MasVnrType'].mode()[0])
In [60]:
df.columns[df.isnull().any()]
Out[60]:
In [61]:
df[df['MasVnrArea'].isnull() == True]['MasVnrType'].unique()
Out[61]:
In [62]:
df.loc[(df['MasVnrType'] == 'None') & (df['MasVnrArea'].isnull() == True), 'MasVnrArea'] = 0
In [63]:
df.isnull().sum()/df.shape[0] * 100
Out[63]:
Handling missing value of LotFrontage feature¶
In [64]:
lotconfig = ['Corner', 'Inside', 'CulDSac', 'FR2', 'FR3']
for i in lotconfig:
df['LotFrontage'] = pd.np.where((df['LotFrontage'].isnull() == True) & (df['LotConfig'] == i) , df[df['LotConfig'] == i] ['LotFrontage'].mean(), df['LotFrontage'])
In [65]:
df.isnull().sum()
Out[65]:
Feature Transformation¶
In [66]:
df.columns
Out[66]:
In [67]:
# converting columns in str which have categorical nature but in int64
feat_dtype_convert = ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
for i in feat_dtype_convert:
df[i] = df[i].astype(str)
In [68]:
df['MoSold'].unique() # MoSold = Month of sold
Out[68]:
In [69]:
# conver in month abbrevation
import calendar
df['MoSold'] = df['MoSold'].apply(lambda x : calendar.month_abbr[x])
In [70]:
df['MoSold'].unique()
Out[70]:
In [71]:
quan = list(df.loc[:, df.dtypes != 'object'].columns.values)
In [72]:
quan
Out[72]:
In [73]:
len(quan)
Out[73]:
In [74]:
obj_feat = list(df.loc[:, df.dtypes == 'object'].columns.values)
obj_feat
Out[74]:
Conver categorical code into order¶
In [75]:
from pandas.api.types import CategoricalDtype
df['BsmtCond'] = df['BsmtCond'].astype(CategoricalDtype(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
In [76]:
df['BsmtCond'].unique()
Out[76]:
In [77]:
df['BsmtExposure'] = df['BsmtExposure'].astype(CategoricalDtype(categories=['NA', 'Mn', 'Av', 'Gd'], ordered = True)).cat.codes
In [78]:
df['BsmtExposure'].unique()
Out[78]:
In [79]:
df['BsmtFinType1'] = df['BsmtFinType1'].astype(CategoricalDtype(categories=['NA', 'Unf', 'LwQ', 'Rec', 'BLQ','ALQ', 'GLQ'], ordered = True)).cat.codes
df['BsmtFinType2'] = df['BsmtFinType2'].astype(CategoricalDtype(categories=['NA', 'Unf', 'LwQ', 'Rec', 'BLQ','ALQ', 'GLQ'], ordered = True)).cat.codes
df['BsmtQual'] = df['BsmtQual'].astype(CategoricalDtype(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['ExterQual'] = df['ExterQual'].astype(CategoricalDtype(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['ExterCond'] = df['ExterCond'].astype(CategoricalDtype(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['Functional'] = df['Functional'].astype(CategoricalDtype(categories=['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod','Min2','Min1', 'Typ'], ordered = True)).cat.codes
df['GarageCond'] = df['GarageCond'].astype(CategoricalDtype(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['GarageQual'] = df['GarageQual'].astype(CategoricalDtype(categories=['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['GarageFinish'] = df['GarageFinish'].astype(CategoricalDtype(categories=['NA', 'Unf', 'RFn', 'Fin'], ordered = True)).cat.codes
df['HeatingQC'] = df['HeatingQC'].astype(CategoricalDtype(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['KitchenQual'] = df['KitchenQual'].astype(CategoricalDtype(categories=['Po', 'Fa', 'TA', 'Gd', 'Ex'], ordered = True)).cat.codes
df['PavedDrive'] = df['PavedDrive'].astype(CategoricalDtype(categories=['N', 'P', 'Y'], ordered = True)).cat.codes
df['Utilities'] = df['Utilities'].astype(CategoricalDtype(categories=['ELO', 'NASeWa', 'NASeWr', 'AllPub'], ordered = True)).cat.codes
In [80]:
df['Utilities'].unique()
Out[80]:
Show skewness of feature with distplot¶
In [81]:
skewed_features = ['1stFlrSF',
'2ndFlrSF',
'3SsnPorch',
'BedroomAbvGr',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtFullBath',
'BsmtHalfBath',
'BsmtUnfSF',
'EnclosedPorch',
'Fireplaces',
'FullBath',
'GarageArea',
'GarageCars',
'GrLivArea',
'HalfBath',
'KitchenAbvGr',
'LotArea',
'LotFrontage',
'LowQualFinSF',
'MasVnrArea',
'MiscVal',
'OpenPorchSF',
'PoolArea',
'ScreenPorch',
'TotRmsAbvGrd',
'TotalBsmtSF',
'WoodDeckSF']
In [82]:
quan == skewed_features
Out[82]:
In [83]:
plt.figure(figsize=(25,20))
for i in range(len(skewed_features)):
if i <= 28:
plt.subplot(7,4,i+1)
plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
ax = sns.distplot(df[skewed_features[i]])
ax.legend(["Skewness: {:.2f}".format(df[skewed_features[i]].skew())], fontsize = 'xx-large')
In [84]:
df_back = df
In [85]:
# decrease the skewnwnes of the data
for i in skewed_features:
df[i] = np.log(df[i] + 1)
In [86]:
plt.figure(figsize=(25,20))
for i in range(len(skewed_features)):
if i <= 28:
plt.subplot(7,4,i+1)
plt.subplots_adjust(hspace = 0.5, wspace = 0.5)
ax = sns.distplot(df[skewed_features[i]])
ax.legend(["Skewness: {:.2f}".format(df[skewed_features[i]].skew())], fontsize = 'xx-large')