Kaggle Housing Prices
A starter notebook for the Kaggle "House Prices: Advanced Regression Techniques" competition.
# This environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from functools import partial
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
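# Sanity check (a sketch with illustrative values): RMSE computed on log1p-transformed
# targets coincides with the RMSLE given by sklearn's mean_squared_log_error.
_y_true = np.array([100000., 200000., 150000.])
_y_pred = np.array([110000., 190000., 140000.])
assert np.isclose(np.sqrt(mean_squared_log_error(_y_true, _y_pred)),
                  rmse(np.log1p(_y_true), np.log1p(_y_pred)))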
DATA_DIR = Path("/kaggle/input/house-prices-advanced-regression-techniques/")
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
sub_df = pd.read_csv(DATA_DIR / "sample_submission.csv")
train_df.head()
# Inspect dtypes and null counts; later we will keep only columns that are at least 50% non-null
train_df.info()
train_df["SalePrice"].hist(bins=50)
# SalePrice is right-skewed, so we model log(SalePrice); this also matches the competition metric, which is RMSE on the log of the price
np.log(train_df["SalePrice"]).hist(bins=50);
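# Rough check of the transform (a sketch; assumes scipy, which the Kaggle image ships):
# skewness should drop substantially after taking the log.
from scipy.stats import skew
print("raw skew:", skew(train_df["SalePrice"]))
print("log skew:", skew(np.log(train_df["SalePrice"])))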
# Maybe we should exclude the two extreme prices from the dataset, since there's not much we can gain from two data points
train_df.plot(kind="scatter", x="Id", y="SalePrice", alpha=0.25)
plt.figure(figsize=(12,10))
num_corr = train_df.select_dtypes(include=np.number).corr()  # restrict to numeric columns so corr() works across pandas versions
sns.heatmap(num_corr, cmap='Greys');
num_corr
corr_cols = num_corr["SalePrice"].nlargest(15).index
corr_cols
plt.figure(figsize=(10, 6))
sns.heatmap(train_df.loc[:, corr_cols].corr(), annot=True, cmap="gray")
# Overall quality (OverallQual) has a big impact on SalePrice
train_df.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.25)
# GrLivArea is also strongly correlated.
# There are only four data points over 4,000 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.25)
train_df[train_df.GrLivArea > 4000]
# GarageArea is strongly correlated too.
# Only a handful of data points exceed 1,200 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GarageArea", y="SalePrice", alpha=0.25)
train_df[train_df.GarageArea > 1200]
# Drop the two extreme sale prices spotted above; reset the index so positional indexing stays aligned
train_df = train_df[train_df["SalePrice"] < 700000].reset_index(drop=True)
def get_features(train_df):
    """Split columns into numeric and categorical features, skipping columns that are less than 50% non-null."""
    num_features, cat_features = [], []
    for col in train_df.columns:
if col in ["Id", "SalePrice"]:
continue
dtype = train_df[col].dtype
ratio = pd.notna(train_df[col]).sum() / len(train_df[col])
if ratio < 0.5:
continue
if dtype == "object":
cat_features.append(col)
else:
num_features.append(col)
return num_features, cat_features
num_features, cat_features = get_features(train_df)
cat_features
def get_preprocess_pipeline(train_df, sample_features=False):
# Get Numeric and Categorical Features
numeric_features, categorical_features = get_features(train_df)
target = "SalePrice"
if sample_features:
numeric_features = ["LotArea"]
categorical_features = ["SaleType", "SaleCondition"]
numeric_transformer = Pipeline(steps=[
('imputer', KNNImputer(n_neighbors=5)),
('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
return preprocessor, numeric_features + categorical_features, target
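# Quick sanity check of the pipeline (a sketch): fit the preprocessor alone and
# inspect the transformed width (numeric columns + one-hot expanded categories).
_preprocessor, _features, _ = get_preprocess_pipeline(train_df)
print(_preprocessor.fit_transform(train_df[_features]).shape)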
def train_LR(train_df, test_df, sample_features=False):
"""
Train a Linear Regression Model
"""
    # Start with a simple linear model
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=sample_features)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LinearRegression())])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_test = test_df[features]
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)
print("RMSE Log Error", rmse(clf.predict(X_valid), y_valid))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.exp(clf.predict(X_test))
})
return sub_df
sub_df = train_LR(train_df, test_df, sample_features=True)
sub_df.to_csv("submission_lr_sample.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
sub_df = train_LR(train_df, test_df, sample_features=False)
sub_df.to_csv("submission_lr.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
# Quick KFold demo: each split yields (train_indices, valid_indices)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for p1, p2 in kfold.split(range(20)):
    print(p1, p2)
kfold = KFold(n_splits=7, shuffle=True, random_state=42)
for idxs in kfold.split(train_df):
print(idxs[0].shape, idxs[1].shape)
idxs
def hyperparam_finder_nocv(df, model_fn, trial):
    """
    Optuna objective using a single train/validation split (no CV)
    """
    model = model_fn(trial)
    preprocessor, features, target = get_preprocess_pipeline(df, sample_features=False)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    X_train = df[features]
    y_train = np.log(df[target])
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    return rmse(clf.predict(X_valid), y_valid)
def hyperparam_finder(df, model_fn, trial):
    """
    Optuna objective using 5-fold CV; returns the mean validation RMSE (log scale)
    """
    model = model_fn(trial)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]
        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        clf.fit(X_train, y_train)
        valid_errors.append(rmse(clf.predict(X_valid), y_valid))
    # Return the mean validation score for the hyperparameter search
    return np.mean(valid_errors)
# Now let's train with 5-fold CV
def train_kfold(df, test_df, ModelClass, **model_kwargs):
    """
    Train a regression model with 5-fold CV; returns the averaged test predictions
    and the out-of-fold validation predictions (for stacking later)
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
for train_idxs, valid_idxs in kfold.split(df):
train_df = df.iloc[train_idxs]
valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', ModelClass(**model_kwargs))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("RMSE Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
model1_sub_df, model1_valid_preds = train_kfold(train_df, test_df, LinearRegression)
model1_sub_df.to_csv("submission_lr_kfold.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
# The score may improve over the single-split LR above.
def train_XGB_kfold(df, test_df):
    """
    Train an XGBoost model with 5-fold CV
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]  # iloc: KFold yields positional indices, not labels
        valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', XGBRegressor(n_jobs=-1, n_estimators=500, max_depth=20))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("Mean Squared Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
def train_RF_kfold(df, test_df):
    """
    Train a Random Forest model with 5-fold CV
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]  # iloc: KFold yields positional indices, not labels
        valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestRegressor(n_jobs=-1, max_depth=20))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("Mean Squared Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
# Inspect Ridge's default hyperparameters
Ridge()
def lasso_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1.0)  # keep alpha strictly positive; alpha=0 is degenerate for Lasso
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Lasso(alpha=alpha, max_iter=max_iter)
def ridge_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1.0)
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Ridge(alpha=alpha, max_iter=max_iter)
def xgb_hparams_finder(trial):
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)  # eta is an alias of learning_rate, so tune only one
    tree_method = trial.suggest_categorical("tree_method", ["gpu_hist"])
    gamma = trial.suggest_float("gamma", 0, 1)
    return XGBRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        tree_method=tree_method,
        gamma=gamma
    )
def rf_hparams_finder(trial):
max_depth = trial.suggest_int("max_depth", 10, 50)
n_estimators = trial.suggest_int("n_estimators", 100, 300)
return RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
def lightgbm_hparams_finder(trial):
max_depth = trial.suggest_int("max_depth", 5, 30)
n_estimators = trial.suggest_int("n_estimators", 100, 300)
learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
reg_alpha = trial.suggest_float("reg_alpha", 0., 1)
reg_lambda = trial.suggest_float("reg_lambda", 0., 1)
return LGBMRegressor(max_depth=max_depth,
n_estimators=n_estimators,
learning_rate=learning_rate, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
def svr_hparams_finder(trial):
    # 'precomputed' is excluded: it requires a kernel matrix rather than raw features
    kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
    degree = trial.suggest_int("degree", 1, 4)
    c = trial.suggest_float("C", 1e-3, 1.0)  # C must be strictly positive
    max_iter = trial.suggest_int("max_iter", 50, 500)
    return SVR(kernel=kernel, degree=degree, C=c, max_iter=max_iter)
study = optuna.create_study()
study.optimize(partial(hyperparam_finder, train_df, lasso_hparams_finder),
               n_trials=100,
               show_progress_bar=True
)
lasso_params = study.best_params  # e.g. {'alpha': ..., 'max_iter': ...}
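# With tuned values in hand, one option (a sketch reusing the helper above) is to
# feed them straight back into train_kfold, whose **model_kwargs reach the model class.
lasso_sub_df, lasso_valid_preds = train_kfold(train_df, test_df, Lasso, **lasso_params)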
model_options = {
"ridge": ridge_hparams_finder,
"lasso": lasso_hparams_finder,
"xgb": xgb_hparams_finder,
"rf": rf_hparams_finder,
"ligtgbm": lightgbm_hparams_finder,
"svr": svr_hparams_finder
}
best_hparams = []
for model_name, model_hparams_fn in model_options.items():
    print(model_name)
    study = optuna.create_study()
    study.optimize(partial(hyperparam_finder_nocv, train_df, model_hparams_fn),
                   n_trials=20,
                   show_progress_bar=True
    )
best_hparams.append({
model_name: study.best_params
})
# Train the five base models with 5-fold CV
model1_test_preds, model1_valid_preds = train_kfold(train_df, test_df, Ridge)
# Let's try XGBoost in place of the linear model
model2_test_preds, model2_valid_preds = train_kfold(train_df, test_df, XGBRegressor, n_jobs=4, n_estimators=500, max_depth=20)
# Let's try Random Forest
model3_test_preds, model3_valid_preds = train_kfold(train_df, test_df, RandomForestRegressor, n_jobs=4, n_estimators=500, max_depth=20)
# Let's try SVR
model4_test_preds, model4_valid_preds = train_kfold(train_df, test_df, SVR)
# Let's try LightGBM
model5_test_preds, model5_valid_preds = train_kfold(train_df, test_df, LGBMRegressor, n_jobs=4, n_estimators=500, max_depth=20)
model1_valid_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_valid_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_valid_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_valid_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_valid_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
model1_test_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_test_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_test_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_test_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_test_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
sub_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model3_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model4_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model5_test_preds, left_on="Id", right_on="Id")
sub_df["SalePrice"] = (sub_df["model1_preds"] + sub_df["model2_preds"] + sub_df["model3_preds"] + sub_df["model4_preds"] + sub_df["model5_preds"])/5
sub_df[["Id", "SalePrice"]].to_csv("submission_model_blend.csv", index=False)
# Now let's do model stacking:
# take the out-of-fold validation predictions from the five base models,
# use them as features for a second-level model, and make a submission.
# (Averaging the predictions, as done above, is model blending.)
layer1_test_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model3_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model4_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model5_test_preds, left_on="Id", right_on="Id")
layer1_test_df.head()
layer1_train_df = pd.merge(model1_valid_preds, model2_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model3_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model4_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model5_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, train_df[["Id", "SalePrice"]], left_on="Id", right_on="Id")
layer1_train_df.head()
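# A quick diagnostic (a sketch): stacking helps most when the base models disagree,
# so inspect how correlated the out-of-fold predictions are.
oof_cols = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
layer1_train_df[oof_cols].corr()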
# Stacking often improves the leaderboard score, since the second-level model learns how much to trust each base model
# Train the second-level model with 5-fold CV
def train_model_stacking_kfold(df, test_df):
    """
    Train a second-level Linear Regression on the base models' predictions, with 5-fold CV
    """
    # A simple linear model is a reasonable choice for the second level
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
features = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
df = df.copy()
test_df = test_df.copy()
for feat in features:
df[feat] = np.log(df[feat])
test_df[feat] = np.log(test_df[feat])
df["SalePrice"] = np.log(df["SalePrice"])
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]
X_train = train_df[features]
y_train = train_df["SalePrice"]
X_valid = valid_df[features]
y_valid = valid_df["SalePrice"]
X_test = test_df[features]
clf = LinearRegression()
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
print("RMSE Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV
return sub_df
sub_df = train_model_stacking_kfold(layer1_train_df, layer1_test_df)
sub_df[["Id", "SalePrice"]].to_csv("submission_model_stack.csv", index=False)