# This Python environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor, XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, mean_squared_error

from functools import partial
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
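# Training below is done on log(SalePrice), so this helper applied to log targets
# effectively reproduces the competition's RMSLE (sklearn's mean_squared_log_error
# uses log1p, which is near-identical at house-price magnitudes).
# A quick sanity check with made-up prices:
y_true_demo = np.array([200000.0, 150000.0, 300000.0])
y_pred_demo = np.array([210000.0, 140000.0, 280000.0])
print(rmse(np.log(y_true_demo), np.log(y_pred_demo)))             # RMSE on the log scale
print(np.sqrt(mean_squared_log_error(y_true_demo, y_pred_demo)))  # sklearn RMSLE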
DATA_DIR = Path("/kaggle/input/house-prices-advanced-regression-techniques/")

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
sub_df = pd.read_csv(DATA_DIR / "sample_submission.csv")
train_df.head()
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

# Inspect dtypes and missing-value counts; later, get_features keeps only columns that are at least 50% non-null
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB

EDA

train_df["SalePrice"].hist(bins=50)
<AxesSubplot:>
# The target is right-skewed, so convert it to the log scale (the competition metric is RMSE on log(SalePrice))
np.log(train_df["SalePrice"]).hist(bins=50);
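# One way to quantify the effect of the log transform is skewness before vs after.
# A minimal sketch; scipy is assumed to be available (it ships with the Kaggle image).
from scipy.stats import skew
print("raw skew:", skew(train_df["SalePrice"]))            # strongly right-skewed
print("log skew:", skew(np.log(train_df["SalePrice"])))    # much closer to symmetric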
# Two sales are above $700K; maybe we should exclude them, since there is not much to gain from two data points
train_df.plot(kind="scatter", x="Id", y="SalePrice", alpha=0.25)
<AxesSubplot:xlabel='Id', ylabel='SalePrice'>
plt.figure(figsize=(12,10))
sns.heatmap(train_df.corr(), cmap='Greys');
train_df.corr()
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SalePrice
Id 1.000000 0.011156 -0.010601 -0.033226 -0.028365 0.012609 -0.012713 -0.021998 -0.050298 -0.005024 ... -0.029643 -0.000477 0.002889 -0.046635 0.001330 0.057044 -0.006242 0.021172 0.000712 -0.021917
MSSubClass 0.011156 1.000000 -0.386347 -0.139781 0.032628 -0.059316 0.027850 0.040581 0.022936 -0.069836 ... -0.012579 -0.006100 -0.012037 -0.043825 -0.026030 0.008283 -0.007683 -0.013585 -0.021407 -0.084284
LotFrontage -0.010601 -0.386347 1.000000 0.426095 0.251646 -0.059213 0.123349 0.088866 0.193458 0.233633 ... 0.088521 0.151972 0.010700 0.070029 0.041383 0.206167 0.003368 0.011200 0.007450 0.351799
LotArea -0.033226 -0.139781 0.426095 1.000000 0.105806 -0.005636 0.014228 0.013788 0.104160 0.214103 ... 0.171698 0.084774 -0.018340 0.020423 0.043160 0.077672 0.038068 0.001205 -0.014261 0.263843
OverallQual -0.028365 0.032628 0.251646 0.105806 1.000000 -0.091932 0.572323 0.550684 0.411876 0.239666 ... 0.238923 0.308819 -0.113937 0.030371 0.064886 0.065166 -0.031406 0.070815 -0.027347 0.790982
OverallCond 0.012609 -0.059316 -0.059213 -0.005636 -0.091932 1.000000 -0.375983 0.073741 -0.128101 -0.046231 ... -0.003334 -0.032589 0.070356 0.025504 0.054811 -0.001985 0.068777 -0.003511 0.043950 -0.077856
YearBuilt -0.012713 0.027850 0.123349 0.014228 0.572323 -0.375983 1.000000 0.592855 0.315707 0.249503 ... 0.224880 0.188686 -0.387268 0.031355 -0.050364 0.004950 -0.034383 0.012398 -0.013618 0.522897
YearRemodAdd -0.021998 0.040581 0.088866 0.013788 0.550684 0.073741 0.592855 1.000000 0.179618 0.128451 ... 0.205726 0.226298 -0.193919 0.045286 -0.038740 0.005829 -0.010286 0.021490 0.035743 0.507101
MasVnrArea -0.050298 0.022936 0.193458 0.104160 0.411876 -0.128101 0.315707 0.179618 1.000000 0.264736 ... 0.159718 0.125703 -0.110204 0.018796 0.061466 0.011723 -0.029815 -0.005965 -0.008201 0.477493
BsmtFinSF1 -0.005024 -0.069836 0.233633 0.214103 0.239666 -0.046231 0.249503 0.128451 0.264736 1.000000 ... 0.204306 0.111761 -0.102303 0.026451 0.062021 0.140491 0.003571 -0.015727 0.014359 0.386420
BsmtFinSF2 -0.005968 -0.065649 0.049900 0.111170 -0.059119 0.040229 -0.049107 -0.067759 -0.072319 -0.050117 ... 0.067898 0.003093 0.036543 -0.029993 0.088871 0.041709 0.004940 -0.015211 0.031706 -0.011378
BsmtUnfSF -0.007940 -0.140759 0.132644 -0.002618 0.308159 -0.136841 0.149040 0.181133 0.114442 -0.495251 ... -0.005316 0.129005 -0.002538 0.020764 -0.012579 -0.035092 -0.023837 0.034888 -0.041258 0.214479
TotalBsmtSF -0.015415 -0.238518 0.392075 0.260833 0.537808 -0.171098 0.391452 0.291066 0.363936 0.522396 ... 0.232019 0.247264 -0.095478 0.037384 0.084489 0.126053 -0.018479 0.013196 -0.014969 0.613581
1stFlrSF 0.010496 -0.251758 0.457181 0.299475 0.476224 -0.144203 0.281986 0.240379 0.344501 0.445863 ... 0.235459 0.211671 -0.065292 0.056104 0.088758 0.131525 -0.021096 0.031372 -0.013604 0.605852
2ndFlrSF 0.005590 0.307886 0.080177 0.050986 0.295493 0.028942 0.010308 0.140024 0.174561 -0.137079 ... 0.092165 0.208026 0.061989 -0.024358 0.040606 0.081487 0.016197 0.035164 -0.028700 0.319334
LowQualFinSF -0.044230 0.046474 0.038469 0.004779 -0.030429 0.025494 -0.183784 -0.062419 -0.069071 -0.064503 ... -0.025444 0.018251 0.061081 -0.004296 0.026799 0.062157 -0.003793 -0.022174 -0.028921 -0.025606
GrLivArea 0.008273 0.074853 0.402797 0.263116 0.593007 -0.079686 0.199010 0.287389 0.390857 0.208171 ... 0.247433 0.330224 0.009113 0.020643 0.101510 0.170205 -0.002416 0.050240 -0.036526 0.708624
BsmtFullBath 0.002289 0.003491 0.100949 0.158155 0.111098 -0.054942 0.187599 0.119470 0.085310 0.649212 ... 0.175315 0.067341 -0.049911 -0.000106 0.023148 0.067616 -0.023047 -0.025361 0.067049 0.227122
BsmtHalfBath -0.020155 -0.002333 -0.007234 0.048046 -0.040150 0.117821 -0.038162 -0.012337 0.026673 0.067418 ... 0.040161 -0.025324 -0.008555 0.035114 0.032121 0.020025 -0.007367 0.032873 -0.046524 -0.016844
FullBath 0.005587 0.131608 0.198769 0.126031 0.550600 -0.194149 0.468271 0.439046 0.276833 0.058543 ... 0.187703 0.259977 -0.115093 0.035353 -0.008106 0.049604 -0.014290 0.055872 -0.019669 0.560664
HalfBath 0.006784 0.177354 0.053532 0.014259 0.273458 -0.060769 0.242656 0.183331 0.201444 0.004262 ... 0.108080 0.199740 -0.095317 -0.004972 0.072426 0.022381 0.001290 -0.009050 -0.010269 0.284108
BedroomAbvGr 0.037719 -0.023438 0.263170 0.119690 0.101676 0.012980 -0.070651 -0.040581 0.102821 -0.107355 ... 0.046854 0.093810 0.041570 -0.024478 0.044300 0.070703 0.007767 0.046544 -0.036014 0.168213
KitchenAbvGr 0.002951 0.281721 -0.006069 -0.017784 -0.183882 -0.087001 -0.174800 -0.149598 -0.037610 -0.081007 ... -0.090130 -0.070091 0.037312 -0.024600 -0.051613 -0.014525 0.062341 0.026589 0.031687 -0.135907
TotRmsAbvGrd 0.027239 0.040380 0.352096 0.190015 0.427452 -0.057583 0.095589 0.191740 0.280682 0.044316 ... 0.165984 0.234192 0.004151 -0.006683 0.059383 0.083757 0.024763 0.036907 -0.034516 0.533723
Fireplaces -0.019772 -0.045569 0.266639 0.271364 0.396765 -0.023820 0.147716 0.112581 0.249070 0.260011 ... 0.200019 0.169405 -0.024822 0.011257 0.184530 0.095074 0.001409 0.046357 -0.024096 0.466929
GarageYrBlt 0.000072 0.085072 0.070250 -0.024947 0.547766 -0.324297 0.825667 0.642277 0.252691 0.153484 ... 0.224577 0.228425 -0.297003 0.023544 -0.075418 -0.014501 -0.032417 0.005337 -0.001014 0.486362
GarageCars 0.016570 -0.040110 0.285691 0.154871 0.600671 -0.185758 0.537850 0.420622 0.364204 0.224054 ... 0.226342 0.213569 -0.151434 0.035765 0.050494 0.020934 -0.043080 0.040522 -0.039117 0.640409
GarageArea 0.017634 -0.098672 0.344997 0.180403 0.562022 -0.151521 0.478954 0.371600 0.373066 0.296970 ... 0.224666 0.241435 -0.121777 0.035087 0.051412 0.061047 -0.027400 0.027974 -0.027378 0.623431
WoodDeckSF -0.029643 -0.012579 0.088521 0.171698 0.238923 -0.003334 0.224880 0.205726 0.159718 0.204306 ... 1.000000 0.058661 -0.125989 -0.032771 -0.074181 0.073378 -0.009551 0.021011 0.022270 0.324413
OpenPorchSF -0.000477 -0.006100 0.151972 0.084774 0.308819 -0.032589 0.188686 0.226298 0.125703 0.111761 ... 0.058661 1.000000 -0.093079 -0.005842 0.074304 0.060762 -0.018584 0.071255 -0.057619 0.315856
EnclosedPorch 0.002889 -0.012037 0.010700 -0.018340 -0.113937 0.070356 -0.387268 -0.193919 -0.110204 -0.102303 ... -0.125989 -0.093079 1.000000 -0.037305 -0.082864 0.054203 0.018361 -0.028887 -0.009916 -0.128578
3SsnPorch -0.046635 -0.043825 0.070029 0.020423 0.030371 0.025504 0.031355 0.045286 0.018796 0.026451 ... -0.032771 -0.005842 -0.037305 1.000000 -0.031436 -0.007992 0.000354 0.029474 0.018645 0.044584
ScreenPorch 0.001330 -0.026030 0.041383 0.043160 0.064886 0.054811 -0.050364 -0.038740 0.061466 0.062021 ... -0.074181 0.074304 -0.082864 -0.031436 1.000000 0.051307 0.031946 0.023217 0.010694 0.111447
PoolArea 0.057044 0.008283 0.206167 0.077672 0.065166 -0.001985 0.004950 0.005829 0.011723 0.140491 ... 0.073378 0.060762 0.054203 -0.007992 0.051307 1.000000 0.029669 -0.033737 -0.059689 0.092404
MiscVal -0.006242 -0.007683 0.003368 0.038068 -0.031406 0.068777 -0.034383 -0.010286 -0.029815 0.003571 ... -0.009551 -0.018584 0.018361 0.000354 0.031946 0.029669 1.000000 -0.006495 0.004906 -0.021190
MoSold 0.021172 -0.013585 0.011200 0.001205 0.070815 -0.003511 0.012398 0.021490 -0.005965 -0.015727 ... 0.021011 0.071255 -0.028887 0.029474 0.023217 -0.033737 -0.006495 1.000000 -0.145721 0.046432
YrSold 0.000712 -0.021407 0.007450 -0.014261 -0.027347 0.043950 -0.013618 0.035743 -0.008201 0.014359 ... 0.022270 -0.057619 -0.009916 0.018645 0.010694 -0.059689 0.004906 -0.145721 1.000000 -0.028923
SalePrice -0.021917 -0.084284 0.351799 0.263843 0.790982 -0.077856 0.522897 0.507101 0.477493 0.386420 ... 0.324413 0.315856 -0.128578 0.044584 0.111447 0.092404 -0.021190 0.046432 -0.028923 1.000000

38 rows × 38 columns

corr_cols = train_df.corr()["SalePrice"].nlargest(15).index
corr_cols
Index(['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
       'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
       'YearRemodAdd', 'GarageYrBlt', 'MasVnrArea', 'Fireplaces',
       'BsmtFinSF1'],
      dtype='object')
plt.figure(figsize=(10, 6))
sns.heatmap(train_df.loc[:, corr_cols].corr(), annot=True, cmap="gray")
<AxesSubplot:>
# Overall Quality has big impact on SalePrice
train_df.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.25)
<AxesSubplot:xlabel='OverallQual', ylabel='SalePrice'>
# GrLivArea is also strongly correlated with SalePrice.
# There are only four data points over 4,000 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.25)
<AxesSubplot:xlabel='GrLivArea', ylabel='SalePrice'>
train_df[train_df.GrLivArea > 4000]
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
523 524 60 RL 130.0 40094 Pave NaN IR1 Bnk AllPub ... 0 NaN NaN NaN 0 10 2007 New Partial 184750
691 692 60 RL 104.0 21535 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 1 2007 WD Normal 755000
1182 1183 60 RL 160.0 15623 Pave NaN IR1 Lvl AllPub ... 555 Ex MnPrv NaN 0 7 2007 WD Abnorml 745000
1298 1299 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub ... 480 Gd NaN NaN 0 1 2008 New Partial 160000

4 rows × 81 columns

# GarageArea is also strongly correlated with SalePrice.
# Only five houses have garages over 1,200 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GarageArea", y="SalePrice", alpha=0.25)
<AxesSubplot:xlabel='GarageArea', ylabel='SalePrice'>
train_df[train_df.GarageArea > 1200]
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
581 582 20 RL 98.0 12704 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 8 2009 New Partial 253293
825 826 20 RL 114.0 14803 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 6 2008 New Partial 385000
1061 1062 30 C (all) 120.0 18000 Grvl NaN Reg Low AllPub ... 0 NaN NaN Shed 560 8 2008 ConLD Normal 81000
1190 1191 190 RL NaN 32463 Pave NaN Reg Low AllPub ... 0 NaN NaN NaN 0 3 2007 WD Normal 168000
1298 1299 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub ... 480 Gd NaN NaN 0 1 2008 New Partial 160000

5 rows × 81 columns

train_df = train_df[train_df["SalePrice"] < 700000]
def get_features(train_df):
    num_features, cat_features = [], []
    for col in train_df.columns:
        if col in ["Id", "SalePrice"]:
            continue
        dtype = train_df[col].dtype
        ratio = pd.notna(train_df[col]).sum() / len(train_df[col])
        if ratio < 0.5:
            continue
        if dtype == "object":
            cat_features.append(col)
        else:
            num_features.append(col)
    return num_features, cat_features
num_features, cat_features = get_features(train_df)
cat_features
['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']
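# The 50% non-null threshold in get_features silently drops the sparsest columns.
# A quick sketch (reusing the lists above) to see exactly which ones were excluded:
kept_cols = set(num_features + cat_features)
dropped_cols = [c for c in train_df.columns
                if c not in kept_cols and c not in ("Id", "SalePrice")]
print(dropped_cols)  # mostly-missing columns such as Alley, PoolQC, Fence, MiscFeature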
def get_preprocess_pipeline(train_df, sample_features=False):
    # Get Numeric and Categorical Features
    numeric_features, categorical_features = get_features(train_df)
    target = "SalePrice"
    if sample_features:
        numeric_features = ["LotArea"]
        categorical_features = ["SaleType", "SaleCondition"]
    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    return preprocessor, numeric_features + categorical_features, target
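# Before wiring the preprocessor into a model, it can help to check the transformed
# feature-matrix shape: the one-hot step expands the raw columns into a few hundred.
# A minimal sketch:
prep_demo, feats_demo, _ = get_preprocess_pipeline(train_df)
X_demo = prep_demo.fit_transform(train_df[feats_demo])
print(X_demo.shape)  # (rows, numeric columns + one-hot columns)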
def train_LR(train_df, test_df, sample_features=False):
    """
    Train a Linear Regression Model 
    """
    # Start with simple linear Model
    preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=sample_features)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', LinearRegression())])
    X_train = train_df[features]
    y_train = np.log(train_df[target])
    X_test = test_df[features]
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    
    clf.fit(X_train, y_train)
    print("RMSE Log Error", rmse(clf.predict(X_valid), y_valid))
    # At prediction time, apply exp to invert the log transform used during training
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.exp(clf.predict(X_test))
    })
    return sub_df
sub_df = train_LR(train_df, test_df, sample_features=True)
sub_df.to_csv("submission_lr_sample.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from right side (data -> output)
RMSE Log Error 0.3749878871754616
sub_df = train_LR(train_df, test_df, sample_features=False)
sub_df.to_csv("submission_lr.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from right side (data -> output)
RMSE Log Error 0.13212079796339762
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for p1, p2 in kfold.split(range(20)):
    print(p1, p2)
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19] [ 0 17]
[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 16 17 18 19] [ 1 15]
[ 0  1  2  3  4  6  7  9 10 11 12 13 14 15 16 17 18 19] [5 8]
[ 0  1  2  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19] [ 3 11]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 19] [16 18]
[ 0  1  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 19] [ 2 13]
[ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18] [ 9 19]
[ 0  1  2  3  5  6  7  8  9 10 11 13 14 15 16 17 18 19] [ 4 12]
[ 0  1  2  3  4  5  6  8  9 11 12 13 14 15 16 17 18 19] [ 7 10]
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 15 16 17 18 19] [ 6 14]
kfold = KFold(n_splits=7, shuffle=True, random_state=42)
for idxs in kfold.split(train_df):
    print(idxs[0].shape, idxs[1].shape)
(1249,) (209,)
(1249,) (209,)
(1250,) (208,)
(1250,) (208,)
(1250,) (208,)
(1250,) (208,)
(1250,) (208,)
idxs
(array([   0,    2,    3, ..., 1454, 1456, 1457]),
 array([   1,   13,   14,   20,   21,   34,   40,   64,   87,   91,   95,
          98,  112,  121,  122,  130,  134,  143,  161,  166,  187,  189,
         197,  200,  201,  202,  205,  206,  216,  219,  230,  241,  246,
         252,  269,  276,  279,  288,  293,  295,  315,  317,  330,  337,
         343,  378,  379,  384,  385,  387,  391,  392,  397,  400,  401,
         406,  418,  441,  455,  459,  466,  474,  484,  492,  502,  508,
         510,  520,  524,  540,  556,  562,  563,  564,  565,  577,  592,
         600,  608,  612,  627,  632,  641,  642,  645,  646,  647,  648,
         663,  681,  683,  686,  698,  699,  702,  709,  719,  725,  729,
         742,  747,  748,  751,  763,  766,  769,  775,  776,  779,  791,
         794,  795,  804,  805,  815,  831,  835,  840,  854,  856,  860,
         863,  870,  871,  878,  883,  891,  897,  929,  951,  955,  957,
         960,  972,  975,  981,  995, 1012, 1016, 1017, 1020, 1021, 1025,
        1028, 1038, 1044, 1051, 1056, 1059, 1064, 1071, 1076, 1082, 1086,
        1095, 1104, 1109, 1123, 1126, 1129, 1130, 1136, 1152, 1153, 1154,
        1162, 1180, 1183, 1184, 1194, 1207, 1215, 1238, 1241, 1248, 1254,
        1256, 1257, 1264, 1266, 1267, 1275, 1281, 1282, 1286, 1294, 1297,
        1316, 1327, 1332, 1335, 1337, 1339, 1346, 1349, 1363, 1369, 1370,
        1378, 1384, 1390, 1391, 1396, 1420, 1421, 1435, 1437, 1455]))
def hyperparam_finder_nocv(df, model_fn, trial):
    """
    Hyperparameter Finder
    """
    # Start with simple linear Model
    model = model_fn(trial)
    preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    X_train = train_df[features]
    y_train = np.log(train_df[target])
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    
    clf.fit(X_train, y_train)
    return rmse(clf.predict(X_valid), y_valid)
def hyperparam_finder(df, model_fn, trial):
    """
    Hyperparameter Finder
    """
    # Start with simple linear Model
    model = model_fn(trial)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        X_test = test_df[features]
        clf.fit(X_train, y_train)
        y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
    # Return Valid Pred Score for HyperParam Tuning
    return np.mean(valid_errors)
# Now let's do K-Fold CV (5 folds)
def train_kfold(df, test_df, ModelClass, **model_kwargs):
    """
    Train a Regression Model with 5 Fold CV
    """
    # Fit a fresh ModelClass instance on each fold; collect out-of-fold and test predictions
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', ModelClass(**model_kwargs))])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        X_test = test_df[features]
        clf.fit(X_train, y_train)
        y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
        test_preds.append(np.exp(clf.predict(X_test)))
        valid_preds.append(pd.DataFrame({
            "Id": valid_df["Id"],
            "SalePrice": np.exp(y_valid_preds)
        }))

    print("RMSE Log Error", np.mean(valid_errors))
    # At prediction time, apply exp to invert the log transform used during training
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return test prediction with CV and the Validation Prediction (For Stacking later)
    return sub_df, pd.concat(valid_preds)
model1_sub_df, model1_valid_preds = train_kfold(train_df, test_df, LinearRegression)
model1_sub_df.to_csv("submission_lr_kfold.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from right side (data -> output)
# Score might have improved over the LR without kfold.
RMSE Log Error 0.15228419074391034
def train_XGB_kfold(df, test_df):
    """
    Train a XGBoost Model with 5 Fold CV
    """
    # 5-fold CV with an XGBoost regressor
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', XGBRegressor(n_jobs=-1, n_estimators=500, max_depth=20))])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        X_test = test_df[features]
        clf.fit(X_train, y_train)
        y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
        test_preds.append(np.exp(clf.predict(X_test)))
        valid_preds.append(pd.DataFrame({
            "Id": valid_df["Id"],
            "SalePrice": np.exp(y_valid_preds)
        }))

    print("Mean Squared Log Error", np.mean(valid_errors))
    # At prediction time, apply exp to invert the log transform used during training
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return test prediction with CV and the Validation Prediction (For Stacking later)
    return sub_df, pd.concat(valid_preds)

def train_RF_kfold(df, test_df):
    """
    Train a RF Model with 5 Fold CV
    """
    # 5-fold CV with a RandomForest regressor
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestRegressor(n_jobs=-1, max_depth=20))])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        X_test = test_df[features]
        clf.fit(X_train, y_train)
        y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
        test_preds.append(np.exp(clf.predict(X_test)))
        valid_preds.append(pd.DataFrame({
            "Id": valid_df["Id"],
            "SalePrice": np.exp(y_valid_preds)
        }))

    print("Mean Squared Log Error", np.mean(valid_errors))
    # At prediction time, apply exp to invert the log transform used during training
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return test prediction with CV and the Validation Prediction (For Stacking later)
    return sub_df, pd.concat(valid_preds)
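# Neither helper above is actually invoked in this notebook (the generic train_kfold
# covers both models), but they can be run standalone; an illustrative call:
xgb_sub_df, xgb_oof_preds = train_XGB_kfold(train_df, test_df)
rf_sub_df, rf_oof_preds = train_RF_kfold(train_df, test_df)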
Ridge()
def lasso_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 0, 1.0)
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Lasso(alpha=alpha, max_iter=max_iter)

def ridge_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 0, 1.0)
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Ridge(alpha=alpha, max_iter=max_iter)

def xgb_hparams_finder(trial):
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
    tree_method = trial.suggest_categorical("tree_method", ["gpu_hist"])
    gamma = trial.suggest_float("gamma", 0, 1)
    eta = trial.suggest_float("eta", 0, 1)
    return XGBRegressor(
        max_depth=max_depth, 
        n_estimators=n_estimators, 
        learning_rate=learning_rate, 
        tree_method=tree_method, 
        gamma=gamma, 
        eta=eta
    )

def rf_hparams_finder(trial):
    max_depth = trial.suggest_int("max_depth", 10, 50)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    return RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)

def lightgbm_hparams_finder(trial):
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
    reg_alpha = trial.suggest_float("reg_alpha", 0., 1)
    reg_lambda = trial.suggest_float("reg_lambda", 0., 1)
    return LGBMRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda
    )

def svr_hparams_finder(trial):
    # 'precomputed' is omitted: it would require a precomputed kernel matrix
    kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
    degree = trial.suggest_int("degree", 1, 4)
    c = trial.suggest_float("c", 1e-3, 1.0)
    max_iter = trial.suggest_int("max_iter", 50, 500)
    return SVR(kernel=kernel, degree=degree, C=c, max_iter=max_iter)
study = optuna.create_study()
study.optimize(partial(hyperparam_finder, train_df, lasso_hparams_finder), 
               n_trials=100, 
               show_progress_bar=True
              )
lasso_params = study.best_params  # dict of the best alpha and max_iter found
/opt/conda/lib/python3.7/site-packages/optuna/progress_bar.py:47: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
  self._init_valid()
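# With the study finished, the tuned hyperparameters can be fed straight back into the
# generic CV trainer (train_kfold accepts the model class plus keyword arguments).
# A minimal sketch, not run here:
print(study.best_value)  # mean fold RMSE (log scale) of the best trial
tuned_lasso_sub, tuned_lasso_oof = train_kfold(train_df, test_df, Lasso, **lasso_params)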
model_options = {
    "ridge": ridge_hparams_finder,
    "lasso": lasso_hparams_finder,
    "xgb": xgb_hparams_finder,
    "rf": rf_hparams_finder,
    "lightgbm": lightgbm_hparams_finder,
    "svr": svr_hparams_finder
}
best_hparams = []
for model_name, model_hparams_fn in model_options.items():
    print(model_name)
    study = optuna.create_study()
    study.optimize(partial(hyperparam_finder_nocv, train_df, model_hparams_fn), 
                   n_trials=20, 
                   show_progress_bar=True
                  )

    best_hparams.append({
        model_name: study.best_params
    })
ridge
/opt/conda/lib/python3.7/site-packages/optuna/progress_bar.py:47: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
  self._init_valid()
lasso
/opt/conda/lib/python3.7/site-packages/optuna/progress_bar.py:47: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
  self._init_valid()
xgb
/opt/conda/lib/python3.7/site-packages/optuna/progress_bar.py:47: ExperimentalWarning: Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
  self._init_valid()
model1_test_preds, model1_valid_preds = train_kfold(train_df, test_df, Ridge)

# Let's try XGBoost instead of the linear model
model2_test_preds, model2_valid_preds = train_kfold(train_df, test_df,  XGBRegressor, n_jobs=4, n_estimators=500, max_depth=20)

# Let's try a RandomForest instead of the linear model
model3_test_preds, model3_valid_preds = train_kfold(train_df, test_df, RandomForestRegressor,n_jobs=4, n_estimators=500, max_depth=20)

# Let's try an SVR
model4_test_preds, model4_valid_preds = train_kfold(train_df, test_df,  SVR)

# Let's try LightGBM
model5_test_preds, model5_valid_preds = train_kfold(train_df, test_df,  LGBMRegressor, n_jobs=4, n_estimators=500, max_depth=20)
RMSE Log Error 0.14397705141849326
RMSE Log Error 0.14641944560849735
RMSE Log Error 0.14289517901441579
RMSE Log Error 0.13849838128051936
RMSE Log Error 0.12958699005124785
model1_valid_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_valid_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_valid_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_valid_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_valid_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
model1_test_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_test_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_test_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_test_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_test_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
Id model1_preds model2_preds
0 1461 117713.829211 124097.960938
1 1462 146528.150506 162885.296875
2 1463 177353.856048 179494.609375
3 1464 197262.388246 189156.562500
4 1465 198261.170350 191763.968750
... ... ... ...
1454 2915 82789.895485 82117.929688
1455 2916 84069.656304 83893.187500
1456 2917 163975.093345 169720.000000
1457 2918 113470.524225 113896.640625
1458 2919 218925.672950 231103.156250

1459 rows × 3 columns

sub_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model3_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model4_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model5_test_preds, left_on="Id", right_on="Id")
sub_df["SalePrice"] = (sub_df["model1_preds"] + sub_df["model2_preds"] + sub_df["model3_preds"] + sub_df["model4_preds"] + sub_df["model5_preds"])/5
sub_df[["Id", "SalePrice"]].to_csv("submission_model_blend.csv", index=False)
# Now let's do model stacking:
# use the out-of-fold validation predictions of the five base models above as features
# for a second-level (meta) model, and apply the same meta-model to the averaged test predictions.
layer1_test_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model3_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model4_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model5_test_preds, left_on="Id", right_on="Id")

layer1_test_df.head()
Id model1_preds model2_preds model3_preds model4_preds model5_preds
0 1461 117713.829211 124097.960938 126180.720900 119344.635252 125128.727896
1 1462 146528.150506 162885.296875 153965.312504 182667.415871 150943.897192
2 1463 177353.856048 179494.609375 177589.522503 180658.023391 187343.404402
3 1464 197262.388246 189156.562500 182245.150263 194350.575145 185932.924479
4 1465 198261.170350 191763.968750 196898.030631 189742.998719 184235.322421
layer1_train_df = pd.merge(model1_valid_preds, model2_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model3_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model4_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model5_valid_preds, left_on="Id", right_on="Id")

layer1_train_df = pd.merge(layer1_train_df, train_df[["Id", "SalePrice"]], left_on="Id", right_on="Id")
layer1_train_df.head()
Id model1_preds model2_preds model3_preds model4_preds model5_preds SalePrice
0 16 139940.327034 133994.156250 156806.829127 134248.224855 151408.259753 132000
1 24 132404.159793 143172.593750 140933.363142 143026.241725 133380.441989 129900
2 30 78196.145620 72757.703125 74290.411810 77943.702050 75490.226312 68500
3 31 65289.900643 85316.335938 95203.013404 85049.653410 92450.252907 40000
4 33 201075.364368 195446.609375 206302.013767 182591.810407 188616.393510 179900
# The blended submission above should already improve your leaderboard score.
# Now let's train the stacking meta-model with K-Fold CV (5 folds)
def train_model_stacking_kfold(df, test_df):
    """
    Train a Linear Regression Model with 5 Fold CV
    """
    # Start with simple linear Model
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    features = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
    df = df.copy()
    test_df = test_df.copy()
    for feat in features:
        df[feat] = np.log(df[feat])
        test_df[feat] = np.log(test_df[feat])
    df["SalePrice"] = np.log(df["SalePrice"])
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]
        X_train = train_df[features]
        y_train = train_df["SalePrice"]
        X_valid = valid_df[features]
        y_valid = valid_df["SalePrice"]
        X_test = test_df[features]
        clf = LinearRegression()
        clf.fit(X_train, y_train)
        y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
        test_preds.append(np.exp(clf.predict(X_test)))

    print("RMSE Log Error", np.mean(valid_errors))
    # At prediction time, apply exp to invert the log transform used during training
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return test prediction with CV
    return sub_df
sub_df = train_model_stacking_kfold(layer1_train_df, layer1_test_df)
sub_df[["Id", "SalePrice"]].to_csv("submission_model_stack.csv", index=False)
RMSE Log Error 0.12191138856220185
import lightgbm