Kaggle Housing Prices
A starter notebook for the Kaggle "House Prices: Advanced Regression Techniques" competition.
# This environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from functools import partial
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
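# Sanity check (a sketch with illustrative values): RMSE computed on log1p-transformed
# targets coincides with the RMSLE given by sklearn's mean_squared_log_error.
_y_true = np.array([100000., 200000., 150000.])
_y_pred = np.array([110000., 190000., 140000.])
assert np.isclose(np.sqrt(mean_squared_log_error(_y_true, _y_pred)),
                  rmse(np.log1p(_y_true), np.log1p(_y_pred)))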
DATA_DIR = Path("/kaggle/input/house-prices-advanced-regression-techniques/")
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
sub_df = pd.read_csv(DATA_DIR / "sample_submission.csv")
train_df.head()
# Inspect dtypes and null counts; later we will keep only columns that are at least 50% non-null
train_df.info()
train_df["SalePrice"].hist(bins=50)
# SalePrice is right-skewed, so we model log(SalePrice); this also matches the competition metric, which is RMSE on the log of the price
np.log(train_df["SalePrice"]).hist(bins=50);
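# Rough check of the transform (a sketch; assumes scipy, which the Kaggle image ships):
# skewness should drop substantially after taking the log.
from scipy.stats import skew
print("raw skew:", skew(train_df["SalePrice"]))
print("log skew:", skew(np.log(train_df["SalePrice"])))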
# Maybe we should exclude the two extreme prices from the dataset, since there's not much we can gain from two data points
train_df.plot(kind="scatter", x="Id", y="SalePrice", alpha=0.25)
plt.figure(figsize=(12,10))
num_corr = train_df.select_dtypes(include=np.number).corr()  # restrict to numeric columns so corr() works across pandas versions
sns.heatmap(num_corr, cmap='Greys');
num_corr
corr_cols = num_corr["SalePrice"].nlargest(15).index
corr_cols
plt.figure(figsize=(10, 6))
sns.heatmap(train_df.loc[:, corr_cols].corr(), annot=True, cmap="gray")
# Overall quality (OverallQual) has a big impact on SalePrice
train_df.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.25)
# GrLivArea is also strongly correlated.
# There are only four data points over 4,000 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.25)
train_df[train_df.GrLivArea > 4000]
# GarageArea is strongly correlated too.
# Only a handful of data points exceed 1,200 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GarageArea", y="SalePrice", alpha=0.25)
train_df[train_df.GarageArea > 1200]
# Drop the two extreme sale prices spotted above; reset the index so positional indexing stays aligned
train_df = train_df[train_df["SalePrice"] < 700000].reset_index(drop=True)
def get_features(train_df):
    """Split columns into numeric and categorical features, skipping columns that are less than 50% non-null."""
    num_features, cat_features = [], []
    for col in train_df.columns:
if col in ["Id", "SalePrice"]:
continue
dtype = train_df[col].dtype
ratio = pd.notna(train_df[col]).sum() / len(train_df[col])
if ratio < 0.5:
continue
if dtype == "object":
cat_features.append(col)
else:
num_features.append(col)
return num_features, cat_features
num_features, cat_features = get_features(train_df)
cat_features
def get_preprocess_pipeline(train_df, sample_features=False):
# Get Numeric and Categorical Features
numeric_features, categorical_features = get_features(train_df)
target = "SalePrice"
if sample_features:
numeric_features = ["LotArea"]
categorical_features = ["SaleType", "SaleCondition"]
numeric_transformer = Pipeline(steps=[
('imputer', KNNImputer(n_neighbors=5)),
('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
return preprocessor, numeric_features + categorical_features, target
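# Quick sanity check of the pipeline (a sketch): fit the preprocessor alone and
# inspect the transformed width (numeric columns + one-hot expanded categories).
_preprocessor, _features, _ = get_preprocess_pipeline(train_df)
print(_preprocessor.fit_transform(train_df[_features]).shape)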
def train_LR(train_df, test_df, sample_features=False):
"""
Train a Linear Regression Model
"""
    # Start with a simple linear model
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=sample_features)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LinearRegression())])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_test = test_df[features]
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)
print("RMSE Log Error", rmse(clf.predict(X_valid), y_valid))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.exp(clf.predict(X_test))
})
return sub_df
sub_df = train_LR(train_df, test_df, sample_features=True)
sub_df.to_csv("submission_lr_sample.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
sub_df = train_LR(train_df, test_df, sample_features=False)
sub_df.to_csv("submission_lr.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
# Quick KFold demo: each split yields (train_indices, valid_indices)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for p1, p2 in kfold.split(range(20)):
    print(p1, p2)
kfold = KFold(n_splits=7, shuffle=True, random_state=42)
for idxs in kfold.split(train_df):
print(idxs[0].shape, idxs[1].shape)
idxs
def hyperparam_finder_nocv(df, model_fn, trial):
    """
    Optuna objective using a single train/validation split (no CV)
    """
    model = model_fn(trial)
    preprocessor, features, target = get_preprocess_pipeline(df, sample_features=False)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    X_train = df[features]
    y_train = np.log(df[target])
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    return rmse(clf.predict(X_valid), y_valid)
def hyperparam_finder(df, model_fn, trial):
    """
    Optuna objective using 5-fold CV; returns the mean validation RMSE (log scale)
    """
    model = model_fn(trial)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]
        preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        X_train = train_df[features]
        y_train = np.log(train_df[target])
        X_valid = valid_df[features]
        y_valid = np.log(valid_df[target])
        clf.fit(X_train, y_train)
        valid_errors.append(rmse(clf.predict(X_valid), y_valid))
    # Return the mean validation score for the hyperparameter search
    return np.mean(valid_errors)
# Now let's train with 5-fold CV
def train_kfold(df, test_df, ModelClass, **model_kwargs):
    """
    Train a regression model with 5-fold CV; returns the averaged test predictions
    and the out-of-fold validation predictions (for stacking later)
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
for train_idxs, valid_idxs in kfold.split(df):
train_df = df.iloc[train_idxs]
valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', ModelClass(**model_kwargs))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("RMSE Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
model1_sub_df, model1_valid_preds = train_kfold(train_df, test_df, LinearRegression)
model1_sub_df.to_csv("submission_lr_kfold.csv", index=False)
# To submit, download the file from the notebook's Output pane (Data -> Output) and upload it to the competition
# The score may improve over the single-split LR above.
def train_XGB_kfold(df, test_df):
    """
    Train an XGBoost model with 5-fold CV
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]  # iloc: KFold yields positional indices, not labels
        valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', XGBRegressor(n_jobs=-1, n_estimators=500, max_depth=20))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("Mean Squared Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
def train_RF_kfold(df, test_df):
    """
    Train a Random Forest model with 5-fold CV
    """
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]  # iloc: KFold yields positional indices, not labels
        valid_df = df.iloc[valid_idxs]
preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=False)
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestRegressor(n_jobs=-1, max_depth=20))])
X_train = train_df[features]
y_train = np.log(train_df[target])
X_valid = valid_df[features]
y_valid = np.log(valid_df[target])
X_test = test_df[features]
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
        valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
valid_preds.append(pd.DataFrame({
"Id": valid_df["Id"],
"SalePrice": np.exp(y_valid_preds)
}))
print("Mean Squared Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV and the Validation Prediction (For Stacking later)
return sub_df, pd.concat(valid_preds)
# Inspect Ridge's default hyperparameters
Ridge()
def lasso_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1.0)  # keep alpha strictly positive; alpha=0 is degenerate for Lasso
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Lasso(alpha=alpha, max_iter=max_iter)
def ridge_hparams_finder(trial):
    alpha = trial.suggest_float("alpha", 1e-4, 1.0)
    max_iter = trial.suggest_int("max_iter", 500, 5000)
    return Ridge(alpha=alpha, max_iter=max_iter)
def xgb_hparams_finder(trial):
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)  # eta is an alias of learning_rate, so tune only one
    tree_method = trial.suggest_categorical("tree_method", ["gpu_hist"])
    gamma = trial.suggest_float("gamma", 0, 1)
    return XGBRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        tree_method=tree_method,
        gamma=gamma
    )
def rf_hparams_finder(trial):
max_depth = trial.suggest_int("max_depth", 10, 50)
n_estimators = trial.suggest_int("n_estimators", 100, 300)
return RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
def lightgbm_hparams_finder(trial):
max_depth = trial.suggest_int("max_depth", 5, 30)
n_estimators = trial.suggest_int("n_estimators", 100, 300)
learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
reg_alpha = trial.suggest_float("reg_alpha", 0., 1)
reg_lambda = trial.suggest_float("reg_lambda", 0., 1)
return LGBMRegressor(max_depth=max_depth,
n_estimators=n_estimators,
learning_rate=learning_rate, reg_alpha=reg_alpha, reg_lambda=reg_lambda)
def svr_hparams_finder(trial):
    # 'precomputed' is excluded: it requires a kernel matrix rather than raw features
    kernel = trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid'])
    degree = trial.suggest_int("degree", 1, 4)
    c = trial.suggest_float("C", 1e-3, 1.0)  # C must be strictly positive
    max_iter = trial.suggest_int("max_iter", 50, 500)
    return SVR(kernel=kernel, degree=degree, C=c, max_iter=max_iter)
study = optuna.create_study()
study.optimize(partial(hyperparam_finder, train_df, lasso_hparams_finder),
               n_trials=100,
               show_progress_bar=True
)
lasso_params = study.best_params  # e.g. {'alpha': ..., 'max_iter': ...}
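# With tuned values in hand, one option (a sketch reusing the helper above) is to
# feed them straight back into train_kfold, whose **model_kwargs reach the model class.
lasso_sub_df, lasso_valid_preds = train_kfold(train_df, test_df, Lasso, **lasso_params)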
model_options = {
"ridge": ridge_hparams_finder,
"lasso": lasso_hparams_finder,
"xgb": xgb_hparams_finder,
"rf": rf_hparams_finder,
"ligtgbm": lightgbm_hparams_finder,
"svr": svr_hparams_finder
}
best_hparams = []
for model_name, model_hparams_fn in model_options.items():
    print(model_name)
    study = optuna.create_study()
    study.optimize(partial(hyperparam_finder_nocv, train_df, model_hparams_fn),
                   n_trials=20,
                   show_progress_bar=True
    )
best_hparams.append({
model_name: study.best_params
})
# Train the five base models with 5-fold CV
model1_test_preds, model1_valid_preds = train_kfold(train_df, test_df, Ridge)
# Let's try XGBoost in place of the linear model
model2_test_preds, model2_valid_preds = train_kfold(train_df, test_df, XGBRegressor, n_jobs=4, n_estimators=500, max_depth=20)
# Let's try Random Forest
model3_test_preds, model3_valid_preds = train_kfold(train_df, test_df, RandomForestRegressor, n_jobs=4, n_estimators=500, max_depth=20)
# Let's try SVR
model4_test_preds, model4_valid_preds = train_kfold(train_df, test_df, SVR)
# Let's try LightGBM
model5_test_preds, model5_valid_preds = train_kfold(train_df, test_df, LGBMRegressor, n_jobs=4, n_estimators=500, max_depth=20)
model1_valid_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_valid_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_valid_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_valid_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_valid_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
model1_test_preds.rename(columns={"SalePrice": "model1_preds"}, inplace=True)
model2_test_preds.rename(columns={"SalePrice": "model2_preds"}, inplace=True)
model3_test_preds.rename(columns={"SalePrice": "model3_preds"}, inplace=True)
model4_test_preds.rename(columns={"SalePrice": "model4_preds"}, inplace=True)
model5_test_preds.rename(columns={"SalePrice": "model5_preds"}, inplace=True)
sub_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model3_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model4_test_preds, left_on="Id", right_on="Id")
sub_df = pd.merge(sub_df, model5_test_preds, left_on="Id", right_on="Id")
sub_df["SalePrice"] = (sub_df["model1_preds"] + sub_df["model2_preds"] + sub_df["model3_preds"] + sub_df["model4_preds"] + sub_df["model5_preds"])/5
sub_df[["Id", "SalePrice"]].to_csv("submission_model_blend.csv", index=False)
# Now let's do model stacking:
# take the out-of-fold validation predictions from the five base models,
# use them as features for a second-level model, and make a submission.
# (Averaging the predictions, as done above, is model blending.)
layer1_test_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model3_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model4_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model5_test_preds, left_on="Id", right_on="Id")
layer1_test_df.head()
layer1_train_df = pd.merge(model1_valid_preds, model2_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model3_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model4_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, model5_valid_preds, left_on="Id", right_on="Id")
layer1_train_df = pd.merge(layer1_train_df, train_df[["Id", "SalePrice"]], left_on="Id", right_on="Id")
layer1_train_df.head()
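# A quick diagnostic (a sketch): stacking helps most when the base models disagree,
# so inspect how correlated the out-of-fold predictions are.
oof_cols = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
layer1_train_df[oof_cols].corr()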
# Stacking often improves the leaderboard score, since the second-level model learns how much to trust each base model
# Train the second-level model with 5-fold CV
def train_model_stacking_kfold(df, test_df):
    """
    Train a second-level Linear Regression on the base models' predictions, with 5-fold CV
    """
    # A simple linear model is a reasonable choice for the second level
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
valid_errors = []
test_preds = []
features = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
df = df.copy()
test_df = test_df.copy()
for feat in features:
df[feat] = np.log(df[feat])
test_df[feat] = np.log(test_df[feat])
df["SalePrice"] = np.log(df["SalePrice"])
    for train_idxs, valid_idxs in kfold.split(df):
        train_df = df.iloc[train_idxs]
        valid_df = df.iloc[valid_idxs]
X_train = train_df[features]
y_train = train_df["SalePrice"]
X_valid = valid_df[features]
y_valid = valid_df["SalePrice"]
X_test = test_df[features]
clf = LinearRegression()
clf.fit(X_train, y_train)
y_valid_preds = clf.predict(X_valid)
valid_errors.append(rmse(y_valid_preds, y_valid))
test_preds.append(np.exp(clf.predict(X_test)))
print("RMSE Log Error", np.mean(valid_errors))
# On Prediction, do exp to inverse the loge done during training
sub_df = pd.DataFrame({
"Id": test_df["Id"],
"SalePrice": np.mean(test_preds, axis=0)
})
# Return test prediction with CV
return sub_df
sub_df = train_model_stacking_kfold(layer1_train_df, layer1_test_df)
sub_df[["Id", "SalePrice"]].to_csv("submission_model_stack.csv", index=False)