from functools import partial
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from xgboost import XGBRFRegressor, XGBRegressor
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    This is the square root of scikit-learn's ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
# Directory holding the competition CSV files.
DATA_DIR = Path("/kaggle/input/house-prices-advanced-regression-techniques/")

# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, sub_df = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Choose only columns that have at least 1000 non-null values.
# We should convert the target to log scale, even if the metric is not using it.
# Maybe we should exclude them from the dataset, since there is not much we can
# gain from two data points.
train_df.plot(kind="scatter", x="Id", y="SalePrice", alpha=0.25)

# Correlate the numeric columns only: DataFrame.corr() raises on string-valued
# columns in recent pandas releases. The matrix is computed once and reused.
corr_matrix = train_df.select_dtypes(include="number").corr()
# Each heatmap gets its own figure; otherwise seaborn draws onto the axes of
# the previous plot when this runs as a script rather than as notebook cells.
plt.figure()
sns.heatmap(corr_matrix, cmap='Greys')
# The 15 columns most positively correlated with SalePrice (SalePrice included).
corr_cols = corr_matrix["SalePrice"].nlargest(15).index
plt.figure()
sns.heatmap(train_df.loc[:, corr_cols].corr(), annot=True, cmap="gray")
# Overall quality has a big impact on SalePrice.
train_df.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.25)
# GrLivArea is also strongly correlated.
# There are only four data points over 4K sq ft. Should we include them?
train_df.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.25)
train_df[train_df.GrLivArea > 4000]
# GarageArea is also strongly correlated.
# Only a few data points lie above 1200 sq ft. Should we include them?
train_df.plot(kind="scatter", x="GarageArea", y="SalePrice", alpha=0.25)
# Notebook output: <AxesSubplot:xlabel='GarageArea', ylabel='SalePrice'>
train_df[train_df.GarageArea > 1200]
# Keep only the rows whose sale price is below 700000.
train_df = train_df[train_df["SalePrice"] < 700000]
def get_features(train_df):
    """Split the usable columns of ``train_df`` into numeric and categorical.

    ``Id`` and ``SalePrice`` are never features. Columns in which fewer than
    half of the values are present are dropped. Every remaining column is
    classified by dtype: numeric dtypes are numeric features, anything else
    (e.g. string/object columns) is a categorical feature.

    Args:
        train_df: DataFrame holding the training data.

    Returns:
        A ``(num_features, cat_features)`` tuple of column-name lists, each in
        the column order of ``train_df``.
    """
    num_features, cat_features = [], []
    for col in train_df.columns:
        if col in ["Id", "SalePrice"]:
            continue
        # Share of non-null values in the column.
        ratio = train_df[col].notna().mean()
        if ratio < 0.5:
            continue
        if pd.api.types.is_numeric_dtype(train_df[col].dtype):
            num_features.append(col)
        else:
            cat_features.append(col)
    return num_features, cat_features
# Split the training columns into numeric and categorical feature lists.
num_features, cat_features = get_features(train_df)
def get_preprocess_pipeline(train_df, sample_features=False):
    """Build the preprocessing ``ColumnTransformer`` and list the model inputs.

    Numeric columns are imputed with ``KNNImputer(n_neighbors=5)`` and then
    standardised. Categorical columns have missing values replaced by the
    constant ``'N/A'`` and are then one-hot encoded, ignoring categories not
    seen during fitting.

    Args:
        train_df: Training DataFrame used to choose the feature columns.
        sample_features: If true, use a small fixed feature subset
            (``LotArea``, ``SaleType``, ``SaleCondition``) instead of the
            columns selected by ``get_features``.

    Returns:
        A ``(preprocessor, features, target)`` tuple: the unfitted
        ``ColumnTransformer``, the list of input column names (numeric first,
        then categorical) and the target column name.
    """
    target = "SalePrice"
    if sample_features:
        numeric_features = ["LotArea"]
        categorical_features = ["SaleType", "SaleCondition"]
    else:
        # Get numeric and categorical features from the data itself.
        numeric_features, categorical_features = get_features(train_df)

    numeric_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
        ('onehpt', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    return preprocessor, numeric_features + categorical_features, target
def train_LR(train_df, test_df, sample_features=False):
    """Train a linear regression model on a single train/validation split.

    The model is fitted on ``log(SalePrice)`` using 80% of ``train_df``; the
    RMSE on the held-out 20% (in log space) is printed.

    Args:
        train_df: Training DataFrame including the ``SalePrice`` column.
        test_df: Test DataFrame; must contain ``Id`` and the feature columns.
        sample_features: Forwarded to ``get_preprocess_pipeline``.

    Returns:
        A DataFrame with columns ``Id`` and ``SalePrice`` holding the test-set
        predictions, converted back from log space.
    """
    # Start with a simple linear model.
    preprocessor, features, target = get_preprocess_pipeline(train_df, sample_features=sample_features)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', LinearRegression())])
    X_train = train_df[features]
    y_train = np.log(train_df[target])
    X_test = test_df[features]
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    print("RMSE Log Error", rmse(y_valid, clf.predict(X_valid)))
    # On prediction, apply exp to invert the log taken during training.
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.exp(clf.predict(X_test))
    })
    return sub_df
# Baseline 1: linear regression on the small fixed feature subset.
sub_df = train_LR(train_df, test_df, sample_features=True)
sub_df.to_csv("submission_lr_sample.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from the right-hand panel (Data -> Output).
# Baseline 2: linear regression on all selected features.
sub_df = train_LR(train_df, test_df, sample_features=False)
sub_df.to_csv("submission_lr.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from the right-hand panel (Data -> Output).
# Demonstration of KFold: print the train/validation index arrays of each of
# the 10 shuffled folds over 20 items.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for p1, p2 in kfold.split(range(20)):
    print(p1, p2)
# Same idea on the real training frame with 7 folds: print only the sizes of
# the train and validation index arrays.
kfold = KFold(n_splits=7, shuffle=True, random_state=42)
for idxs in kfold.split(train_df):
    print(idxs[0].shape, idxs[1].shape)
def hyperparam_finder_nocv(df, model_fn, trial):
    """Optuna objective: score one trial on a single train/validation split.

    Args:
        df: Training DataFrame including the ``SalePrice`` column.
        model_fn: Callable taking an Optuna ``trial`` and returning an
            unfitted regressor configured with the sampled hyperparameters.
        trial: The Optuna trial being evaluated.

    Returns:
        RMSE between predicted and actual ``log(SalePrice)`` on the held-out
        20% of ``df``.
    """
    model = model_fn(trial)
    # Use the frame that was passed in, not the module-level ``train_df``.
    preprocessor, features, target = get_preprocess_pipeline(df, sample_features=False)
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', model)])
    X_train = df[features]
    y_train = np.log(df[target])
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    return rmse(y_valid, clf.predict(X_valid))
def hyperparam_finder(df, model_fn, trial):
    """Optuna objective: score one trial with 5-fold cross-validation.

    Args:
        df: Training DataFrame including the ``SalePrice`` column.
        model_fn: Callable taking an Optuna ``trial`` and returning an
            unfitted regressor configured with the sampled hyperparameters.
        trial: The Optuna trial being evaluated.

    Returns:
        Mean over the five folds of the validation RMSE, computed on
        ``log(SalePrice)``.
    """
    model = model_fn(trial)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    for train_idxs, valid_idxs in kfold.split(df):
        fold_train = df.iloc[train_idxs]
        fold_valid = df.iloc[valid_idxs]

        # Feature selection and preprocessing are derived from the training
        # part of the fold only.
        preprocessor, features, target = get_preprocess_pipeline(fold_train, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        clf.fit(fold_train[features], np.log(fold_train[target]))
        y_valid = np.log(fold_valid[target])
        y_valid_preds = clf.predict(fold_valid[features])
        valid_errors.append(rmse(y_valid, y_valid_preds))
    # Return the validation score used for hyperparameter tuning.
    return np.mean(valid_errors)
# Now lets do KFold (5 Fold)
def train_kfold(df, test_df, ModelClass, **model_kwargs):
    """Train a regression model with 5-fold cross-validation.

    A fresh ``ModelClass(**model_kwargs)`` is fitted on ``log(SalePrice)`` for
    every fold. The mean validation RMSE (in log space) is printed.

    Args:
        df: Training DataFrame including ``Id`` and ``SalePrice``.
        test_df: Test DataFrame; must contain ``Id`` and the feature columns.
        ModelClass: Regressor class to instantiate for each fold.
        **model_kwargs: Keyword arguments passed to ``ModelClass``.

    Returns:
        A ``(sub_df, valid_preds)`` tuple. ``sub_df`` has columns ``Id`` and
        ``SalePrice`` with the test predictions averaged over the folds.
        ``valid_preds`` has the same columns and holds the out-of-fold
        prediction for the training rows (for stacking later). Both are in
        the original price scale.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        fold_train = df.iloc[train_idxs]
        fold_valid = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(fold_train, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', ModelClass(**model_kwargs))])
        clf.fit(fold_train[features], np.log(fold_train[target]))

        y_valid = np.log(fold_valid[target])
        y_valid_preds = clf.predict(fold_valid[features])
        valid_errors.append(rmse(y_valid, y_valid_preds))
        # Apply exp to undo the log transform used during training.
        valid_preds.append(pd.DataFrame({
            "Id": fold_valid["Id"],
            "SalePrice": np.exp(y_valid_preds)
        }))
        test_preds.append(np.exp(clf.predict(test_df[features])))

    print("RMSE Log Error", np.mean(valid_errors))
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return the CV-averaged test prediction and the validation predictions (for stacking later).
    return sub_df, pd.concat(valid_preds)
model1_sub_df, model1_valid_preds = train_kfold(train_df, test_df, LinearRegression)
# Save the K-fold predictions just computed, not the stale `sub_df` left over
# from the earlier single-split run.
model1_sub_df.to_csv("submission_lr_kfold.csv", index=False)
# Make a submission to Kaggle after downloading the submission file from the right-hand panel (Data -> Output).
# The score may have improved over the linear regression without K-fold.
# Notebook output: RMSE Log Error 0.15228419074391034
def _train_fixed_model_kfold(df, test_df, make_model):
    """Run 5-fold CV, fitting a fresh ``make_model()`` regressor on each fold."""
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    valid_preds = []
    for train_idxs, valid_idxs in kfold.split(df):
        # KFold yields positions, so index with iloc. Label-based .loc breaks
        # once rows have been filtered out of the frame.
        fold_train = df.iloc[train_idxs]
        fold_valid = df.iloc[valid_idxs]

        preprocessor, features, target = get_preprocess_pipeline(fold_train, sample_features=False)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', make_model())])
        clf.fit(fold_train[features], np.log(fold_train[target]))

        y_valid = np.log(fold_valid[target])
        y_valid_preds = clf.predict(fold_valid[features])
        valid_errors.append(rmse(y_valid, y_valid_preds))
        # Apply exp to undo the log transform used during training.
        valid_preds.append(pd.DataFrame({
            "Id": fold_valid["Id"],
            "SalePrice": np.exp(y_valid_preds)
        }))
        test_preds.append(np.exp(clf.predict(test_df[features])))

    print("RMSE Log Error", np.mean(valid_errors))
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return the CV-averaged test prediction and the validation predictions (for stacking later).
    return sub_df, pd.concat(valid_preds)


def train_XGB_kfold(df, test_df):
    """Train an XGBoost model with 5-fold CV.

    Args:
        df: Training DataFrame including ``Id`` and ``SalePrice``.
        test_df: Test DataFrame; must contain ``Id`` and the feature columns.

    Returns:
        ``(sub_df, valid_preds)``: fold-averaged test predictions and
        out-of-fold training predictions, both with columns ``Id`` and
        ``SalePrice`` in the original price scale.
    """
    return _train_fixed_model_kfold(
        df, test_df,
        lambda: XGBRegressor(n_jobs=-1, n_estimators=500, max_depth=20))


def train_RF_kfold(df, test_df):
    """Train a random forest model with 5-fold CV.

    Args:
        df: Training DataFrame including ``Id`` and ``SalePrice``.
        test_df: Test DataFrame; must contain ``Id`` and the feature columns.

    Returns:
        ``(sub_df, valid_preds)``: fold-averaged test predictions and
        out-of-fold training predictions, both with columns ``Id`` and
        ``SalePrice`` in the original price scale.
    """
    return _train_fixed_model_kfold(
        df, test_df,
        lambda: RandomForestRegressor(n_jobs=-1, max_depth=20))
def lasso_hparams_finder(trial):
    """Sample ``alpha`` and ``max_iter`` from ``trial`` and return an unfitted Lasso."""
    sampled = {
        "alpha": trial.suggest_float("alpha", 0, 1.0),
        "max_iter": trial.suggest_int("max_iter", 500, 5000),
    }
    return Lasso(**sampled)

def ridge_hparams_finder(trial):
    """Sample ``alpha`` and ``max_iter`` from ``trial`` and return an unfitted Ridge."""
    sampled = {
        "alpha": trial.suggest_float("alpha", 0, 1.0),
        "max_iter": trial.suggest_int("max_iter", 500, 5000),
    }
    return Ridge(**sampled)

def xgb_hparams_finder(trial):
    """Sample XGBoost hyperparameters from ``trial`` and return an unfitted model.

    Sampled: ``max_depth``, ``n_estimators``, ``learning_rate``,
    ``tree_method`` and ``gamma``.

    NOTE(review): the original constructor call was truncated in this file;
    the keyword arguments below are reconstructed from the sampled values.
    The separate ``eta`` sample was dropped because XGBoost treats ``eta`` as
    an alias of ``learning_rate``.
    """
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
    # NOTE(review): "gpu_hist" presumably needs a GPU-enabled runtime - confirm.
    tree_method = trial.suggest_categorical("tree_method", ["gpu_hist"])
    gamma = trial.suggest_float("gamma", 0, 1)
    return XGBRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        tree_method=tree_method,
        gamma=gamma,
    )

def rf_hparams_finder(trial):
    """Sample ``max_depth`` and ``n_estimators`` and return an unfitted random forest."""
    depth = trial.suggest_int("max_depth", 10, 50)
    tree_count = trial.suggest_int("n_estimators", 100, 300)
    forest = RandomForestRegressor(n_estimators=tree_count, max_depth=depth)
    return forest

def lightgbm_hparams_finder(trial):
    """Sample LightGBM hyperparameters from ``trial`` and return an unfitted model.

    Sampled: ``max_depth``, ``n_estimators``, ``learning_rate``, ``reg_alpha``
    and ``reg_lambda``. All of them are passed to the regressor; previously
    ``n_estimators`` was sampled but never used.
    """
    max_depth = trial.suggest_int("max_depth", 5, 30)
    n_estimators = trial.suggest_int("n_estimators", 100, 300)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
    reg_alpha = trial.suggest_float("reg_alpha", 0., 1)
    reg_lambda = trial.suggest_float("reg_lambda", 0., 1)
    return LGBMRegressor(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
    )

def svr_hparams_finder(trial):
    """Sample SVR hyperparameters from ``trial`` and return an unfitted model.

    Sampled: ``kernel``, ``degree``, ``c`` (passed as ``C``) and ``max_iter``.
    """
    # 'precomputed' is excluded: it expects a square kernel matrix as input,
    # not the feature matrix this pipeline produces.
    kernel = trial.suggest_categorical("kernel",
                                       ['linear', 'poly', 'rbf', 'sigmoid'])
    degree = trial.suggest_int("degree", 1, 4)
    # C must be strictly positive, so the range starts just above zero.
    c = trial.suggest_float("c", 1e-3, 1)
    # max_iter is an iteration count, so sample an integer.
    max_iter = trial.suggest_int("max_iter", 50, 500)
    return SVR(kernel=kernel, degree=degree, C=c, max_iter=max_iter)
# NOTE(review): the original `n_trials=` arguments were lost from this file.
# 50 is a placeholder - confirm the intended trial budget.
N_TRIALS = 50

# Tune Lasso with the 5-fold objective. create_study() minimises by default,
# which matches the RMSE returned by the objective.
study = optuna.create_study()
study.optimize(partial(hyperparam_finder, train_df, lasso_hparams_finder),
               n_trials=N_TRIALS)
lasso_params = study.best_params  # e.g. {'alpha': ..., 'max_iter': ...}

# Map each model name to the function that samples its hyperparameters.
# NOTE(review): the key "ligtgbm" looks like a typo for "lightgbm"; it is kept
# in case code elsewhere looks it up by this spelling.
model_options = {
        "ridge": ridge_hparams_finder,
        "lasso": lasso_hparams_finder,
        "xgb": xgb_hparams_finder,
        "rf": rf_hparams_finder,
        "ligtgbm": lightgbm_hparams_finder,
        "svr": svr_hparams_finder
}
# One {model_name: best_params} dict per model, tuned with the cheaper
# single-split objective.
best_hparams = []
for model_name, model_hparams_fn in model_options.items():
    study = optuna.create_study()
    study.optimize(partial(hyperparam_finder_nocv, train_df, model_hparams_fn),
                   n_trials=N_TRIALS)
    best_hparams.append({
        model_name: study.best_params
    })

# Second Lasso run with the 5-fold objective (repeats the first study above).
study = optuna.create_study()
study.optimize(partial(hyperparam_finder, train_df, lasso_hparams_finder),
               n_trials=N_TRIALS)
lasso_params = study.best_params  # e.g. {'alpha': ..., 'max_iter': ...}
# Model 1: ridge regression with default settings (the linear baseline).
model1_test_preds, model1_valid_preds = train_kfold(train_df, test_df, Ridge)

# Model 2: let's try to improve on the linear model with XGBoost.
model2_test_preds, model2_valid_preds = train_kfold(train_df, test_df,  XGBRegressor, n_jobs=4, n_estimators=500, max_depth=20)

# Model 3: let's try a random forest instead of the linear model.
model3_test_preds, model3_valid_preds = train_kfold(train_df, test_df, RandomForestRegressor,n_jobs=4, n_estimators=500, max_depth=20)

# Model 4: support vector regression with default settings.
model4_test_preds, model4_valid_preds = train_kfold(train_df, test_df,  SVR)

# Model 5: LightGBM.
model5_test_preds, model5_valid_preds = train_kfold(train_df, test_df,  LGBMRegressor, n_jobs=4, n_estimators=500, max_depth=20)
# Give each model's prediction column its own name ("model<N>_preds") so the
# frames can be merged on "Id" without column clashes.
_valid_frames = (model1_valid_preds, model2_valid_preds, model3_valid_preds,
                 model4_valid_preds, model5_valid_preds)
_test_frames = (model1_test_preds, model2_test_preds, model3_test_preds,
                model4_test_preds, model5_test_preds)
for _frames in (_valid_frames, _test_frames):
    for _number, _frame in enumerate(_frames, start=1):
        _frame.rename(columns={"SalePrice": f"model{_number}_preds"}, inplace=True)

# Preview of the first two models' test predictions side by side.
pd.merge(model1_test_preds, model2_test_preds, on="Id")

# Join the test predictions of all five models on "Id".
sub_df = pd.merge(model1_test_preds, model2_test_preds, on="Id")
for _frame in _test_frames[2:]:
    sub_df = pd.merge(sub_df, _frame, on="Id")

# Blend: the unweighted mean of the five predictions.
_pred_columns = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
sub_df["SalePrice"] = sum(sub_df[_column] for _column in _pred_columns) / 5
sub_df[["Id", "SalePrice"]].to_csv("submission_model_blend.csv", index=False)
# Now let's do model stacking.
# Choose three models - LinearRegression, RandomForest and XGBoost - and get predictions.
# Averaging all three predictions (with KFold) and making a submission gives you model blending.

# NOTE(review): `layer1_train_df` is used by the stacking call further down but
# was never defined in this file. It is reconstructed here as the true
# SalePrice joined with every model's out-of-fold prediction - confirm.
layer1_train_df = train_df[["Id", "SalePrice"]]
for _valid_preds in (model1_valid_preds, model2_valid_preds, model3_valid_preds,
                     model4_valid_preds, model5_valid_preds):
    layer1_train_df = pd.merge(layer1_train_df, _valid_preds, left_on="Id", right_on="Id")

# Test-side inputs for the stacker: every model's test prediction, joined on "Id".
layer1_test_df = pd.merge(model1_test_preds, model2_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model3_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model4_test_preds, left_on="Id", right_on="Id")
layer1_test_df = pd.merge(layer1_test_df, model5_test_preds, left_on="Id", right_on="Id")

# You would have seen an improvement on your Leaderboard by providing it better features
# Now lets do KFold (5 Fold)
def train_model_stacking_kfold(df, test_df):
    """Train the stacking model (linear regression) with 5-fold CV.

    The inputs are the five base-model prediction columns
    (``model1_preds`` ... ``model5_preds``). Both the inputs and the target
    are moved to log space before fitting. The mean validation RMSE (in log
    space) is printed.

    Args:
        df: DataFrame with the five prediction columns and ``SalePrice``.
        test_df: DataFrame with ``Id`` and the five prediction columns.

    Returns:
        A DataFrame with columns ``Id`` and ``SalePrice`` holding the test
        predictions averaged over the folds, in the original price scale.
    """
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    valid_errors = []
    test_preds = []
    features = ["model1_preds", "model2_preds", "model3_preds", "model4_preds", "model5_preds"]
    # Work on copies so the caller's frames are not modified.
    df = df.copy()
    test_df = test_df.copy()
    for feat in features:
        df[feat] = np.log(df[feat])
        test_df[feat] = np.log(test_df[feat])
    df["SalePrice"] = np.log(df["SalePrice"])
    # The feature list is fixed, so the test inputs are the same for every fold.
    X_test = test_df[features]
    for train_idxs, valid_idxs in kfold.split(df):
        # KFold yields positions, so index with iloc rather than label-based loc.
        fold_train = df.iloc[train_idxs]
        fold_valid = df.iloc[valid_idxs]
        clf = LinearRegression()
        clf.fit(fold_train[features], fold_train["SalePrice"])
        y_valid_preds = clf.predict(fold_valid[features])
        valid_errors.append(rmse(fold_valid["SalePrice"], y_valid_preds))
        # Apply exp to undo the log transform used during training.
        test_preds.append(np.exp(clf.predict(X_test)))

    print("RMSE Log Error", np.mean(valid_errors))
    sub_df = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": np.mean(test_preds, axis=0)
    })
    # Return the CV-averaged test prediction.
    return sub_df
# Fit the stacking model on the layer-1 predictions and write the submission.
sub_df = train_model_stacking_kfold(layer1_train_df, layer1_test_df)
sub_df[["Id", "SalePrice"]].to_csv("submission_model_stack.csv", index=False)
# Notebook output: RMSE Log Error 0.12191138856220185
import lightgbm