diff --git a/krasnodar_catboost_pipeline.ipynb b/krasnodar_catboost_pipeline.ipynb deleted file mode 100644 index 2672b27..0000000 --- a/krasnodar_catboost_pipeline.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyNZctONPU2EVkxxRRjxT6Z6"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Import"],"metadata":{"id":"RpR8nK-X_hgw"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"OsZspeG7-ig9","executionInfo":{"status":"ok","timestamp":1676664405476,"user_tz":480,"elapsed":18621,"user":{"displayName":"Радмир Зосимов","userId":"04742503366191314986"}},"outputId":"a02c974a-660e-45fb-a1d9-c54d8caa2c43"},"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m575.9/575.9 KB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}],"source":["!pip install shap catboost -qqq"]},{"cell_type":"code","source":["import pandas as pd\n","import numpy as np\n","import random\n","import os\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from sklearn.metrics import mean_absolute_error\n","from catboost import CatBoostRegressor, Pool\n","from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold\n","from copy import deepcopy \n","import pickle\n","import sklearn.metrics\n","\n","import warnings\n","warnings.filterwarnings('ignore')"],"metadata":{"id":"RP5Hvy3U_B3k","executionInfo":{"status":"ok","timestamp":1676667073297,"user_tz":480,"elapsed":132,"user":{"displayName":"Радмир 
def set_seed(seed: int):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator and
    writes ``PYTHONHASHSEED`` into the process environment.

    Note: according to the Python documentation ``PYTHONHASHSEED`` is read at
    interpreter start-up, so assigning it here can only influence child
    processes, not string hashing inside the already-running kernel.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(0xBACED)


class RegressionModel:
    """Cross-validated CatBoost regression ensemble on a log-transformed target.

    One ``CatBoostRegressor`` is trained per fold. The target column is
    modelled as ``log(target)``; ``evaluate`` and ``predict`` convert model
    output back with ``exp``.

    Attributes:
        data: Training frame with missing values replaced by ``-1``, or
            ``None`` when the instance is only used to load models and predict.
        models: Fitted models, one per fold, in fold order.
        num_features: Non-categorical feature names; set by ``split_data``.
    """

    def __init__(self, data=None, agg_cols=None, agg_cols_double=None, cat_features=['community', 'liter_num', 'done_date', 'developer', 'floor'],
                 target_col='price_per_m', n_splits=5, stratify_col='price_per_m', group_col='community', params={}, path_to_save='model_reg.pkl'):
        """Store the configuration and build the fold splitter.

        Args:
            data: Training ``DataFrame``; must contain ``date``, the target,
                the stratification column and (if set) the group column.
                May be ``None`` for load-and-predict use.
            agg_cols: Stored but not used by any method of this class.
            agg_cols_double: Stored but not used by any method of this class.
            cat_features: Names of the categorical feature columns.
            target_col: Column to predict (modelled on a log scale).
            n_splits: Number of cross-validation folds.
            stratify_col: Column handed to the splitter as the stratification
                labels.
            group_col: Column whose values must not be shared between train
                and validation; a falsy value disables grouping.
            params: Keyword arguments for ``CatBoostRegressor``; its
                ``cat_features`` entry is always replaced by ``cat_features``.
            path_to_save: File the fitted models are pickled to.
        """
        # data is optional so that load_model() + predict() work without a training frame.
        self.data = data.fillna(-1) if data is not None else None
        self.target_col = target_col

        self.agg_cols = agg_cols
        self.agg_cols_double = agg_cols_double

        # Copy both containers: the defaults are shared mutable objects and the
        # caller's own dict/list must not be modified by this instance.
        self.cat_features = list(cat_features)
        self.params = dict(params or {})
        self.params['cat_features'] = self.cat_features

        self.path_to_save = path_to_save
        self.models = []

        self.n_splits = n_splits
        self.stratify_col = stratify_col
        self.group_col = group_col
        # NOTE(review): both splitters treat the stratify column as class labels.
        # With a continuous column such as a price this is presumably either
        # rejected by scikit-learn or ineffective; consider binning -- TODO confirm.
        if group_col:
            self.kfold = StratifiedGroupKFold(n_splits=self.n_splits, shuffle=True, random_state=0xBACED)
        else:
            self.kfold = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=0xBACED)

    def make_model(self):
        """Return a new, unfitted ``CatBoostRegressor`` built from ``self.params``."""
        return CatBoostRegressor(**self.params)

    def get_overall_features(self, df_train, df_val=None):
        """Add the calendar features ``year`` and ``month`` derived from ``date``.

        The inputs are copied, so the frames passed in are left untouched, and
        the target column is not transformed here (``split_data`` applies the
        log transform exactly once).

        Args:
            df_train: Frame with a ``date`` column.
            df_val: Optional second frame with a ``date`` column.

        Returns:
            ``[train, val]``; ``val`` is ``None`` when ``df_val`` was not given.
        """
        ret_dfs = []
        for df in (df_train, df_val):
            if df is None:
                ret_dfs.append(None)
                continue

            df = df.copy()
            df['date'] = pd.to_datetime(df['date'])
            df['year'] = df['date'].dt.year
            df['month'] = df['date'].dt.month
            ret_dfs.append(df)

        return ret_dfs

    def _numeric_feature_names(self, df):
        """List the columns of ``df`` that are used as non-categorical features."""
        # The literals keep the original exclusions; target_col / group_col make
        # the rule hold for configurations other than the notebook's defaults.
        excluded = set(self.cat_features) | {self.target_col, 'price_per_m', 'date', 'to_group'}
        if self.group_col:
            excluded.add(self.group_col)
        return [col for col in df.columns if col not in excluded]

    def split_data(self):
        """Yield the train/validation matrices of every fold.

        Yields:
            ``(fold, (X_train, X_val, y_train, y_val))`` where the ``y`` series
            hold ``log(target)``.

        Raises:
            ValueError: If the instance was created without training data.
        """
        if self.data is None:
            raise ValueError('RegressionModel was created without training data')

        if self.group_col:
            # Group labels are compared as strings by the splitter call below.
            self.data[self.group_col] = self.data[self.group_col].astype(str)
            splits = self.kfold.split(self.data, self.data[self.stratify_col], self.data[self.group_col])
        else:
            splits = self.kfold.split(self.data, self.data[self.stratify_col])

        for fold, (train_index, val_index) in enumerate(splits):
            df_train, df_val = self.get_overall_features(
                self.data.iloc[train_index], self.data.iloc[val_index]
            )

            self.num_features = self._numeric_feature_names(df_train)

            cols = self.num_features + self.cat_features
            X_train = df_train[cols]
            X_val = df_val[cols]
            # Single log transform; evaluate()/predict() undo it with one exp.
            y_train = np.log(df_train[self.target_col])
            y_val = np.log(df_val[self.target_col])

            print(X_train.shape, X_val.shape)
            yield fold, (X_train, X_val, y_train, y_val)

    def get_data(self, fold):
        """Return ``(X_train, X_val, y_train, y_val)`` for one fold.

        Raises:
            ValueError: If ``fold`` is not produced by the splitter.
        """
        for f, fold_data in self.split_data():
            if f == fold:
                return fold_data
        raise ValueError(f'Fold {fold} does not exist')

    def fit_model(self):
        """Train one model per fold, pickle them and return the fold metrics.

        Any models held from an earlier fit or load are discarded first, so
        ``self.models[i]`` always belongs to fold ``i`` of this run.

        Returns:
            The metrics dictionary produced by ``evaluate``.
        """
        self.models = []
        for fold, (X_train, X_val, y_train, y_val) in self.split_data():
            print(f"Training for fold {fold} Stratified by {self.stratify_col}")
            print('-'*30)
            cat = self.make_model()

            cat.fit(X_train, y_train,
                    eval_set=(X_val, y_val))
            self.models.append(cat)

        with open(self.path_to_save, 'wb') as f:
            pickle.dump(self.models, f)

        metrics = self.evaluate()

        return metrics

    def predict(self, data):
        """Predict the target for new rows with the fold ensemble.

        ``data`` is not modified. Missing values are replaced by ``-1`` and
        ``year``/``month`` are derived from ``date`` as in training; the fold
        models' log-scale outputs are averaged and converted back with ``exp``.

        Args:
            data: Frame containing ``date`` and every raw feature column the
                models were trained on.

        Returns:
            NumPy array with one prediction per row, on the original scale of
            the target.

        Raises:
            RuntimeError: If no models have been trained or loaded.
        """
        if not self.models:
            raise RuntimeError('Train or load models before prediction phase')

        features, _ = self.get_overall_features(data.fillna(-1))
        if self.group_col and self.group_col in features.columns:
            # Same string cast that split_data applies to the training frame.
            features[self.group_col] = features[self.group_col].astype(str)

        X = features[self.models[0].feature_names_]

        preds = [model.predict(X) for model in self.models]
        return np.exp(np.mean(np.array(preds), axis=0))

    def evaluate(self):
        """Score every fold model on its own validation split.

        Predictions and targets are converted back from the log scale before
        scoring.

        Returns:
            Dict with per-fold lists under ``'MAE'``, ``'RMSE'`` and ``'MAPE'``.
            Note that the ``'MAPE'`` entries hold ``1 - MAPE`` (higher is
            better), not the raw percentage error.

        Raises:
            RuntimeError: If there is no fitted model for a fold.
        """
        if not self.models:
            raise RuntimeError('Train or load models before evaluation phase')

        metrics = {'MAE': [], 'RMSE': [], 'MAPE': []}

        for fold, (X_train, X_val, y_train, y_val) in self.split_data():
            if fold >= len(self.models):
                raise RuntimeError(
                    f'No model available for fold {fold}: {len(self.models)} models loaded'
                )
            fold_model = self.models[fold]
            y_pred = np.exp(fold_model.predict(X_val[fold_model.feature_names_]))
            y_val = np.exp(y_val)

            mse = sklearn.metrics.mean_squared_error(y_val, y_pred)
            mae = sklearn.metrics.mean_absolute_error(y_val, y_pred)
            mape = 1-sklearn.metrics.mean_absolute_percentage_error(y_val, y_pred)
            rmse = np.sqrt(mse)

            metrics['MAE'].append(mae)
            metrics['MAPE'].append(mape)
            metrics['RMSE'].append(rmse)

        print('### Overall metrics ###')
        for key, values in metrics.items():
            print(f'For {key}: {np.mean(values).round(4)}')
        print(len('### Overall metrics ###') * '#')
        print()

        return metrics

    def load_model(self, path):
        """Replace ``self.models`` with the list unpickled from ``path``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        that were written by a trusted run of ``fit_model``.
        """
        with open(path, 'rb') as f:
            self.models = pickle.load(f)
# Load the listings table, parse the date, and derive the year and the
# community/liter key that is passed to the model as its group column.
df = (
    pd.read_csv('krasnodar_data_final.csv')
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        year=lambda frame: frame['date'].dt.year,
        to_group=lambda frame: frame['community'] + '_' + frame['liter_num'],
    )
)

# Keyword arguments forwarded to CatBoostRegressor by RegressionModel.
regression_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'cat_features': None,
    'verbose': 100,
    'early_stopping_rounds': 500,
    'loss_function': 'RMSE',
    'eval_metric': 'MAE',
    'task_type': 'CPU',
    # 'bootstrap_type': 'Poisson'
}

model = RegressionModel(
    data=df,
    params=regression_params,
    agg_cols=[],
    target_col='price_per_m',
    stratify_col='price_per_m',
    group_col='to_group',
    n_splits=5,
)

# Train one model per fold; fit_model() also returns the per-fold metrics.
metrics = model.fit_model()

metrics

# Reload the ensemble from the file written during fitting.
model.load_model('model_reg.pkl')

# Rebuild the fold-0 matrices; X_train feeds the SHAP explanation below.
X_train, X_val, y_train, y_val = model.get_data(0)

import shap
shap.initjs()

# Explain the fold-0 model on its own training matrix.
explainer = shap.TreeExplainer(model.models[0])
shap_values = explainer.shap_values(X_train)

shap.summary_plot(shap_values, X_train)
"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{"needs_background":"light"}}]},{"cell_type":"code","source":[],"metadata":{"id":"UiFF7J9YNFln"},"execution_count":null,"outputs":[]}]} \ No newline at end of file