{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [
{ "cell_type": "code", "source": [ "import pandas as pd\n", "import numpy as np" ], "metadata": { "id": "1wCgQ3uby0j8" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Load the raw listings workbook and coerce each column to a workable dtype.\n",
"df = pd.read_excel('/content/krasnodar_hack_data.xlsx')\n",
"\n",
"# Every `date` value loses its first and last character before it is parsed.\n",
"df['date'] = pd.to_datetime(df['date'].map(lambda raw_date: raw_date[1:-1]))\n",
"\n",
"# Collapse doubled dashes; str() also turns missing values into the text 'nan'.\n",
"df['floor'] = df['floor'].map(lambda value: str(value).replace('--', '-'))\n",
"df['liter_num'] = df['liter_num'].map(lambda value: str(value).replace('--', '-'))\n",
"\n",
"# Strip non-breaking and ordinary spaces so the price parses as an integer.\n",
"df['price_per_m'] = df['price_per_m'].map(lambda text: text.replace('\\xa0', '').replace(' ', '')).astype(int)\n",
"df['num_beds'] = df['num_beds'].astype(int)\n",
"\n",
"# Swap the decimal comma for a dot before converting to float.\n",
"df['area'] = df['area'].map(lambda value: float(str(value).replace(',', '.')))\n",
"df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "zYk8ByTx8DXL", "outputId": "909d53dd-a9a5-4185-dda0-49fd511f3055" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " community liter_num done_date developer floor \\\n", "0 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 1-4 \n", "1 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 5-8 \n", "2 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 9-12 \n", "3 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 13-16 \n", "4 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 1-4 \n", "... ... ... ... ... ... 
\n", "1284 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n", "1285 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n", "1286 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n", "1287 ЖК «Мелодия» nan дом сдан NaN 5 \n", "1288 ЖК «Мелодия» nan дом сдан NaN 11 \n", "\n", " area num_beds price_per_m date \n", "0 32.95 1 118000 2022-01-25 \n", "1 32.95 1 119000 2022-01-25 \n", "2 32.95 1 120000 2022-01-25 \n", "3 32.95 1 121000 2022-01-25 \n", "4 35.20 1 117000 2022-01-25 \n", "... ... ... ... ... \n", "1284 NaN 1 586000 2023-02-15 \n", "1285 NaN 2 400000 2023-02-15 \n", "1286 NaN 3 395000 2023-02-15 \n", "1287 NaN 2 94000 2023-02-15 \n", "1288 NaN 2 94000 2023-02-15 \n", "\n", "[1289 rows x 9 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
communityliter_numdone_datedeveloperfloorareanum_bedsprice_per_mdate
0ЖК «Смородина»1-51 квартал 2023ООО «АЛЬФА»1-432.9511180002022-01-25
1ЖК «Смородина»1-51 квартал 2023ООО «АЛЬФА»5-832.9511190002022-01-25
2ЖК «Смородина»1-51 квартал 2023ООО «АЛЬФА»9-1232.9511200002022-01-25
3ЖК «Смородина»1-51 квартал 2023ООО «АЛЬФА»13-1632.9511210002022-01-25
4ЖК «Смородина»1-51 квартал 2023ООО «АЛЬФА»1-435.2011170002022-01-25
..............................
1284ЖК «Role Clef»nan2 полугодие 2021ООО «Уютный дом»2-12NaN15860002023-02-15
1285ЖК «Role Clef»nan2 полугодие 2021ООО «Уютный дом»2-12NaN24000002023-02-15
1286ЖК «Role Clef»nan2 полугодие 2021ООО «Уютный дом»2-12NaN33950002023-02-15
1287ЖК «Мелодия»nanдом сданNaN5NaN2940002023-02-15
1288ЖК «Мелодия»nanдом сданNaN11NaN2940002023-02-15
\n", "

1289 rows × 9 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 22 } ] },
{ "cell_type": "code", "source": [
"# Persist the cleaned listings next to the notebook.\n",
"df.to_csv(path_or_buf='krasnodar_data_final.csv', index=False)" ], "metadata": { "id": "v7B0OICi8LOx" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# NOTE(review): this reads a Drive copy, not the file written by the previous cell - confirm both are the same export.\n",
"df = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/krasnodar_data_final.csv')\n",
"# Keep only listings that have both an area and a completion label.\n",
"df = df.dropna(subset=['area', 'done_date'])\n" ], "metadata": { "id": "UNgPSTGM8Ypj" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "mgrkF997ncDd" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "Mk8Qx67bsjVX" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Distinct listings once price, snapshot date and floor are ignored.\n",
"df_uniq = (\n",
"    df.drop(columns=['price_per_m', 'date', 'floor'])\n",
"    .drop_duplicates()\n",
")" ], "metadata": { "id": "iZBXk0P7ndiS" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# groupby skips rows whose key is missing by default, so missing values become the text 'nan' first.\n",
"df = df.fillna(value='nan')" ], "metadata": { "id": "Wf8Al8YdnlFk" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Mean price per listing and snapshot date; `floor` is not a key, so floors are averaged together.\n",
"t = df.groupby(\n",
"    by=['community', 'liter_num', 'done_date', 'developer', 'area', 'num_beds', 'date']\n",
")['price_per_m'].agg('mean')" ], "metadata": { "id": "1QEUL1kKpY9I" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Turn the group keys back into ordinary columns.\n",
"t = t.reset_index()" ], "metadata": { "id": "344wp42UpZwo" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"t['community'].value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PE9rV5olp36f", "outputId": "64bfe96a-47ee-43f9-8de9-a9c768444595" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ЖК «Novella» 53\n", "ЖК «Зеленодар» 47\n", "ЖК «AVrorA», 42\n", "ЖК «Смородина» 40\n", "ЖК «Fresh» 39\n", "ЖК «The Grand Palace» 21\n", "ЖК «Небо» 20\n", "ЖК «URAL», 18\n", "ЖК 
«Спортивный парк» 17\n", "ЖК «Дом на Лаврова» 8\n", "Name: community, dtype: int64" ] }, "metadata": {}, "execution_count": 144 } ] },
{ "cell_type": "code", "source": [
"# Every label from '4 квартал 2021' to '18 квартал 2021' collapses to '4 квартал 2021'.\n",
"# NOTE(review): quarter numbers above 4 look like spreadsheet auto-fill artefacts - confirm.\n",
"rep = {f'{quarter} квартал 2021': '4 квартал 2021' for quarter in range(4, 19)}\n",
"\n",
"# Completion label -> calendar date: the 15th of the quarter's last month.\n",
"# 'дом сдан' is mapped to a date far in the past.\n",
"kvartal_to_date = {\n",
"    '4 квартал 2021': '2021-12-15',\n",
"    '2 квартал 2022': '2022-06-15',\n",
"    '3 квартал 2022': '2022-09-15',\n",
"    '1 квартал 2023': '2023-03-15',\n",
"    '2 квартал 2023': '2023-06-15',\n",
"    '3 квартал 2023': '2023-09-15',\n",
"    '4 квартал 2023': '2023-12-15',\n",
"    '1 квартал 2024': '2024-03-15',\n",
"    '2 квартал 2024': '2024-06-15',\n",
"    '3 квартал 2024': '2024-09-15',\n",
"    '4 квартал 2024': '2024-12-15',\n",
"    'дом сдан': '1990-12-15',\n",
"}" ], "metadata": { "id": "OjfuBFcqs1pa" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Step 1: normalise the 2021 quarter labels.\n",
"t['done_date'] = t['done_date'].replace(to_replace=rep)" ], "metadata": { "id": "HnG-tuphqo29" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "Hr0tYXDnq7Xx" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Step 2: swap each label for its calendar date string.\n",
"t['done_date'] = t['done_date'].replace(to_replace=kvartal_to_date)" ], "metadata": { "id": "1YCU5SSkuXMB" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "t['done_date'] = pd.to_datetime(t['done_date'])\n", "t.date = 
pd.to_datetime(t.date)\n",
"# Whole days from the snapshot date to the completion date (negative once completion is in the past).\n",
"t['days_to_done'] = (t['done_date'] - t['date']).dt.days\n" ], "metadata": { "id": "LaxUGtDRubue" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "az33D3T0u1Fe" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [], "metadata": { "id": "xJxBzp7Bwe2i" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Columns that identify one listing across snapshots (everything except `date` and the price).\n",
"# Defined before the trend computation because both it and the training-row step need it.\n",
"same_cols = ['community', 'liter_num', 'done_date', 'developer', 'area', 'num_beds']" ], "metadata": { "id": "3ZviqlKJHHga" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Price of every listing relative to its own 2022-01-25 price, in percent.\n",
"# Baseline rows, and rows whose listing has no baseline snapshot, get 0.\n",
"BASELINE_DATE = pd.Timestamp('2022-01-25')\n",
"is_baseline = t['date'] == BASELINE_DATE\n",
"\n",
"# Look the baseline up by listing identity instead of by row position, so a\n",
"# missing snapshot can never pair a row with a neighbouring listing.\n",
"baseline_price = (\n",
"    t['price_per_m'].where(is_baseline)\n",
"    .groupby([t[col] for col in same_cols])\n",
"    .transform('first')\n",
")\n",
"trend = (t['price_per_m'] / baseline_price * 100).where(~is_baseline).fillna(0)" ], "metadata": { "id": "yGCs4h8V1izN" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "t['trend'] = trend" ], "metadata": { "id": "B6LkDXQ123BG" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "t['trend'] = t['trend'].round(2)" ], "metadata": { "id": "65q_WpTE24zn" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Drop snapshots priced strictly between 50 % and 100 % of their baseline.\n",
"t = t[~((t['trend'] < 100) & (t['trend'] > 50))]" ], "metadata": { "id": "MY0wDX8s388F" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Training rows: every 2022-01-25 / 2022-11-06 snapshot, re-labelled with the price and trend\n",
"# of the next remaining snapshot of the same listing when there is one.\n",
"# shift(-1) pairs each row with the row physically below it, so this stays correct even\n",
"# though the filter above left gaps in the index labels.\n",
"following = t.shift(-1)\n",
"has_following = (following[same_cols] == t[same_cols]).all(axis=1)\n",
"is_source_row = t['date'].isin([pd.Timestamp('2022-01-25'), pd.Timestamp('2022-11-06')])\n",
"\n",
"new_df = t[is_source_row].copy()\n",
"relabel_rows = t.index[is_source_row & has_following]\n",
"new_df.loc[relabel_rows, ['price_per_m', 'trend']] = following.loc[relabel_rows, ['price_per_m', 'trend']]" ], "metadata": { "id": "93JSoKBK941f" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "new_df = pd.DataFrame(new_df)" ], "metadata": { "id": "EqxOuJDpH5OG" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Rows still carrying trend 0 have no usable follow-up price; leave them out.\n",
"new_df = new_df[~(new_df['trend'] == 0)]" ], "metadata": { "id": "Nu_gJz4cIDAf" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "t" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "rgQgfyA3ItLS", "outputId": "7ce259e9-638d-4317-db58-eb8fed976978" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " community liter_num done_date developer \\\n", "0 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n", "1 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n", "2 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n", "3 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n", "4 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n", ".. ... ... ... ... 
\n", "300 ЖК «Спортивный парк» 7 2022-06-15 ООО «АЛЬФА» \n", "301 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n", "302 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n", "303 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n", "304 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n", "\n", " area num_beds date price_per_m days_to_done trend \n", "0 44.0 1 2022-01-25 187000.0 233 0.00 \n", "1 44.0 1 2022-11-06 202000.0 -52 108.02 \n", "2 44.0 1 2023-02-15 236600.0 -153 126.52 \n", "3 47.0 1 2022-01-25 200750.0 233 0.00 \n", "4 47.0 1 2022-11-06 205750.0 -52 102.49 \n", ".. ... ... ... ... ... ... \n", "300 45.0 1 2023-02-15 101800.0 -245 117.01 \n", "301 48.0 1 2022-01-25 84000.0 141 0.00 \n", "302 48.0 1 2022-11-06 98000.0 -144 116.67 \n", "303 51.0 1 2022-01-25 79000.0 141 0.00 \n", "304 51.0 1 2022-11-06 92500.0 -144 117.09 \n", "\n", "[300 rows x 10 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
communityliter_numdone_datedeveloperareanum_bedsdateprice_per_mdays_to_donetrend
0ЖК «AVrorA»,nan2022-09-15ООО «АльфаСтройКомплекс»44.012022-01-25187000.02330.00
1ЖК «AVrorA»,nan2022-09-15ООО «АльфаСтройКомплекс»44.012022-11-06202000.0-52108.02
2ЖК «AVrorA»,nan2022-09-15ООО «АльфаСтройКомплекс»44.012023-02-15236600.0-153126.52
3ЖК «AVrorA»,nan2022-09-15ООО «АльфаСтройКомплекс»47.012022-01-25200750.02330.00
4ЖК «AVrorA»,nan2022-09-15ООО «АльфаСтройКомплекс»47.012022-11-06205750.0-52102.49
.................................
300ЖК «Спортивный парк»72022-06-15ООО «АЛЬФА»45.012023-02-15101800.0-245117.01
301ЖК «Спортивный парк»82022-06-15ООО «АЛЬФА»48.012022-01-2584000.01410.00
302ЖК «Спортивный парк»82022-06-15ООО «АЛЬФА»48.012022-11-0698000.0-144116.67
303ЖК «Спортивный парк»82022-06-15ООО «АЛЬФА»51.012022-01-2579000.01410.00
304ЖК «Спортивный парк»82022-06-15ООО «АЛЬФА»51.012022-11-0692500.0-144117.09
\n", "

300 rows × 10 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 358 } ] },
{ "cell_type": "code", "source": [ "t['date'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7vWFl3uoIMnm", "outputId": "1d5dbc94-c32c-4426-a324-4a6186f44773" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['2022-01-25T00:00:00.000000000', '2022-11-06T00:00:00.000000000',\n", " '2023-02-15T00:00:00.000000000'], dtype='datetime64[ns]')" ] }, "metadata": {}, "execution_count": 365 } ] },
{ "cell_type": "code", "source": [
"# Modelling table: four features plus the `price_per_m` target.\n",
"# The -2000 day cut-off removes rows whose completion date is the far-past '1990-12-15' placeholder.\n",
"test_df = (\n",
"    new_df.loc[\n",
"        new_df['days_to_done'] > -2000,\n",
"        ['area', 'num_beds', 'price_per_m', 'days_to_done', 'community'],\n",
"    ]\n",
"    .assign(community=lambda frame: frame['community'].map(lambda name: name.strip()))\n",
")" ], "metadata": { "id": "GsddQYnB_hi5" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "from sklearn.linear_model import LinearRegression\n", "from sklearn.model_selection import StratifiedKFold" ], "metadata": { "id": "iIV_dc5N_nZB" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Five shuffled folds with a fixed seed.\n",
"kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0xBACED)" ], "metadata": { "id": "g-1NcmetADYQ" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Folds are stratified on `days_to_done`; only the first fold becomes the train/validation split.\n",
"for fold, (train_index, val_index) in enumerate(kfold.split(test_df, test_df['days_to_done'])):\n",
"    if fold != 0:\n",
"        continue\n",
"    df_train, df_val = test_df.iloc[train_index], test_df.iloc[val_index]\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QbgRtLQ-AI89", "outputId": "18b36f27-7059-470d-eb7a-5e7e1c0e03db" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n", " warnings.warn(\n" ] } ] },
{ "cell_type": "code", "source": [
"# Separate the feature frames from the `price_per_m` target vectors.\n",
"X = df_train.drop(columns='price_per_m')\n",
"y = df_train['price_per_m'].to_numpy()\n",
"\n",
"X_val = df_val.drop(columns='price_per_m')\n",
"y_val = df_val['price_per_m'].to_numpy()" ], "metadata": { "id": "asPYq7hz_uAR" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "!pip install shap catboost -qqq" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "clgz4UO8A92Y", "outputId": "5f8340de-1a3d-4f02-b895-0c64e09adfa0" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m575.9/575.9 KB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] },
{ "cell_type": "code", "source": [ "from catboost import CatBoostRegressor, Pool" ], "metadata": { "id": "060Gc1XtCYYA" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# CatBoost settings: RMSE objective, MAE reported on the eval set,\n",
"# training stops after 500 rounds without improvement.\n",
"regression_params = {\n",
"    'iterations': 5000,\n",
"    'learning_rate': 0.05,\n",
"    'verbose': 100,\n",
"    'early_stopping_rounds': 500,\n",
"    'loss_function': 'RMSE',\n",
"    'eval_metric': 'MAE',\n",
"    'task_type': 'CPU',\n",
"    'cat_features': ['community'],\n",
"}\n" ], "metadata": { "id": "PRyAwOaTCjw1" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# NOTE(review): the validation fold drives early stopping here and is scored again below,\n",
"# so the reported accuracy is presumably optimistic - confirm on a held-out set.\n",
"model = CatBoostRegressor(**regression_params)\n",
"model.fit(X, y, eval_set=(X_val, y_val));" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vSL8GlhyCZ2n", "outputId": "ce8a65b1-edaf-44b3-bf69-e3c70febb28b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "0:\tlearn: 37709.1413245\ttest: 37499.9541219\tbest: 37499.9541219 (0)\ttotal: 848us\tremaining: 4.24s\n", "100:\tlearn: 9375.1949588\ttest: 
8095.7973147\tbest: 8082.1061214 (99)\ttotal: 60.6ms\tremaining: 2.94s\n", "200:\tlearn: 6081.2680389\ttest: 6465.3010380\tbest: 6465.3010380 (200)\ttotal: 185ms\tremaining: 4.41s\n", "300:\tlearn: 4427.3214133\ttest: 6545.6179437\tbest: 6361.4411869 (251)\ttotal: 397ms\tremaining: 6.19s\n", "400:\tlearn: 3395.6664363\ttest: 6681.1132311\tbest: 6361.4411869 (251)\ttotal: 551ms\tremaining: 6.31s\n", "500:\tlearn: 2712.4185987\ttest: 6611.7024042\tbest: 6361.4411869 (251)\ttotal: 732ms\tremaining: 6.58s\n", "600:\tlearn: 2258.2942629\ttest: 6603.5144130\tbest: 6361.4411869 (251)\ttotal: 833ms\tremaining: 6.1s\n", "700:\tlearn: 1912.3338542\ttest: 6570.2824488\tbest: 6361.4411869 (251)\ttotal: 993ms\tremaining: 6.09s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 6361.441187\n", "bestIteration = 251\n", "\n", "Shrink model to first 252 iterations.\n" ] } ] },
{ "cell_type": "code", "source": [ "from sklearn.metrics import mean_absolute_percentage_error" ], "metadata": { "id": "N7iYT6eBCfd9" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"y_pred = model.predict(X_val)\n",
"\n",
"# Accuracy as 1 - MAPE. The metric's signature is (y_true, y_pred): the error must be\n",
"# divided by the true prices, so the ground truth goes first.\n",
"1 - mean_absolute_percentage_error(y_val, y_pred)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oFTHqEcPDflg", "outputId": "f1fe7202-d1ed-49eb-abb1-7f114d7f44da" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9550957107941889" ] }, "metadata": {}, "execution_count": 405 } ] },
{ "cell_type": "code", "source": [ "y_val" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-4SgdPmkLcja", "outputId": "5dc0d00e-7dc0-44cb-df2d-35104cbb3e1b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([193000. , 226700. , 232200. , 219600. ,\n", " 206000. , 202200. , 176583.33333333, 96000. ,\n", " 117000. , 118200. , 129540. , 141500. ,\n", " 128850. , 135500. , 135500. 
, 117500. ,\n", " 113500. , 108500. , 186040. , 183666.66666667,\n", " 106333.33333333, 131033.33333333, 115400. , 121500. ,\n", " 109000. , 92500. ])" ] }, "metadata": {}, "execution_count": 406 } ] },
{ "cell_type": "code", "source": [ "y_pred" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IeaNjO_oLcoZ", "outputId": "ce1e60be-5e1b-4526-b99c-e42c1aa7a248" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([202911.61234791, 236242.42223859, 233670.36797393, 210230.78031406,\n", " 183976.26830126, 200929.81459329, 186790.2239417 , 103177.72064145,\n", " 123926.00722971, 120629.70265349, 132298.84564404, 139909.99354828,\n", " 109067.28903354, 138292.26372966, 140931.85016274, 114561.81108316,\n", " 115840.26418613, 109029.5771292 , 182236.07980921, 179040.32183855,\n", " 112353.82995154, 132299.89543314, 122734.93978517, 131085.9483305 ,\n", " 111778.34988534, 103989.33626743])" ] }, "metadata": {}, "execution_count": 407 } ] },
{ "cell_type": "code", "source": [
"# Spot-check one hand-written unit; keys follow the training column order.\n",
"model.predict(pd.DataFrame([\n",
"    {'area': 41, 'num_beds': 1, 'days_to_done': 200, 'community': 'ЖК «Спортивный парк»'},\n",
"]))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dokS6aD0D71b", "outputId": "5579ebe6-d114-4ebb-fe75-d1286aadb483" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([118381.91800424])" ] }, "metadata": {}, "execution_count": 423 } ] },
{ "cell_type": "code", "source": [
"# Feature importance computed on the training pool, largest first.\n",
"(\n",
"    pd.DataFrame({\n",
"        'feature_importance': model.get_feature_importance(Pool(X, y, cat_features=['community'])),\n",
"        'feature_names': X.columns,\n",
"    })\n",
"    .sort_values(by=['feature_importance'], ascending=False)\n",
")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 175 }, "id": "T-c2wrQSDlPk", "outputId": "73959236-8520-4343-e09d-3d829f733471" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { 
"text/plain": [ " feature_importance feature_names\n", "3 43.253428 community\n", "2 20.491219 days_to_done\n", "0 19.305202 area\n", "1 16.950151 num_beds" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
feature_importancefeature_names
343.253428community
220.491219days_to_done
019.305202area
116.950151num_beds
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 424 } ] },
{ "cell_type": "code", "source": [
"# First five validation rows.\n",
"X_val.head(n=5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "4-Qa4p3BDrIk", "outputId": "b9c0485c-6d6d-4148-cb1b-3ef678f0ea91" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " area num_beds days_to_done community\n", "6 49.0 1 233 ЖК «AVrorA», \n", "7 49.0 1 -52 ЖК «AVrorA», \n", "13 52.0 1 -52 ЖК «AVrorA», \n", "22 67.0 2 -52 ЖК «AVrorA», \n", "24 70.0 2 233 ЖК «AVrorA», " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
areanum_bedsdays_to_donecommunity
649.01233ЖК «AVrorA»,
749.01-52ЖК «AVrorA»,
1352.01-52ЖК «AVrorA»,
2267.02-52ЖК «AVrorA»,
2470.02233ЖК «AVrorA»,
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 388 } ] },
{ "cell_type": "code", "source": [
"# A single row (a Series) yields one scalar prediction.\n",
"model.predict(X_val.iloc[0])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WV_8itnWLsuX", "outputId": "396bf26b-37e2-46d5-9837-5f1ffdc5bfc3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "202911.6123479114" ] }, "metadata": {}, "execution_count": 390 } ] },
{ "cell_type": "code", "source": [ "model.predict(pd.DataFrame({'area': [44], 'num_beds': [1], 'days_to_done': [400], 'community':['ЖК «AVrorA»,']}))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uRNrzco4N-FJ", "outputId": "c3c80908-1cb7-45a8-f8e0-60809b1d35a1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([197725.156463])" ] }, "metadata": {}, "execution_count": 434 } ] },
{ "cell_type": "code", "source": [
"import pickle\n",
"\n",
"# Persist the fitted model. Only unpickle this file from a trusted source.\n",
"with open('model_pervichka.pkl', 'wb') as model_file:\n",
"    pickle.dump(model, model_file)" ], "metadata": { "id": "wBBuVC7QOSFL" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Distinct feature combinations present in the modelling table.\n",
"d = test_df.drop(columns=['price_per_m']).drop_duplicates()" ], "metadata": { "id": "CKZW2i5YQAd-" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "d.num_beds.value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z5iZsIoeSm2P", "outputId": "dc444da6-bb07-4796-e790-4538d532e9d5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1 51\n", "2 38\n", "3 34\n", "0 2\n", "Name: num_beds, dtype: int64" ] }, "metadata": {}, "execution_count": 441 } ] },
{ "cell_type": "code", "source": [
"# Was the Cyrillic letter 'в' (an undefined name); the saved 125-row output is the frame `d`.\n",
"d" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "Xz472NLWTLU3", "outputId": "cd37fffc-c003-4a30-f77c-c17222199844" }, "execution_count": null, "outputs": [ { "output_type": 
"execute_result", "data": { "text/plain": [ " area num_beds days_to_done community\n", "0 44.0 1 233 ЖК «AVrorA»,\n", "1 44.0 1 -52 ЖК «AVrorA»,\n", "3 47.0 1 233 ЖК «AVrorA»,\n", "4 47.0 1 -52 ЖК «AVrorA»,\n", "6 49.0 1 233 ЖК «AVrorA»,\n", ".. ... ... ... ...\n", "293 45.0 1 -52 ЖК «Спортивный парк»\n", "297 41.0 1 -144 ЖК «Спортивный парк»\n", "299 45.0 1 -144 ЖК «Спортивный парк»\n", "302 48.0 1 -144 ЖК «Спортивный парк»\n", "304 51.0 1 -144 ЖК «Спортивный парк»\n", "\n", "[125 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
areanum_bedsdays_to_donecommunity
044.01233ЖК «AVrorA»,
144.01-52ЖК «AVrorA»,
347.01233ЖК «AVrorA»,
447.01-52ЖК «AVrorA»,
649.01233ЖК «AVrorA»,
...............
29345.01-52ЖК «Спортивный парк»
29741.01-144ЖК «Спортивный парк»
29945.01-144ЖК «Спортивный парк»
30248.01-144ЖК «Спортивный парк»
30451.01-144ЖК «Спортивный парк»
\n", "

125 rows × 4 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 442 } ] },
{ "cell_type": "code", "source": [
"# Reload the cleaned listings for the scoring pass; rows without area or completion label are dropped.\n",
"df = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/krasnodar_data_final.csv')\n",
"df = df.dropna(subset=['area', 'done_date'])" ], "metadata": { "id": "FUArSeHiUQk9" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Keep the first row of every listing; price, date, floor and completion label are ignored when comparing.\n",
"identity_cols = df.drop(columns=['price_per_m', 'date', 'floor', 'done_date']).columns\n",
"df = df.loc[df[identity_cols].drop_duplicates().index]" ], "metadata": { "id": "F93iEA_4UUmS" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Every label from '4 квартал 2021' to '18 квартал 2021' collapses to '4 квартал 2021'.\n",
"rep = {f'{quarter} квартал 2021': '4 квартал 2021' for quarter in range(4, 19)}\n",
"\n",
"# Completion label -> calendar date: the 15th of the quarter's last month.\n",
"# 'дом сдан' is mapped to a date far in the past.\n",
"kvartal_to_date = {\n",
"    '4 квартал 2021': '2021-12-15',\n",
"    '2 квартал 2022': '2022-06-15',\n",
"    '3 квартал 2022': '2022-09-15',\n",
"    '1 квартал 2023': '2023-03-15',\n",
"    '2 квартал 2023': '2023-06-15',\n",
"    '3 квартал 2023': '2023-09-15',\n",
"    '4 квартал 2023': '2023-12-15',\n",
"    '1 квартал 2024': '2024-03-15',\n",
"    '2 квартал 2024': '2024-06-15',\n",
"    '3 квартал 2024': '2024-09-15',\n",
"    '4 квартал 2024': '2024-12-15',\n",
"    'дом сдан': '1990-12-15',\n",
"}\n",
"\n",
"df['done_date'] = pd.to_datetime(df['done_date'].replace(rep).replace(kvartal_to_date))\n",
"df['date'] = pd.to_datetime(df['date'])\n",
"df['days_to_done'] = df['done_date'] - 
df['date']\n",
"# Whole days until completion, read straight from the timedelta.\n",
"df['days_to_done'] = df['days_to_done'].dt.days\n",
"\n",
"# Drop rows whose completion date is the far-past placeholder. The copy makes `df` an\n",
"# independent frame, so adding columns below no longer raises SettingWithCopyWarning.\n",
"df = df[df['days_to_done'] > -2000].copy()" ], "metadata": { "id": "YGVXldFpVK3r" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "import uuid" ], "metadata": { "id": "YQ8yCNOUWfBH" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# One random identifier per unit; ids change on every run.\n",
"df['id'] = [uuid.uuid4() for _ in range(len(df))]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uqaytktLVtEG", "outputId": "4987f94a-12b5-4054-e04f-6614ddaa2f87" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['id'] = [uuid.uuid4() for i in range(df.shape[0])]\n" ] } ] },
{ "cell_type": "code", "source": [ "df = df.reset_index(drop=True)" ], "metadata": { "id": "PEhBbLi2WAjW" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Match the whitespace-stripped community names the model was trained on.\n",
"df['community'] = df['community'].apply(lambda x: x.strip())" ], "metadata": { "id": "OzeORCqdYs-i" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "model.predict(pd.DataFrame({'area': [58.0], 'num_beds': [1], 'days_to_done': [1000], 'community':['ЖК «Novella»']}))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3Y17izGaXrBB", "outputId": "98da920d-41b9-4b94-cab9-e47b969f39f0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([135260.44035891])" ] }, "metadata": {}, "execution_count": 499 } ] },
{ "cell_type": "code", "source": [ "df.to_csv('info_final.csv', index=False)" ], "metadata": { "id": "fVwsL2LcX9g6" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"import tqdm\n",
"\n",
"# Days-to-completion horizons at which every unit is re-priced.\n",
"HORIZONS = [1000, 700, 500, 300, 200, 100, 50, 0, -50, -100]\n",
"\n",
"\n",
"def predict_total_price(features, area):\n",
"    \"\"\"Return the model's `price_per_m` prediction for `features` times `area`, as an int.\"\"\"\n",
"    return int(round(float(model.predict(features)) * area))\n",
"\n",
"\n",
"preds = []\n",
"for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):\n",
"    # Sweep the horizons on a private copy of the model inputs.\n",
"    features = row[model.feature_names_].copy()\n",
"    days_now = int(row['days_to_done'])\n",
"    price_now = predict_total_price(features, row.area)\n",
"    subjs = {\n",
"        'id': str(row['id']),\n",
"        'days_to_done': days_now,\n",
"        'price_at_moment': price_now,\n",
"    }\n",
"\n",
"    prices_after = []\n",
"    for days in HORIZONS:\n",
"        features['days_to_done'] = days\n",
"        pred = predict_total_price(features, row.area)\n",
"        subjs[f'{days}_to_done'] = pred\n",
"\n",
"        # Only horizons with fewer days left than the unit has now are candidate exit points.\n",
"        if days < days_now:\n",
"            prices_after.append(pred)\n",
"            # A new maximum (ties go to the later horizon) becomes the recommended exit.\n",
"            if pred == max(prices_after):\n",
"                subjs['max_price_after_invest'] = pred\n",
"                subjs['increase'] = pred - price_now\n",
"                subjs['increase_procent'] = round(subjs['increase'] / price_now * 100, 2)\n",
"                subjs['days_for_increase'] = days_now - days\n",
"\n",
"    if not prices_after:\n",
"        # The unit is already at or past the last horizon: report no upside\n",
"        # instead of failing on the keys read below.\n",
"        subjs.update(max_price_after_invest=price_now, increase=0,\n",
"                     increase_procent=0.0, days_for_increase=0)\n",
"\n",
"    subjs['status'] = 'long' if subjs['days_for_increase'] >= 365 else 'short'\n",
"\n",
"    # Labels are kept byte-for-byte (including the spelling 'riskey') for existing consumers.\n",
"    if subjs['increase_procent'] >= 7:\n",
"        subjs['risk'] = 'good'\n",
"    elif subjs['increase_procent'] <= 0:\n",
"        subjs['risk'] = 'bad'\n",
"    else:\n",
"        subjs['risk'] = 'riskey'\n",
"\n",
"    preds.append(subjs)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OVh5D4kQX-0J", "outputId": "6ec9394b-4554-4c0d-d40a-8e696c56b71b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "153it [00:03, 44.60it/s]\n" ] } ] },
{ "cell_type": "code", "source": [ "f = pd.DataFrame(preds)" ], "metadata": { "id": "f5ifJU9habaL" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "f.risk.value_counts()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2GJK98WAmwRj", "outputId": "5ca8d789-1afe-4051-f611-bf62eea3bdfb" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "good 67\n", "riskey 47\n", "bad 39\n", "Name: risk, dtype: int64" ] }, "metadata": {}, "execution_count": 587 } ] },
{ "cell_type": "code", "source": [
"# `preds` stores ids as text, so the join key must be text on this side too.\n",
"df['id'] = df['id'].astype(str)" ], "metadata": { "id": "1YzsbENSnDPx" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# `days_to_done` exists on both sides, so the CSV carries `days_to_done_x` and `days_to_done_y`.\n",
"f.merge(df[['community', 'liter_num', 'done_date', 'developer', 'floor', 'area',\n",
"            'num_beds', 'days_to_done', 'id']], on='id', how='left').to_csv('merged_stats_final.csv', index=False)" ], "metadata": { "id": "_7FnZAE_m-Bt" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"import json\n",
"\n",
"# The handle gets its own name: `f` is the results DataFrame and the plotting cell still needs it.\n",
"with open('stats_final.json', 'w') as json_file:\n",
"    json.dump(preds, json_file)" ], "metadata": { "id": "oHMm4LIKb02q" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Horizon price columns in sweep order (most days left first).\n",
"cols = ['1000_to_done', '700_to_done',\n",
"        '500_to_done', '300_to_done', '200_to_done',\n",
"        '100_to_done', '50_to_done', '0_to_done', '-50_to_done',\n",
"        '-100_to_done']" ], "metadata": { "id": "BfqXQdrSiN6_" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "import plotly.express as px" ], "metadata": { "id": "lDjZh9gVjCvu" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Price curves of the units with more than 15 % predicted upside, one column per unit.\n",
"t = f[f['increase_procent'] > 15][cols].T" ], "metadata": { "id": "eY5RJbHIeWvY" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "fig = px.line(t)\n", "fig.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 542 }, "id": "4mg4YGkWiivI", "outputId": "314a02a7-1a9c-4fe4-b4d5-3d5b73a24ac0" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "\n", "\n", "\n", "
\n", "
\n", "\n", "" ] }, "metadata": {} } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "Ciovt8RxjbuC" }, "execution_count": null, "outputs": [] } ] }