{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np"
],
"metadata": {
"id": "1wCgQ3uby0j8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = pd.read_excel('/content/krasnodar_hack_data.xlsx')\n",
"df['date'] = df['date'].apply(lambda x: x[1:-1])\n",
"df['date'] = pd.to_datetime(df['date'])\n",
"df['floor'] = df['floor'].apply(lambda x: str(x).replace('--', '-'))\n",
"df['liter_num'] = df['liter_num'].apply(lambda x: str(x).replace('--', '-'))\n",
"df['price_per_m'] = df['price_per_m'].apply(lambda x: x.replace('\\xa0', '').replace(' ', '')).astype(int)\n",
"df['num_beds'] = df['num_beds'].astype(int)\n",
"df['area'] = df['area'].apply(lambda x: str(x).replace(',', '.')).astype(float)\n",
"df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "zYk8ByTx8DXL",
"outputId": "909d53dd-a9a5-4185-dda0-49fd511f3055"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" community liter_num done_date developer floor \\\n",
"0 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 1-4 \n",
"1 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 5-8 \n",
"2 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 9-12 \n",
"3 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 13-16 \n",
"4 ЖК «Смородина» 1-5 1 квартал 2023 ООО «АЛЬФА» 1-4 \n",
"... ... ... ... ... ... \n",
"1284 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n",
"1285 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n",
"1286 ЖК «Role Clef» nan 2 полугодие 2021 ООО «Уютный дом» 2-12 \n",
"1287 ЖК «Мелодия» nan дом сдан NaN 5 \n",
"1288 ЖК «Мелодия» nan дом сдан NaN 11 \n",
"\n",
" area num_beds price_per_m date \n",
"0 32.95 1 118000 2022-01-25 \n",
"1 32.95 1 119000 2022-01-25 \n",
"2 32.95 1 120000 2022-01-25 \n",
"3 32.95 1 121000 2022-01-25 \n",
"4 35.20 1 117000 2022-01-25 \n",
"... ... ... ... ... \n",
"1284 NaN 1 586000 2023-02-15 \n",
"1285 NaN 2 400000 2023-02-15 \n",
"1286 NaN 3 395000 2023-02-15 \n",
"1287 NaN 2 94000 2023-02-15 \n",
"1288 NaN 2 94000 2023-02-15 \n",
"\n",
"[1289 rows x 9 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" community | \n",
" liter_num | \n",
" done_date | \n",
" developer | \n",
" floor | \n",
" area | \n",
" num_beds | \n",
" price_per_m | \n",
" date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ЖК «Смородина» | \n",
" 1-5 | \n",
" 1 квартал 2023 | \n",
" ООО «АЛЬФА» | \n",
" 1-4 | \n",
" 32.95 | \n",
" 1 | \n",
" 118000 | \n",
" 2022-01-25 | \n",
"
\n",
" \n",
" 1 | \n",
" ЖК «Смородина» | \n",
" 1-5 | \n",
" 1 квартал 2023 | \n",
" ООО «АЛЬФА» | \n",
" 5-8 | \n",
" 32.95 | \n",
" 1 | \n",
" 119000 | \n",
" 2022-01-25 | \n",
"
\n",
" \n",
" 2 | \n",
" ЖК «Смородина» | \n",
" 1-5 | \n",
" 1 квартал 2023 | \n",
" ООО «АЛЬФА» | \n",
" 9-12 | \n",
" 32.95 | \n",
" 1 | \n",
" 120000 | \n",
" 2022-01-25 | \n",
"
\n",
" \n",
" 3 | \n",
" ЖК «Смородина» | \n",
" 1-5 | \n",
" 1 квартал 2023 | \n",
" ООО «АЛЬФА» | \n",
" 13-16 | \n",
" 32.95 | \n",
" 1 | \n",
" 121000 | \n",
" 2022-01-25 | \n",
"
\n",
" \n",
" 4 | \n",
" ЖК «Смородина» | \n",
" 1-5 | \n",
" 1 квартал 2023 | \n",
" ООО «АЛЬФА» | \n",
" 1-4 | \n",
" 35.20 | \n",
" 1 | \n",
" 117000 | \n",
" 2022-01-25 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1284 | \n",
" ЖК «Role Clef» | \n",
" nan | \n",
" 2 полугодие 2021 | \n",
" ООО «Уютный дом» | \n",
" 2-12 | \n",
" NaN | \n",
" 1 | \n",
" 586000 | \n",
" 2023-02-15 | \n",
"
\n",
" \n",
" 1285 | \n",
" ЖК «Role Clef» | \n",
" nan | \n",
" 2 полугодие 2021 | \n",
" ООО «Уютный дом» | \n",
" 2-12 | \n",
" NaN | \n",
" 2 | \n",
" 400000 | \n",
" 2023-02-15 | \n",
"
\n",
" \n",
" 1286 | \n",
" ЖК «Role Clef» | \n",
" nan | \n",
" 2 полугодие 2021 | \n",
" ООО «Уютный дом» | \n",
" 2-12 | \n",
" NaN | \n",
" 3 | \n",
" 395000 | \n",
" 2023-02-15 | \n",
"
\n",
" \n",
" 1287 | \n",
" ЖК «Мелодия» | \n",
" nan | \n",
" дом сдан | \n",
" NaN | \n",
" 5 | \n",
" NaN | \n",
" 2 | \n",
" 94000 | \n",
" 2023-02-15 | \n",
"
\n",
" \n",
" 1288 | \n",
" ЖК «Мелодия» | \n",
" nan | \n",
" дом сдан | \n",
" NaN | \n",
" 11 | \n",
" NaN | \n",
" 2 | \n",
" 94000 | \n",
" 2023-02-15 | \n",
"
\n",
" \n",
"
\n",
"
1289 rows × 9 columns
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 22
}
]
},
{
"cell_type": "code",
"source": [
"df.to_csv('krasnodar_data_final.csv', index=False)"
],
"metadata": {
"id": "v7B0OICi8LOx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/krasnodar_data_final.csv')\n",
"df = df[~df['area'].isna()]\n",
"df = df[~df['done_date'].isna()]\n"
],
"metadata": {
"id": "UNgPSTGM8Ypj"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "mgrkF997ncDd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Mk8Qx67bsjVX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df_uniq = df.drop(columns=['price_per_m', 'date', 'floor']).drop_duplicates() #.to_csv('unique_communities.csv', index=False)"
],
"metadata": {
"id": "iZBXk0P7ndiS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = df.fillna('nan')"
],
"metadata": {
"id": "Wf8Al8YdnlFk"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t = df.groupby(['community', 'liter_num', 'done_date', 'developer', 'area', 'num_beds', 'date'])['price_per_m'].mean()"
],
"metadata": {
"id": "1QEUL1kKpY9I"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t = t.reset_index()"
],
"metadata": {
"id": "344wp42UpZwo"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t.community.value_counts()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PE9rV5olp36f",
"outputId": "64bfe96a-47ee-43f9-8de9-a9c768444595"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ЖК «Novella» 53\n",
"ЖК «Зеленодар» 47\n",
"ЖК «AVrorA», 42\n",
"ЖК «Смородина» 40\n",
"ЖК «Fresh» 39\n",
"ЖК «The Grand Palace» 21\n",
"ЖК «Небо» 20\n",
"ЖК «URAL», 18\n",
"ЖК «Спортивный парк» 17\n",
"ЖК «Дом на Лаврова» 8\n",
"Name: community, dtype: int64"
]
},
"metadata": {},
"execution_count": 144
}
]
},
{
"cell_type": "code",
"source": [
"rep = {\n",
" '4 квартал 2021' : \"4 квартал 2021\",\n",
" '9 квартал 2021' : \"4 квартал 2021\",\n",
" '8 квартал 2021' : \"4 квартал 2021\",\n",
" '7 квартал 2021' : \"4 квартал 2021\",\n",
" '6 квартал 2021' : \"4 квартал 2021\",\n",
" '5 квартал 2021' : \"4 квартал 2021\",\n",
" '18 квартал 2021' : \"4 квартал 2021\",\n",
" '17 квартал 2021' : \"4 квартал 2021\",\n",
" '16 квартал 2021' : \"4 квартал 2021\",\n",
" '14 квартал 2021' : \"4 квартал 2021\",\n",
" '13 квартал 2021' : \"4 квартал 2021\",\n",
" '12 квартал 2021' : \"4 квартал 2021\",\n",
" '11 квартал 2021' : \"4 квартал 2021\",\n",
" '10 квартал 2021' : \"4 квартал 2021\",\n",
" '15 квартал 2021' : \"4 квартал 2021\",\n",
"}\n",
"kvartal_to_date = {\n",
" '4 квартал 2021': '2021-12-15',\n",
" '2 квартал 2022': '2022-06-15',\n",
" '3 квартал 2022': '2022-09-15',\n",
" '1 квартал 2023': '2023-03-15',\n",
" '2 квартал 2023': '2023-06-15',\n",
" '3 квартал 2023': '2023-09-15',\n",
" '4 квартал 2023': '2023-12-15',\n",
" '1 квартал 2024': '2024-03-15',\n",
" '2 квартал 2024': '2024-06-15',\n",
" '3 квартал 2024': '2024-09-15',\n",
" '4 квартал 2024': '2024-12-15',\n",
" 'дом сдан': '1990-12-15'\n",
"}"
],
"metadata": {
"id": "OjfuBFcqs1pa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t['done_date'] = t.done_date.replace(rep)"
],
"metadata": {
"id": "HnG-tuphqo29"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Hr0tYXDnq7Xx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t.done_date = t.done_date.replace(kvartal_to_date)"
],
"metadata": {
"id": "1YCU5SSkuXMB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t.done_date = pd.to_datetime(t.done_date)\n",
"t.date = pd.to_datetime(t.date)\n",
"t['days_to_done'] = t['done_date'] - t['date']\n",
"t['days_to_done'] = t['days_to_done'].apply(lambda x: int(str(x).split()[0]))\n"
],
"metadata": {
"id": "LaxUGtDRubue"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "az33D3T0u1Fe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "xJxBzp7Bwe2i"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"trend = []\n",
"for idx, row in t.iterrows():\n",
" if row.date == pd.to_datetime('2022-01-25'):\n",
" trend.append(0)\n",
" continue\n",
" elif row.date == pd.to_datetime('2022-11-06') and row.area == t.iloc[idx-1].area:\n",
" trend.append(row.price_per_m/(t.iloc[idx-1].price_per_m)*100)\n",
" continue\n",
" elif row.date == pd.to_datetime('2023-02-15') and row.area == t.iloc[idx-1].area:\n",
" trend.append(row.price_per_m/(t.iloc[idx-2].price_per_m)*100)\n",
" continue\n",
" else:\n",
" trend.append(0)"
],
"metadata": {
"id": "yGCs4h8V1izN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t['trend'] = trend"
],
"metadata": {
"id": "B6LkDXQ123BG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t['trend'] = t.trend.apply(lambda x: round(x, 2))"
],
"metadata": {
"id": "65q_WpTE24zn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t = t[~((t['trend'] < 100) & (t['trend'] > 50))]"
],
"metadata": {
"id": "MY0wDX8s388F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"same_cols = ['community', 'liter_num', 'done_date', 'developer', 'area', 'num_beds']"
],
"metadata": {
"id": "3ZviqlKJHHga"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"new_df = []\n",
"for idx, row in t.iterrows():\n",
" if row.date == pd.to_datetime('2022-01-25'):\n",
" i = 1\n",
" if idx+i < t.shape[0]:\n",
" if (t.iloc[idx+i][same_cols] == row[same_cols]).values.sum() == 6:\n",
" row['price_per_m'] = t.iloc[idx+i]['price_per_m']\n",
" row['trend'] = t.iloc[idx+i]['trend']\n",
"\n",
" new_df.append(row)\n",
" if row.date == pd.to_datetime('2022-11-06'):\n",
" i = 1\n",
" if idx+i < t.shape[0]:\n",
" if (t.iloc[idx+i][same_cols] == row[same_cols]).values.sum() == 6:\n",
" row['price_per_m'] = t.iloc[idx+i]['price_per_m']\n",
" row['trend'] = t.iloc[idx+i]['trend']\n",
"\n",
" new_df.append(row)"
],
"metadata": {
"id": "93JSoKBK941f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"new_df = pd.DataFrame(new_df)"
],
"metadata": {
"id": "EqxOuJDpH5OG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"new_df = new_df[~(new_df['trend'] == 0)]"
],
"metadata": {
"id": "Nu_gJz4cIDAf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "rgQgfyA3ItLS",
"outputId": "7ce259e9-638d-4317-db58-eb8fed976978"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" community liter_num done_date developer \\\n",
"0 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n",
"1 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n",
"2 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n",
"3 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n",
"4 ЖК «AVrorA», nan 2022-09-15 ООО «АльфаСтройКомплекс» \n",
".. ... ... ... ... \n",
"300 ЖК «Спортивный парк» 7 2022-06-15 ООО «АЛЬФА» \n",
"301 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n",
"302 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n",
"303 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n",
"304 ЖК «Спортивный парк» 8 2022-06-15 ООО «АЛЬФА» \n",
"\n",
" area num_beds date price_per_m days_to_done trend \n",
"0 44.0 1 2022-01-25 187000.0 233 0.00 \n",
"1 44.0 1 2022-11-06 202000.0 -52 108.02 \n",
"2 44.0 1 2023-02-15 236600.0 -153 126.52 \n",
"3 47.0 1 2022-01-25 200750.0 233 0.00 \n",
"4 47.0 1 2022-11-06 205750.0 -52 102.49 \n",
".. ... ... ... ... ... ... \n",
"300 45.0 1 2023-02-15 101800.0 -245 117.01 \n",
"301 48.0 1 2022-01-25 84000.0 141 0.00 \n",
"302 48.0 1 2022-11-06 98000.0 -144 116.67 \n",
"303 51.0 1 2022-01-25 79000.0 141 0.00 \n",
"304 51.0 1 2022-11-06 92500.0 -144 117.09 \n",
"\n",
"[300 rows x 10 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" community | \n",
" liter_num | \n",
" done_date | \n",
" developer | \n",
" area | \n",
" num_beds | \n",
" date | \n",
" price_per_m | \n",
" days_to_done | \n",
" trend | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" ЖК «AVrorA», | \n",
" nan | \n",
" 2022-09-15 | \n",
" ООО «АльфаСтройКомплекс» | \n",
" 44.0 | \n",
" 1 | \n",
" 2022-01-25 | \n",
" 187000.0 | \n",
" 233 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 1 | \n",
" ЖК «AVrorA», | \n",
" nan | \n",
" 2022-09-15 | \n",
" ООО «АльфаСтройКомплекс» | \n",
" 44.0 | \n",
" 1 | \n",
" 2022-11-06 | \n",
" 202000.0 | \n",
" -52 | \n",
" 108.02 | \n",
"
\n",
" \n",
" 2 | \n",
" ЖК «AVrorA», | \n",
" nan | \n",
" 2022-09-15 | \n",
" ООО «АльфаСтройКомплекс» | \n",
" 44.0 | \n",
" 1 | \n",
" 2023-02-15 | \n",
" 236600.0 | \n",
" -153 | \n",
" 126.52 | \n",
"
\n",
" \n",
" 3 | \n",
" ЖК «AVrorA», | \n",
" nan | \n",
" 2022-09-15 | \n",
" ООО «АльфаСтройКомплекс» | \n",
" 47.0 | \n",
" 1 | \n",
" 2022-01-25 | \n",
" 200750.0 | \n",
" 233 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 4 | \n",
" ЖК «AVrorA», | \n",
" nan | \n",
" 2022-09-15 | \n",
" ООО «АльфаСтройКомплекс» | \n",
" 47.0 | \n",
" 1 | \n",
" 2022-11-06 | \n",
" 205750.0 | \n",
" -52 | \n",
" 102.49 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 300 | \n",
" ЖК «Спортивный парк» | \n",
" 7 | \n",
" 2022-06-15 | \n",
" ООО «АЛЬФА» | \n",
" 45.0 | \n",
" 1 | \n",
" 2023-02-15 | \n",
" 101800.0 | \n",
" -245 | \n",
" 117.01 | \n",
"
\n",
" \n",
" 301 | \n",
" ЖК «Спортивный парк» | \n",
" 8 | \n",
" 2022-06-15 | \n",
" ООО «АЛЬФА» | \n",
" 48.0 | \n",
" 1 | \n",
" 2022-01-25 | \n",
" 84000.0 | \n",
" 141 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 302 | \n",
" ЖК «Спортивный парк» | \n",
" 8 | \n",
" 2022-06-15 | \n",
" ООО «АЛЬФА» | \n",
" 48.0 | \n",
" 1 | \n",
" 2022-11-06 | \n",
" 98000.0 | \n",
" -144 | \n",
" 116.67 | \n",
"
\n",
" \n",
" 303 | \n",
" ЖК «Спортивный парк» | \n",
" 8 | \n",
" 2022-06-15 | \n",
" ООО «АЛЬФА» | \n",
" 51.0 | \n",
" 1 | \n",
" 2022-01-25 | \n",
" 79000.0 | \n",
" 141 | \n",
" 0.00 | \n",
"
\n",
" \n",
" 304 | \n",
" ЖК «Спортивный парк» | \n",
" 8 | \n",
" 2022-06-15 | \n",
" ООО «АЛЬФА» | \n",
" 51.0 | \n",
" 1 | \n",
" 2022-11-06 | \n",
" 92500.0 | \n",
" -144 | \n",
" 117.09 | \n",
"
\n",
" \n",
"
\n",
"
300 rows × 10 columns
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 358
}
]
},
{
"cell_type": "code",
"source": [
"t['date'].unique()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7vWFl3uoIMnm",
"outputId": "1d5dbc94-c32c-4426-a324-4a6186f44773"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['2022-01-25T00:00:00.000000000', '2022-11-06T00:00:00.000000000',\n",
" '2023-02-15T00:00:00.000000000'], dtype='datetime64[ns]')"
]
},
"metadata": {},
"execution_count": 365
}
]
},
{
"cell_type": "code",
"source": [
"test_df = new_df[['area', 'num_beds', 'price_per_m', 'days_to_done', 'community']]\n",
"test_df = test_df[test_df['days_to_done'] > -2000]\n",
"test_df['community'] = test_df['community'].apply(lambda x: x.strip())"
],
"metadata": {
"id": "GsddQYnB_hi5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.model_selection import StratifiedKFold"
],
"metadata": {
"id": "iIV_dc5N_nZB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0xBACED)"
],
"metadata": {
"id": "g-1NcmetADYQ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"for fold, (train_index, val_index) in enumerate(kfold.split(test_df, test_df['days_to_done'])):\n",
" if fold == 0:\n",
" df_train = test_df.iloc[train_index]\n",
" df_val = test_df.iloc[val_index]\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QbgRtLQ-AI89",
"outputId": "18b36f27-7059-470d-eb7a-5e7e1c0e03db"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n",
" warnings.warn(\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"X = df_train.drop(columns=['price_per_m'])\n",
"y = df_train['price_per_m'].values\n",
"\n",
"X_val = df_val.drop(columns=['price_per_m'])\n",
"y_val = df_val['price_per_m'].values"
],
"metadata": {
"id": "asPYq7hz_uAR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install shap catboost -qqq"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "clgz4UO8A92Y",
"outputId": "5f8340de-1a3d-4f02-b895-0c64e09adfa0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m575.9/575.9 KB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h"
]
}
]
},
{
"cell_type": "code",
"source": [
"from catboost import CatBoostRegressor, Pool"
],
"metadata": {
"id": "060Gc1XtCYYA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"regression_params = dict(\n",
" iterations=5000, \n",
" learning_rate=0.05,\n",
" verbose=100,\n",
" early_stopping_rounds=500,\n",
" loss_function='RMSE',\n",
" eval_metric='MAE',\n",
" task_type='CPU',\n",
" cat_features=['community']\n",
")\n"
],
"metadata": {
"id": "PRyAwOaTCjw1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = CatBoostRegressor(**regression_params).fit(X, y, eval_set=(X_val, y_val))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vSL8GlhyCZ2n",
"outputId": "ce8a65b1-edaf-44b3-bf69-e3c70febb28b"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0:\tlearn: 37709.1413245\ttest: 37499.9541219\tbest: 37499.9541219 (0)\ttotal: 848us\tremaining: 4.24s\n",
"100:\tlearn: 9375.1949588\ttest: 8095.7973147\tbest: 8082.1061214 (99)\ttotal: 60.6ms\tremaining: 2.94s\n",
"200:\tlearn: 6081.2680389\ttest: 6465.3010380\tbest: 6465.3010380 (200)\ttotal: 185ms\tremaining: 4.41s\n",
"300:\tlearn: 4427.3214133\ttest: 6545.6179437\tbest: 6361.4411869 (251)\ttotal: 397ms\tremaining: 6.19s\n",
"400:\tlearn: 3395.6664363\ttest: 6681.1132311\tbest: 6361.4411869 (251)\ttotal: 551ms\tremaining: 6.31s\n",
"500:\tlearn: 2712.4185987\ttest: 6611.7024042\tbest: 6361.4411869 (251)\ttotal: 732ms\tremaining: 6.58s\n",
"600:\tlearn: 2258.2942629\ttest: 6603.5144130\tbest: 6361.4411869 (251)\ttotal: 833ms\tremaining: 6.1s\n",
"700:\tlearn: 1912.3338542\ttest: 6570.2824488\tbest: 6361.4411869 (251)\ttotal: 993ms\tremaining: 6.09s\n",
"Stopped by overfitting detector (500 iterations wait)\n",
"\n",
"bestTest = 6361.441187\n",
"bestIteration = 251\n",
"\n",
"Shrink model to first 252 iterations.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.metrics import mean_absolute_percentage_error"
],
"metadata": {
"id": "N7iYT6eBCfd9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"y_pred = model.predict(X_val)\n",
"\n",
"1-mean_absolute_percentage_error(y_pred, y_val)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oFTHqEcPDflg",
"outputId": "f1fe7202-d1ed-49eb-abb1-7f114d7f44da"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.9550957107941889"
]
},
"metadata": {},
"execution_count": 405
}
]
},
{
"cell_type": "code",
"source": [
"y_val"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-4SgdPmkLcja",
"outputId": "5dc0d00e-7dc0-44cb-df2d-35104cbb3e1b"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([193000. , 226700. , 232200. , 219600. ,\n",
" 206000. , 202200. , 176583.33333333, 96000. ,\n",
" 117000. , 118200. , 129540. , 141500. ,\n",
" 128850. , 135500. , 135500. , 117500. ,\n",
" 113500. , 108500. , 186040. , 183666.66666667,\n",
" 106333.33333333, 131033.33333333, 115400. , 121500. ,\n",
" 109000. , 92500. ])"
]
},
"metadata": {},
"execution_count": 406
}
]
},
{
"cell_type": "code",
"source": [
"y_pred"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IeaNjO_oLcoZ",
"outputId": "ce1e60be-5e1b-4526-b99c-e42c1aa7a248"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([202911.61234791, 236242.42223859, 233670.36797393, 210230.78031406,\n",
" 183976.26830126, 200929.81459329, 186790.2239417 , 103177.72064145,\n",
" 123926.00722971, 120629.70265349, 132298.84564404, 139909.99354828,\n",
" 109067.28903354, 138292.26372966, 140931.85016274, 114561.81108316,\n",
" 115840.26418613, 109029.5771292 , 182236.07980921, 179040.32183855,\n",
" 112353.82995154, 132299.89543314, 122734.93978517, 131085.9483305 ,\n",
" 111778.34988534, 103989.33626743])"
]
},
"metadata": {},
"execution_count": 407
}
]
},
{
"cell_type": "code",
"source": [
"model.predict(pd.DataFrame({'area': [41], 'num_beds': [1], 'days_to_done': [200], 'community':['ЖК «Спортивный парк»']}))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dokS6aD0D71b",
"outputId": "5579ebe6-d114-4ebb-fe75-d1286aadb483"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([118381.91800424])"
]
},
"metadata": {},
"execution_count": 423
}
]
},
{
"cell_type": "code",
"source": [
"pd.DataFrame({'feature_importance': model.get_feature_importance(Pool(X, y, cat_features=['community'])), \n",
" 'feature_names': X.columns}).sort_values(by=['feature_importance'], \n",
" ascending=False)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 175
},
"id": "T-c2wrQSDlPk",
"outputId": "73959236-8520-4343-e09d-3d829f733471"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" feature_importance feature_names\n",
"3 43.253428 community\n",
"2 20.491219 days_to_done\n",
"0 19.305202 area\n",
"1 16.950151 num_beds"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" feature_importance | \n",
" feature_names | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" 43.253428 | \n",
" community | \n",
"
\n",
" \n",
" 2 | \n",
" 20.491219 | \n",
" days_to_done | \n",
"
\n",
" \n",
" 0 | \n",
" 19.305202 | \n",
" area | \n",
"
\n",
" \n",
" 1 | \n",
" 16.950151 | \n",
" num_beds | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 424
}
]
},
{
"cell_type": "code",
"source": [
"X_val.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "4-Qa4p3BDrIk",
"outputId": "b9c0485c-6d6d-4148-cb1b-3ef678f0ea91"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" area num_beds days_to_done community\n",
"6 49.0 1 233 ЖК «AVrorA», \n",
"7 49.0 1 -52 ЖК «AVrorA», \n",
"13 52.0 1 -52 ЖК «AVrorA», \n",
"22 67.0 2 -52 ЖК «AVrorA», \n",
"24 70.0 2 233 ЖК «AVrorA», "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" area | \n",
" num_beds | \n",
" days_to_done | \n",
" community | \n",
"
\n",
" \n",
" \n",
" \n",
" 6 | \n",
" 49.0 | \n",
" 1 | \n",
" 233 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 7 | \n",
" 49.0 | \n",
" 1 | \n",
" -52 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 13 | \n",
" 52.0 | \n",
" 1 | \n",
" -52 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 22 | \n",
" 67.0 | \n",
" 2 | \n",
" -52 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 24 | \n",
" 70.0 | \n",
" 2 | \n",
" 233 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 388
}
]
},
{
"cell_type": "code",
"source": [
"model.predict(X_val.iloc[0])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WV_8itnWLsuX",
"outputId": "396bf26b-37e2-46d5-9837-5f1ffdc5bfc3"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"202911.6123479114"
]
},
"metadata": {},
"execution_count": 390
}
]
},
{
"cell_type": "code",
"source": [
"model.predict(pd.DataFrame({'area': [44], 'num_beds': [1], 'days_to_done': [400], 'community':['ЖК «AVrorA»,']}))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uRNrzco4N-FJ",
"outputId": "c3c80908-1cb7-45a8-f8e0-60809b1d35a1"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([197725.156463])"
]
},
"metadata": {},
"execution_count": 434
}
]
},
{
"cell_type": "code",
"source": [
"import pickle\n",
"\n",
"with open('model_pervichka.pkl', 'wb') as f:\n",
" pickle.dump(model, f)"
],
"metadata": {
"id": "wBBuVC7QOSFL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"d = test_df.drop(columns=['price_per_m']).drop_duplicates()"
],
"metadata": {
"id": "CKZW2i5YQAd-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"d.num_beds.value_counts()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z5iZsIoeSm2P",
"outputId": "dc444da6-bb07-4796-e790-4538d532e9d5"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"1 51\n",
"2 38\n",
"3 34\n",
"0 2\n",
"Name: num_beds, dtype: int64"
]
},
"metadata": {},
"execution_count": 441
}
]
},
{
"cell_type": "code",
"source": [
"в"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "Xz472NLWTLU3",
"outputId": "cd37fffc-c003-4a30-f77c-c17222199844"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" area num_beds days_to_done community\n",
"0 44.0 1 233 ЖК «AVrorA»,\n",
"1 44.0 1 -52 ЖК «AVrorA»,\n",
"3 47.0 1 233 ЖК «AVrorA»,\n",
"4 47.0 1 -52 ЖК «AVrorA»,\n",
"6 49.0 1 233 ЖК «AVrorA»,\n",
".. ... ... ... ...\n",
"293 45.0 1 -52 ЖК «Спортивный парк»\n",
"297 41.0 1 -144 ЖК «Спортивный парк»\n",
"299 45.0 1 -144 ЖК «Спортивный парк»\n",
"302 48.0 1 -144 ЖК «Спортивный парк»\n",
"304 51.0 1 -144 ЖК «Спортивный парк»\n",
"\n",
"[125 rows x 4 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" area | \n",
" num_beds | \n",
" days_to_done | \n",
" community | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 44.0 | \n",
" 1 | \n",
" 233 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 1 | \n",
" 44.0 | \n",
" 1 | \n",
" -52 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 3 | \n",
" 47.0 | \n",
" 1 | \n",
" 233 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 4 | \n",
" 47.0 | \n",
" 1 | \n",
" -52 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" 6 | \n",
" 49.0 | \n",
" 1 | \n",
" 233 | \n",
" ЖК «AVrorA», | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 293 | \n",
" 45.0 | \n",
" 1 | \n",
" -52 | \n",
" ЖК «Спортивный парк» | \n",
"
\n",
" \n",
" 297 | \n",
" 41.0 | \n",
" 1 | \n",
" -144 | \n",
" ЖК «Спортивный парк» | \n",
"
\n",
" \n",
" 299 | \n",
" 45.0 | \n",
" 1 | \n",
" -144 | \n",
" ЖК «Спортивный парк» | \n",
"
\n",
" \n",
" 302 | \n",
" 48.0 | \n",
" 1 | \n",
" -144 | \n",
" ЖК «Спортивный парк» | \n",
"
\n",
" \n",
" 304 | \n",
" 51.0 | \n",
" 1 | \n",
" -144 | \n",
" ЖК «Спортивный парк» | \n",
"
\n",
" \n",
"
\n",
"
125 rows × 4 columns
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 442
}
]
},
{
"cell_type": "code",
"source": [
"df = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/krasnodar_data_final.csv')\n",
"df = df[~df['area'].isna()]\n",
"df = df[~df['done_date'].isna()]"
],
"metadata": {
"id": "FUArSeHiUQk9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df = df.loc[df.drop(columns=['price_per_m', 'date', 'floor', 'done_date']).drop_duplicates().index]"
],
"metadata": {
"id": "F93iEA_4UUmS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"rep = {\n",
" '4 квартал 2021' : \"4 квартал 2021\",\n",
" '9 квартал 2021' : \"4 квартал 2021\",\n",
" '8 квартал 2021' : \"4 квартал 2021\",\n",
" '7 квартал 2021' : \"4 квартал 2021\",\n",
" '6 квартал 2021' : \"4 квартал 2021\",\n",
" '5 квартал 2021' : \"4 квартал 2021\",\n",
" '18 квартал 2021' : \"4 квартал 2021\",\n",
" '17 квартал 2021' : \"4 квартал 2021\",\n",
" '16 квартал 2021' : \"4 квартал 2021\",\n",
" '14 квартал 2021' : \"4 квартал 2021\",\n",
" '13 квартал 2021' : \"4 квартал 2021\",\n",
" '12 квартал 2021' : \"4 квартал 2021\",\n",
" '11 квартал 2021' : \"4 квартал 2021\",\n",
" '10 квартал 2021' : \"4 квартал 2021\",\n",
" '15 квартал 2021' : \"4 квартал 2021\",\n",
"}\n",
"kvartal_to_date = {\n",
" '4 квартал 2021': '2021-12-15',\n",
" '2 квартал 2022': '2022-06-15',\n",
" '3 квартал 2022': '2022-09-15',\n",
" '1 квартал 2023': '2023-03-15',\n",
" '2 квартал 2023': '2023-06-15',\n",
" '3 квартал 2023': '2023-09-15',\n",
" '4 квартал 2023': '2023-12-15',\n",
" '1 квартал 2024': '2024-03-15',\n",
" '2 квартал 2024': '2024-06-15',\n",
" '3 квартал 2024': '2024-09-15',\n",
" '4 квартал 2024': '2024-12-15',\n",
" 'дом сдан': '1990-12-15'\n",
"}\n",
"\n",
"df['done_date'] = df['done_date'].replace(rep).replace(kvartal_to_date)\n",
"\n",
"df.done_date = pd.to_datetime(df.done_date)\n",
"df.date = pd.to_datetime(df.date)\n",
"df['days_to_done'] = df['done_date'] - df['date']\n",
"df['days_to_done'] = df['days_to_done'].apply(lambda x: int(str(x).split()[0]))\n",
"\n",
"df = df[df['days_to_done'] > -2000]"
],
"metadata": {
"id": "YGVXldFpVK3r"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import uuid"
],
"metadata": {
"id": "YQ8yCNOUWfBH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['id'] = [uuid.uuid4() for i in range(df.shape[0])]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uqaytktLVtEG",
"outputId": "4987f94a-12b5-4054-e04f-6614ddaa2f87"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
":1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df['id'] = [uuid.uuid4() for i in range(df.shape[0])]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df = df.reset_index(drop=True)"
],
"metadata": {
"id": "PEhBbLi2WAjW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df['community'] = df['community'].apply(lambda x: x.strip())"
],
"metadata": {
"id": "OzeORCqdYs-i"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.predict(pd.DataFrame({'area': [58.0], 'num_beds': [1], 'days_to_done': [1000], 'community':['ЖК «Novella»']}))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3Y17izGaXrBB",
"outputId": "98da920d-41b9-4b94-cab9-e47b969f39f0"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([135260.44035891])"
]
},
"metadata": {},
"execution_count": 499
}
]
},
{
"cell_type": "code",
"source": [
"df.to_csv('info_final.csv', index=False)"
],
"metadata": {
"id": "fVwsL2LcX9g6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import tqdm\n",
"\n",
"preds = []\n",
"for idx, row in tqdm.tqdm(df.iterrows()):\n",
" subjs = {\n",
" 'id':str(row['id']),\n",
" 'days_to_done':row['days_to_done'],\n",
" 'price_at_moment': round(model.predict(row[model.feature_names_])*row.area)\n",
" }\n",
" \n",
" prices_after = []\n",
" for days in [1000, 700, 500, 300, 200, 100, 50, 0, -50, -100]:\n",
" row['days_to_done'] = days\n",
" pred = round(model.predict(row[model.feature_names_])*row.area)\n",
" \n",
" subjs[f'{days}_to_done'] = pred\n",
"\n",
" if days < subjs['days_to_done']:\n",
" prices_after.append(pred)\n",
"\n",
" if max(prices_after) == pred:\n",
" subjs['max_price_after_invest'] = pred\n",
" subjs['increase'] = subjs['max_price_after_invest'] - subjs['price_at_moment']\n",
" subjs['increase_procent'] = round((subjs['increase']/(subjs['price_at_moment']*100))*100*100, 2)\n",
"\n",
" subjs['days_for_increase'] = subjs['days_to_done']-days\n",
"\n",
" if subjs['days_for_increase'] >= 365:\n",
" subjs['status'] = 'long'\n",
" else:\n",
" subjs['status'] = 'short'\n",
"\n",
" \n",
" if subjs['increase_procent'] >= 7:\n",
" subjs['risk'] = 'good'\n",
" elif subjs['increase_procent'] <= 0:\n",
" subjs['risk'] = 'bad'\n",
" else:\n",
" subjs['risk'] = 'riskey'\n",
"\n",
"\n",
" preds.append(subjs)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OVh5D4kQX-0J",
"outputId": "6ec9394b-4554-4c0d-d40a-8e696c56b71b"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"153it [00:03, 44.60it/s]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"f = pd.DataFrame(preds)"
],
"metadata": {
"id": "f5ifJU9habaL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"f.risk.value_counts()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2GJK98WAmwRj",
"outputId": "5ca8d789-1afe-4051-f611-bf62eea3bdfb"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"good 67\n",
"riskey 47\n",
"bad 39\n",
"Name: risk, dtype: int64"
]
},
"metadata": {},
"execution_count": 587
}
]
},
{
"cell_type": "code",
"source": [
"df.id = df.id.astype(str)"
],
"metadata": {
"id": "1YzsbENSnDPx"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"f.merge(df[['community', 'liter_num', 'done_date', 'developer', 'floor', 'area',\n",
" 'num_beds', 'days_to_done', 'id']], on='id', how='left').to_csv('merged_stats_final.csv', index=False)"
],
"metadata": {
"id": "_7FnZAE_m-Bt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import json\n",
"\n",
"with open('stats_final.json', 'w') as f:\n",
" json.dump(preds, f)"
],
"metadata": {
"id": "oHMm4LIKb02q"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"cols = ['1000_to_done', '700_to_done',\n",
" '500_to_done', '300_to_done', '200_to_done',\n",
" '100_to_done', '50_to_done', '0_to_done', '-50_to_done',\n",
" '-100_to_done']"
],
"metadata": {
"id": "BfqXQdrSiN6_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import plotly.express as px"
],
"metadata": {
"id": "lDjZh9gVjCvu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"t = f[f['increase_procent'] > 15][cols].T"
],
"metadata": {
"id": "eY5RJbHIeWvY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"fig = px.line(t)\n",
"fig.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 542
},
"id": "4mg4YGkWiivI",
"outputId": "314a02a7-1a9c-4fe4-b4d5-3d5b73a24ac0"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"\n",
"\n",
"\n",
" \n",
"\n",
""
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Ciovt8RxjbuC"
},
"execution_count": null,
"outputs": []
}
]
}