{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "!pip install shap catboost -qqq" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "S-bKew6sru-E", "outputId": "d3e42f68-d081-4c79-d4e4-b232aadb5992" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m575.9/575.9 KB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Rql8Sw4On-L8" }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "source": [ "df1 = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/cian_parsing_result_sale_1_100_krasnodar_18_Feb_2023_03_00_53_912228.csv', sep=';')\n", "df2 = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/cian_parsing_result_sale_50_200_krasnodar_17_Feb_2023_16_32_25_653503.csv', sep=';')" ], "metadata": { "id": "kgtmpsa2oYkQ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df = pd.concat([df1, df2])" ], "metadata": { "id": "Fd0PVyVxo6H3" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df = df.drop_duplicates()\n", "df = df[df['living_meters'] > 10]" ], "metadata": { "id": "yx0EDpVxpCeL" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6Swi2L94pfp7", "outputId": "9cf81e64-d09d-4fd3-eb56-1bc119924d1c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "Int64Index: 1004 entries, 0 to 131\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 author 1004 non-null object \n", " 1 author_type 1004 non-null object \n", " 2 link 1004 non-null object \n", " 3 city 1004 non-null object \n", " 4 deal_type 1004 non-null object \n", " 5 accommodation_type 1004 non-null object \n", " 6 floor 1004 non-null int64 \n", " 7 floors_count 1004 non-null int64 \n", " 8 rooms_count 1004 non-null int64 \n", " 9 total_meters 1004 non-null float64\n", " 10 price_per_m2 1004 non-null int64 \n", " 11 price 1004 non-null int64 \n", " 12 year_of_construction 1004 non-null int64 \n", " 13 living_meters 1004 non-null float64\n", " 14 kitchen_meters 1004 non-null float64\n", " 15 phone 1004 non-null int64 \n", " 16 district 96 non-null object \n", " 17 street 94 non-null object \n", " 18 underground 0 non-null float64\n", " 19 residential_complex 14 non-null object \n", "dtypes: float64(4), int64(7), object(9)\n", "memory usage: 164.7+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "df = df.drop(columns=['author', 'author_type', 'city', 'deal_type', 'accommodation_type', 'floors_count', 'price_per_m2', 'phone', 'district', 'street',\n", " 'underground', 'residential_complex'])" ], "metadata": { "id": "FfwwIZnAptDZ" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import seaborn as sns\n", "sns.distplot(df[(df['price'] > 1000000) & (df['price'] < 10000000)]['price'].values)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 361 }, "id": "eHaAjo3oqSOu", "outputId": "687d1aee-6962-4955-8e95-5b4db0a8156c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 48 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "sns.distplot(np.log(df['price'].values))\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 341 }, "id": "QWk979EYqSX8", "outputId": "19b9153e-7e3d-4d42-bd93-9e6f91130c7d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 26 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "df = df[(df['price'] > 1000000) & (df['price'] < 10000000)]" ], "metadata": { "id": "yTsj5H6er_E4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "1.050000e+06" ], "metadata": { "id": "9HUIrkcm2Fjb", "outputId": "c12b60c2-8596-4670-bb4a-b6e905d50d46", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1050000.0" ] }, "metadata": {}, "execution_count": 129 } ] }, { "cell_type": "code", "source": [ "df['price'].describe()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XExt2svC1_02", "outputId": "cf147a42-b740-455b-c75f-51fc365f9a94" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "count 9.210000e+02\n", "mean 4.081383e+06\n", "std 1.286029e+06\n", "min 1.050000e+06\n", "25% 3.400000e+06\n", "50% 3.980000e+06\n", "75% 4.500000e+06\n", "max 9.750000e+06\n", "Name: price, dtype: float64" ] }, "metadata": {}, "execution_count": 128 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import StratifiedKFold\n", "from catboost import CatBoostRegressor, Pool" ], "metadata": { "id": "HgnofJwbrFA0" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "regression_params = dict(\n", " iterations=5000, \n", " learning_rate=0.01,\n", " verbose=100,\n", " early_stopping_rounds=500,\n", " loss_function='RMSE',\n", " eval_metric='MAE',\n", " task_type='CPU',\n", ")" ], "metadata": { "id": "GhbyXwJAsHdi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0xBACED)\n", "\n", "models = []\n", "preds = []\n", "data = []\n", "for fold, (train_index, val_index) in enumerate(kfold.split(df, df['price'])):\n", " df_train = df.iloc[train_index]\n", " df_val = df.iloc[val_index]\n", "\n", " X = df_train.drop(columns=['price', 'link'])\n", " y = np.log(df_train['price'].values)\n", "\n", " X_val = df_val.drop(columns=['price', 'link'])\n", " y_val = np.log(df_val['price'].values)\n", "\n", " model = CatBoostRegressor(**regression_params).fit(X, y, eval_set=(X_val, y_val))\n", " preds.append(np.exp(model.predict(X_val)))\n", " data.append(df_val)\n", "\n", " models.append(model)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EMHyD_A0rx9v", "outputId": "bffcf7c6-b68b-4c06-bd6c-5adaa0ce9136" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n", " warnings.warn(\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "0:\tlearn: 0.2087847\ttest: 0.2202941\tbest: 0.2202941 (0)\ttotal: 4.76ms\tremaining: 23.8s\n", "100:\tlearn: 0.1821797\ttest: 0.2034064\tbest: 0.2034064 (100)\ttotal: 101ms\tremaining: 4.89s\n", "200:\tlearn: 0.1729615\ttest: 0.2031829\tbest: 0.2026452 (132)\ttotal: 233ms\tremaining: 5.57s\n", "300:\tlearn: 0.1675100\ttest: 0.2046471\tbest: 0.2026452 (132)\ttotal: 344ms\tremaining: 5.37s\n", "400:\tlearn: 0.1635192\ttest: 0.2059701\tbest: 0.2026452 (132)\ttotal: 441ms\tremaining: 5.05s\n", "500:\tlearn: 0.1599954\ttest: 0.2067372\tbest: 0.2026452 (132)\ttotal: 558ms\tremaining: 5.01s\n", "600:\tlearn: 0.1564030\ttest: 0.2074788\tbest: 0.2026452 (132)\ttotal: 663ms\tremaining: 4.85s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 0.2026452004\n", "bestIteration = 132\n", "\n", "Shrink model to first 133 iterations.\n", "0:\tlearn: 0.2081645\ttest: 0.2234853\tbest: 0.2234853 (0)\ttotal: 1.06ms\tremaining: 5.32s\n", "100:\tlearn: 0.1830416\ttest: 0.1983601\tbest: 0.1983601 (100)\ttotal: 89.7ms\tremaining: 4.35s\n", "200:\tlearn: 0.1764459\ttest: 0.1906699\tbest: 0.1906699 (200)\ttotal: 202ms\tremaining: 4.83s\n", "300:\tlearn: 0.1725523\ttest: 0.1887852\tbest: 0.1886834 (284)\ttotal: 307ms\tremaining: 4.79s\n", "400:\tlearn: 0.1695154\ttest: 0.1877802\tbest: 0.1877044 (384)\ttotal: 441ms\tremaining: 5.06s\n", "500:\tlearn: 0.1667373\ttest: 0.1876195\tbest: 0.1876195 (500)\ttotal: 536ms\tremaining: 4.81s\n", "600:\tlearn: 0.1635846\ttest: 0.1881362\tbest: 0.1876075 (503)\ttotal: 679ms\tremaining: 4.97s\n", "700:\tlearn: 0.1607667\ttest: 0.1877806\tbest: 0.1876075 (503)\ttotal: 771ms\tremaining: 4.73s\n", "800:\tlearn: 0.1580077\ttest: 0.1876148\tbest: 0.1875523 (794)\ttotal: 882ms\tremaining: 4.62s\n", "900:\tlearn: 0.1557013\ttest: 0.1875733\tbest: 0.1875069 (822)\ttotal: 971ms\tremaining: 4.42s\n", "1000:\tlearn: 0.1533426\ttest: 0.1880110\tbest: 0.1875069 (822)\ttotal: 1.1s\tremaining: 4.38s\n", "1100:\tlearn: 0.1513860\ttest: 0.1886087\tbest: 0.1875069 (822)\ttotal: 1.19s\tremaining: 4.21s\n", "1200:\tlearn: 0.1488173\ttest: 0.1896967\tbest: 0.1875069 (822)\ttotal: 1.3s\tremaining: 4.1s\n", "1300:\tlearn: 0.1461068\ttest: 0.1905671\tbest: 0.1875069 (822)\ttotal: 1.4s\tremaining: 3.98s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 0.1875069368\n", "bestIteration = 822\n", "\n", "Shrink model to first 823 iterations.\n", "0:\tlearn: 0.2127787\ttest: 0.2065350\tbest: 0.2065350 (0)\ttotal: 1.04ms\tremaining: 5.21s\n", "100:\tlearn: 0.1867680\ttest: 0.1874736\tbest: 0.1874736 (100)\ttotal: 112ms\tremaining: 5.41s\n", "200:\tlearn: 0.1777852\ttest: 0.1877269\tbest: 0.1866778 (123)\ttotal: 226ms\tremaining: 5.41s\n", "300:\tlearn: 0.1732305\ttest: 0.1892600\tbest: 0.1866778 (123)\ttotal: 321ms\tremaining: 5.01s\n", "400:\tlearn: 0.1700595\ttest: 0.1905574\tbest: 0.1866778 (123)\ttotal: 410ms\tremaining: 4.7s\n", "500:\tlearn: 0.1671602\ttest: 0.1912499\tbest: 0.1866778 (123)\ttotal: 501ms\tremaining: 4.5s\n", "600:\tlearn: 0.1643526\ttest: 0.1915617\tbest: 0.1866778 (123)\ttotal: 601ms\tremaining: 4.4s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 0.1866777773\n", "bestIteration = 123\n", "\n", "Shrink model to first 124 iterations.\n", "0:\tlearn: 0.2134981\ttest: 0.2029683\tbest: 0.2029683 (0)\ttotal: 1.34ms\tremaining: 6.7s\n", "100:\tlearn: 0.1884117\ttest: 0.1792734\tbest: 0.1792734 (100)\ttotal: 105ms\tremaining: 5.08s\n", "200:\tlearn: 0.1802974\ttest: 0.1747837\tbest: 0.1747608 (188)\ttotal: 219ms\tremaining: 5.24s\n", "300:\tlearn: 0.1755450\ttest: 0.1743983\tbest: 0.1743297 (292)\ttotal: 320ms\tremaining: 5s\n", "400:\tlearn: 0.1714775\ttest: 0.1744935\tbest: 0.1742771 (326)\ttotal: 422ms\tremaining: 4.84s\n", "500:\tlearn: 0.1679037\ttest: 0.1749257\tbest: 0.1742771 (326)\ttotal: 516ms\tremaining: 4.63s\n", "600:\tlearn: 0.1644564\ttest: 0.1756507\tbest: 0.1742771 (326)\ttotal: 627ms\tremaining: 4.59s\n", "700:\tlearn: 0.1610518\ttest: 0.1765478\tbest: 0.1742771 (326)\ttotal: 738ms\tremaining: 4.52s\n", "800:\tlearn: 0.1582828\ttest: 0.1772818\tbest: 0.1742771 (326)\ttotal: 839ms\tremaining: 4.4s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 0.1742771224\n", "bestIteration = 326\n", "\n", "Shrink model to first 327 iterations.\n", "0:\tlearn: 0.2131309\ttest: 0.2034499\tbest: 0.2034499 (0)\ttotal: 1.07ms\tremaining: 5.35s\n", "100:\tlearn: 0.1831531\ttest: 0.1942513\tbest: 0.1940467 (92)\ttotal: 89.5ms\tremaining: 4.34s\n", "200:\tlearn: 0.1743874\ttest: 0.1960204\tbest: 0.1939128 (119)\ttotal: 221ms\tremaining: 5.27s\n", "300:\tlearn: 0.1696565\ttest: 0.1983809\tbest: 0.1939128 (119)\ttotal: 319ms\tremaining: 4.98s\n", "400:\tlearn: 0.1659821\ttest: 0.1998092\tbest: 0.1939128 (119)\ttotal: 418ms\tremaining: 4.79s\n", "500:\tlearn: 0.1626480\ttest: 0.2014341\tbest: 0.1939128 (119)\ttotal: 514ms\tremaining: 4.62s\n", "600:\tlearn: 0.1593397\ttest: 0.2024361\tbest: 0.1939128 (119)\ttotal: 601ms\tremaining: 4.4s\n", "Stopped by overfitting detector (500 iterations wait)\n", "\n", "bestTest = 0.1939127814\n", "bestIteration = 119\n", "\n", "Shrink model to first 120 iterations.\n" ] } ] }, { "cell_type": "code", "source": [ "import pickle \n", "\n", "with open('models_vtorichka.pkl', 'wb') as f:\n", " pickle.dump(models, f)" ], "metadata": { "id": "LuUXVv6wsto7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import uuid" ], "metadata": { "id": "88sjMEUnvS16" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df['id'] = [str(uuid.uuid4()) for i in range(df.shape[0])]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "V8PatXqMv12q", "outputId": "23782a79-969e-4607-9f18-46b86e8d0d01" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['id'] = [str(uuid.uuid4()) for i in range(df.shape[0])]\n" ] } ] }, { "cell_type": "code", "source": [ "full_data = pd.concat(data)" ], "metadata": { "id": "G81m40Zev9Jj" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "full_data['pred_price'] = np.concatenate(np.array(preds))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-xH6NW7rwDRg", "outputId": "42bdd58d-730d-49e8-c958-ed86a0a714f1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n", " full_data['pred_price'] = np.concatenate(np.array(preds))\n" ] } ] }, { "cell_type": "code", "source": [ "full_data['diff'] = (full_data['pred_price']-full_data['price'])/(full_data['price']*100)*10000" ], "metadata": { "id": "8Q15MWTjxbVA" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "sns.distplot(full_data['diff'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 351 }, "id": "AhY2fdDaxklU", "outputId": "c4343704-b7b6-4e19-ac7f-0c044be76a6e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 106 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "full_data = full_data[(full_data['diff'] > -40) & (full_data['diff'] < 40)]" ], "metadata": { "id": "LVqw0yno1otY" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "sns.distplot(full_data['diff'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 351 }, "id": "YeVaBMq913Xg", "outputId": "c6c89ee9-0b4f-4bf4-86f7-52f2940411dc" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n", " warnings.warn(msg, FutureWarning)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 108 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": { "needs_background": "light" } } ] }, { "cell_type": "code", "source": [ "from sklearn.metrics import mean_absolute_percentage_error\n", "1-mean_absolute_percentage_error(full_data['price'], full_data['pred_price'])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Eo2R38Mo14Rk", "outputId": "3bfa5396-5357-4d10-96c4-31df7e9cd2cc" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8755430954272614" ] }, "metadata": {}, "execution_count": 110 } ] }, { "cell_type": "code", "source": [ "marker = []\n", "for diff in full_data['diff']:\n", " if diff >= 3:\n", " marker.append('overpriced')\n", " elif diff <= -3:\n", " marker.append('underpriced')\n", " else:\n", " marker.append('riskey')" ], "metadata": { "id": "iuDeTnh72Bpi" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "full_data['marker'] = marker" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nuCT6GnB22If", "outputId": "6097b72e-dcaa-4eac-c497-d8f6c3d8645c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " full_data['marker'] = marker\n" ] } ] }, { "cell_type": "code", "source": [ "full_data = full_data[~full_data['marker'].isin(['riskey'])]" ], "metadata": { "id": "i97F-t_Q28x7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "full_data.to_csv('vtorichka_final.csv', index=False)" ], "metadata": { "id": "ad4O7c_929iE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "full_data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 487 }, "id": "qNGLilGP3Eoq", "outputId": "c8887e92-129b-4374-dfd5-c694eb0179b7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " link floor rooms_count \\\n", "1 https://krasnodar.cian.ru/sale/flat/279661943/ 20 1 \n", "20 https://krasnodar.cian.ru/sale/flat/283765339/ 5 1 \n", "58 https://krasnodar.cian.ru/sale/flat/280008092/ 10 2 \n", "72 https://krasnodar.cian.ru/sale/flat/283282828/ 6 2 \n", "73 https://krasnodar.cian.ru/sale/flat/283383338/ 6 1 \n", ".. ... ... ... \n", "104 https://krasnodar.cian.ru/sale/flat/277400873/ 5 2 \n", "106 https://krasnodar.cian.ru/sale/flat/279004402/ 3 1 \n", "112 https://krasnodar.cian.ru/sale/flat/280209896/ 15 1 \n", "118 https://krasnodar.cian.ru/sale/flat/273790417/ 2 1 \n", "130 https://krasnodar.cian.ru/sale/flat/257283764/ 4 2 \n", "\n", " total_meters price year_of_construction living_meters \\\n", "1 43.0 4950000 2021 18.2 \n", "20 40.0 4100000 2006 22.9 \n", "58 61.2 7150000 2017 11.5 \n", "72 63.4 4530000 2010 31.5 \n", "73 41.5 3500000 2017 19.7 \n", ".. ... ... ... ... \n", "104 50.0 4900000 1965 28.0 \n", "106 39.1 4900000 2017 18.0 \n", "112 40.5 4900000 2018 16.0 \n", "118 41.7 4900000 2012 22.0 \n", "130 60.0 5800000 2014 34.0 \n", "\n", " kitchen_meters pred_price diff marker \n", "1 14.8 4.283516e+06 -13.464333 underpriced \n", "20 9.7 3.961314e+06 -3.382591 underpriced \n", "58 11.2 4.384590e+06 -38.677063 underpriced \n", "72 16.8 4.974593e+06 9.814404 overpriced \n", "73 10.8 4.035056e+06 15.287324 overpriced \n", ".. ... ... ... ... \n", "104 -1.0 4.098767e+06 -16.351694 underpriced \n", "106 10.0 3.849727e+06 -21.434151 underpriced \n", "112 13.0 4.065052e+06 -17.039765 underpriced \n", "118 10.0 3.836344e+06 -21.707273 underpriced \n", "130 10.0 4.615458e+06 -20.423139 underpriced \n", "\n", "[682 rows x 11 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
linkfloorrooms_counttotal_meterspriceyear_of_constructionliving_meterskitchen_meterspred_pricediffmarker
1https://krasnodar.cian.ru/sale/flat/279661943/20143.04950000202118.214.84.283516e+06-13.464333underpriced
20https://krasnodar.cian.ru/sale/flat/283765339/5140.04100000200622.99.73.961314e+06-3.382591underpriced
58https://krasnodar.cian.ru/sale/flat/280008092/10261.27150000201711.511.24.384590e+06-38.677063underpriced
72https://krasnodar.cian.ru/sale/flat/283282828/6263.44530000201031.516.84.974593e+069.814404overpriced
73https://krasnodar.cian.ru/sale/flat/283383338/6141.53500000201719.710.84.035056e+0615.287324overpriced
....................................
104https://krasnodar.cian.ru/sale/flat/277400873/5250.04900000196528.0-1.04.098767e+06-16.351694underpriced
106https://krasnodar.cian.ru/sale/flat/279004402/3139.14900000201718.010.03.849727e+06-21.434151underpriced
112https://krasnodar.cian.ru/sale/flat/280209896/15140.54900000201816.013.04.065052e+06-17.039765underpriced
118https://krasnodar.cian.ru/sale/flat/273790417/2141.74900000201222.010.03.836344e+06-21.707273underpriced
130https://krasnodar.cian.ru/sale/flat/257283764/4260.05800000201434.010.04.615458e+06-20.423139underpriced
\n", "

682 rows × 11 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 119 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "b3tUDuIN3iFr" }, "execution_count": null, "outputs": [] } ] }