mirror of
https://github.com/BlackWallTeam/ML.git
synced 2024-11-23 16:13:44 +03:00
1116 lines
100 KiB
Plaintext
1116 lines
100 KiB
Plaintext
|
{
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 0,
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"provenance": []
|
|||
|
},
|
|||
|
"kernelspec": {
|
|||
|
"name": "python3",
|
|||
|
"display_name": "Python 3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"name": "python"
|
|||
|
}
|
|||
|
},
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"!pip install shap catboost -qqq"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "S-bKew6sru-E",
|
|||
|
"outputId": "d3e42f68-d081-4c79-d4e4-b232aadb5992"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stdout",
|
|||
|
"text": [
|
|||
|
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m575.9/575.9 KB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
|||
|
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.6/76.6 MB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
|||
|
"\u001b[?25h"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {
|
|||
|
"id": "Rql8Sw4On-L8"
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df1 = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/cian_parsing_result_sale_1_100_krasnodar_18_Feb_2023_03_00_53_912228.csv', sep=';')\n",
|
|||
|
"df2 = pd.read_csv('/content/drive/MyDrive/krasnodar_hack/cian_parsing_result_sale_50_200_krasnodar_17_Feb_2023_16_32_25_653503.csv', sep=';')"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "kgtmpsa2oYkQ"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = pd.concat([df1, df2])"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "Fd0PVyVxo6H3"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = df.drop_duplicates()\n",
|
|||
|
"df = df[df['living_meters'] > 10]"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "yx0EDpVxpCeL"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df.info()"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "6Swi2L94pfp7",
|
|||
|
"outputId": "9cf81e64-d09d-4fd3-eb56-1bc119924d1c"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stdout",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"Int64Index: 1004 entries, 0 to 131\n",
|
|||
|
"Data columns (total 20 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 author 1004 non-null object \n",
|
|||
|
" 1 author_type 1004 non-null object \n",
|
|||
|
" 2 link 1004 non-null object \n",
|
|||
|
" 3 city 1004 non-null object \n",
|
|||
|
" 4 deal_type 1004 non-null object \n",
|
|||
|
" 5 accommodation_type 1004 non-null object \n",
|
|||
|
" 6 floor 1004 non-null int64 \n",
|
|||
|
" 7 floors_count 1004 non-null int64 \n",
|
|||
|
" 8 rooms_count 1004 non-null int64 \n",
|
|||
|
" 9 total_meters 1004 non-null float64\n",
|
|||
|
" 10 price_per_m2 1004 non-null int64 \n",
|
|||
|
" 11 price 1004 non-null int64 \n",
|
|||
|
" 12 year_of_construction 1004 non-null int64 \n",
|
|||
|
" 13 living_meters 1004 non-null float64\n",
|
|||
|
" 14 kitchen_meters 1004 non-null float64\n",
|
|||
|
" 15 phone 1004 non-null int64 \n",
|
|||
|
" 16 district 96 non-null object \n",
|
|||
|
" 17 street 94 non-null object \n",
|
|||
|
" 18 underground 0 non-null float64\n",
|
|||
|
" 19 residential_complex 14 non-null object \n",
|
|||
|
"dtypes: float64(4), int64(7), object(9)\n",
|
|||
|
"memory usage: 164.7+ KB\n"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = df.drop(columns=['author', 'author_type', 'city', 'deal_type', 'accommodation_type', 'floors_count', 'price_per_m2', 'phone', 'district', 'street',\n",
|
|||
|
" 'underground', 'residential_complex'])"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "FfwwIZnAptDZ"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"sns.distplot(df[(df['price'] > 1000000) & (df['price'] < 10000000)]['price'].values)"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/",
|
|||
|
"height": 361
|
|||
|
},
|
|||
|
"id": "eHaAjo3oqSOu",
|
|||
|
"outputId": "687d1aee-6962-4955-8e95-5b4db0a8156c"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
|||
|
" warnings.warn(msg, FutureWarning)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7f6be852bf40>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 48
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "display_data",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEQCAYAAAC6Om+RAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd3hc133m8e9vBoPBoIPorGBvkqlCFUu2JMuSLTc5j+M4ki07ThQrxXESJ5uy2Se7jpNssklsx94n3lgu67aSq+wosi3ZalSnREqixAYQbCBYUIjeMTNn/5gBDVEAMSQwc2fuvJ/nwYPBlHt/lwTeOXPuueeYcw4REfGfgNcFiIhIeijgRUR8SgEvIuJTCngREZ9SwIuI+JQCXkTEp7Iu4M3sa2bWaWa7F2BbbzGzl6d9jZnZryxEnSIi2c6ybRy8mV0HDAHfdM5dtIDbXQS0AkudcyMLtV0RkWyVdS1459wTQM/0+8xstZk9aGY7zexJM9twAZt+P/AzhbuI5IusC/hZ3A18wjl3OfBfgC9ewDZuA+5d0KpERLJYgdcFzMXMSoFrgO+b2dTd4eRj7wM+PcPLjjvn3j5tG43AxcBD6a1WRCR7ZH3Ak/iU0eecu+TsB5xz9wH3pbCNDwA/cs5NLnRxIiLZKuu7aJxzA8BhM/s1AEvYcp6buR11z4hInsm6gDeze4FngfVm1m5mdwIfAu40s13AHuC957G9JmAZsG3hqxURyV5ZN0xSREQWRta14EVEZGFk1UnWmpoa19TU5HUZIiI5Y+fOnd3OudqZHsuqgG9qamLHjh1elyEikjPM7Ohsj6mLRkTEpxTwIiI+pYAXEfEpBbyIiE8p4EVEfEoBLyLiUwp4ERGfUsCLiPiUAl5ExKey6kpWkVTcs71t1sc+eNXyDFYikt3UghcR8SkFvIiITyngRUR8SgEvIuJTCngREZ9SwIuI+JQCXkTEpxTwIiI+pYAXEfEpBbyIiE8p4EVEfEoBLyLiUwp4ERGfUsCLiPiUAl5ExKcU8CIiPqWAFxHxKQW8iIhPKeBFRHxKAS8i4lMKeBERnypI58bN7AgwCMSAqHNuazr3JyIiv5TWgE96i3OuOwP7ERGRadRFIyLiU+kOeAf83Mx2mtldMz3BzO4ysx1mtqOrqyvN5YiI5I90B/ybnHOXAe8APm5m1539BOfc3c65rc65rbW1tWkuR0Qkf6Q14J1zx5PfO4EfAVemc38iIvJLaQt4Mysxs7Kp28DbgN3p2p+IiLxWOkfR1AM/MrOp/dzjnHswjfsTEZFp0hbwzrlDwJZ0bV9ERM5NwyRFRHwqExc6iVyQe7a3eV2CSE5TC15ExKcU8CIiPqWAFxHxKQW8iIhP6SSrZMS5Tph+8KrlGaxEJH+oBS8i4lMKeBERn1LAi4j4lPrgxXN9IxN86YlDPN3azYm+MdbWlfL2zfUEAwGCAfO6PJGcpYAXTzWfGuAffraPofEob1xVzQ3ra9lzYoBP/edeGsqL+NBVy6kuDXtdpkhOUsCLZ14+1scPdh5j0+JyPvNrl7C+oQwA5xwP7engT773Ml996jAfu24VVcWFHlcrknvUBy+eaOsZ4Yc721lRXcK9H7v6TLgDmBm3XNTAb127krFojK8/c4TJWNzDakVykwJeMm5kPMq9z7dRHingjqtWUFYUmvF5iysj3H7FcroGx3l4b0eGqxTJfQp4ybif7T7F4NgkH7xyBZHC4Dmfu7a+jCuaFvFUazftvSMZqlDEHxTwklHHekbY2dbLtWtqWFIVSek177iogeLCIL9QK17kvCjgJWOcczzwygnKigq4cX1dyq8rCgV589paDnQO0XZ6OI0ViviLAl4y5kDnEMd6R7l5Yz3h0Lm7Zs529apqiguDPLK/M03VifiPAl4ywjnHo/s7qYyEuGR55Xm/vrAgwLVrajjQOcTpofE0VCjiPxoHLxlxqHuYtp4R3rNlMQWB17YrUl2a7/LlVTyyr4MdR3t5++aGdJQp4itqwUtGPHvwNMWFQbauqLrgbZRHQqyvL2Pn0V5icbeA1Yn4kwJe0u5E3yj7Tg5wRdMiQsH5/cptbVrE0HiU5lMDC1SdiH8p4CXt7n0+0QVzZdOieW9rXX0ZxYVBXj3eP+9tifidAl7SKhqL850XjrGuvoyqkvnPJxMMGBsby9l/apBoXNMXiJyLAl7S6skD3XQNjnPFArTep2xeXM54NM6hLo2JFzkXBbyk1Q9fbKeqOMS6htIF2+bq2lIKCwLsOaFuGpFzSXvAm1nQzF4yswfSvS/JLv2jk/x8b8eMQyPnIxQMsL6+jL0nB4k7jaYRmU0mWvB/BOzLwH4kyzy4+yQT0Tjvu2zpgm97fUMZw+NRTvWPLfi2RfwirQFvZkuBdwFfSed+JDv9566TrKguZsvSigXf9uraRJfPwa6hBd+2iF+kuwX/r8CfA7MOdzCzu8xsh5nt6OrqSnM5kik9wxM8e+g077q4EbOFX1e1IhKirixMa6cCXmQ2aQt4M3s30Omc23mu5znn7nbObXXOba2trU1XOZJhD+05RSzueNcbGtO2j9V1pRw5PazVnkRmkc4W/LXArWZ2BPgOcKOZfTuN+5Ms8pNXTtJUXcymxvK07WNNbSmTMUdbjxYCEZlJ2gLeOfdfnXNLnXNNwG3Ao865O9K1P8kefSOJ7pl3pql7ZsqqmhICpn54kdloHLwsuMeaO4nFHW9L84yP4VCQxooIR0+rBS8yk4wEvHPucefcuzOxL/Hew3s7qS0L84YlCz965mzLFxXT3jui2SVFZqAWvCyoiWicbS1d3LSxjkAgfd0zU1ZUFzMZc5zsH037vkRyjQJeFtT2w6cZGo9y08b6jOxvRXUJgE60isxAAS8L6uG9HRSFEsvrZUJFJERFJKR+eJEZKOBlwTjneHhfJ29eW0vReS6qPR8rqos5elozS4qcTWuyyoLZf2qQ432jXLVyUcrrrC6E5YuKeaW9n4HRyYztUyQXqAUvC+bhvR0YiYnAMmlpZQSA43060SoynQJeFszD+zpYWhWhrCiU0f02VEQwoL1XAS8ynQJeFkTn4Bi72vvZmMapCWZTWBCgvryI43060SoynQJeFsQTLd1A5rtnpiypjHC8dxSnBUBEzlDAy4LY1tJFbVmYhvIiT/a/pCrC8ESME1oAROQMBbzMWyzuePJAF9evq03r5GLnsiR5ovXVdq3TKjJFAS/ztqu9j76RSa5f5918/g0VRQQMXj3e51kNItlG4+Dlgkwf5/7wvsTwyI7+MYrD3vxKhYIB6sqK2HtiwJP9i2QjteBl3g50DLK0KuJZuE9pqChi38lBT2sQySYKeJmXkfEo7b2jrKv3ZvTMdA3lRZwaGKN3eMLrUkSyQkoBb2b3mdm7zExvCPIaB7qGcJAVAd9YkRjBs++UumlEIPUW/BeBDwIHzOwfzWx9GmuSHNJyapDiwiBLqiJel0LDVMCrm0YESDHgnXMPO+c+BFwGHAEeNrNnzOw3zSyz16VL1og7x4HOIdbUlRLwaHjkdGVFIWpKw+w7qRa8CJxHH7yZVQMfBX4beAn4PInA/0VaKpOsd6p/jKHxaFZ0z0zZ2FjGfnXRiACp98H/CHgSKAbe45y71Tn3XefcJ4DSdBYo2aulI9EVsrYue34FNjWW09IxxGQs7nUpIp5LdVzbl51zP51+h5mFnXPjzrmtaahLckBLxxCLK4oyPnvkuaxvKGMiGufo6RHWZNEbj4gXUu2i+bsZ7nt2IQuR3DI2GaOtZzirumcA1tYl6mnt1IlWkXO24M2sAVgCRMzsUmDqTFo5ie4ayVOtnUPEHazNsoBfU1eKWeLTxS0XeV2NiLfm6qJ5O4kTq0uBz067fxD4qzTVJDngQOcg4YIAyxdl1/t8pDDI0qoIBzqHvC5FxHPnDHjn3DeAb5jZrzrnfpihmiTLOedo6UgMjwwGvB8eeba1dWUc6FAXjchcXTR3OOe+DTSZ2Z+c/bhz7rMzvEx87kDnEP2jk9y4vs7rUma0tq6Up1q7icbiFAR18bXkr7m6a
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
}
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"sns.distplot(np.log(df['price'].values))\n"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/",
|
|||
|
"height": 341
|
|||
|
},
|
|||
|
"id": "QWk979EYqSX8",
|
|||
|
"outputId": "19b9153e-7e3d-4d42-bd93-9e6f91130c7d"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
|||
|
" warnings.warn(msg, FutureWarning)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7f6bfd1dbcd0>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 26
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "display_data",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD8CAYAAABthzNFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXhcd33v8fd3RtJoXyzJu2U7wdlDHCISlvQm0JKEtCRt6b0kYUkp1IWHtLf0ljZAb6Dh8jwUetvSB2gIrW+ACwlbgLTXEEKBhC3BdnAWO5vjVfKmfV9m+d4/5owzlkfWyNbRzEif1/PMozm/c86c77Ekf/Vbj7k7IiIiU0UKHYCIiBQnJQgREclJCUJERHJSghARkZyUIEREJCclCBERySm0BGFma8zsx2a2y8x2mtl/z3GMmdk/m9luM3vSzF6Rte9WM3sheN0aVpwiIpKbhTUPwsxWACvc/XEzqwO2A7/r7ruyjrke+FPgeuAK4NPufoWZLQG2Ae2AB+de5u59oQQrIiInCa0G4e6H3f3x4P0Q8AywasphNwJf8rRHgcYgsVwLPOTuvUFSeAi4LqxYRUTkZGXzcREzWwdcCjw2Zdcq4GDWdkdQNl35KbW0tPi6devOIFIRkcVl+/bt3e7emmtf6AnCzGqBbwF/7u6DIXz+JmATQFtbG9u2bZvrS4iILFhmtn+6faGOYjKzctLJ4Svufn+OQzqBNVnbq4Oy6cpP4u53u3u7u7e3tuZMgiIichrCHMVkwL8Bz7j7P0xz2APAO4LRTK8CBtz9MPAgcI2ZNZlZE3BNUCYiIvMkzCam1wJvB54ysx1B2YeANgB3vwvYQnoE025gFHhnsK/XzD4GbA3Ou9Pde0OMVUREpggtQbj7zwCb4RgH3jfNvs3A5hBCExGRPGgmtYiI5KQEISIiOSlBiIhITkoQIiKSkxKEiIjkNC9LbYgsRF997EDO8luuaJvnSETCoRqEiIjkpAQhIiI5KUGIiEhOShAiIpKTEoSIiOSkBCEiIjkpQYiISE5KECIikpMShIiI5KQEISIiOSlBiIhITkoQIiKSU2iL9ZnZZuB3gGPuflGO/R8A3poVx/lAa/A86n3AEJAEEu7eHlacIiKSW5irud4DfAb4Uq6d7v4p4FMAZvYm4P3u3pt1yOvcvTvE+ETyNt3KrSILWWhNTO7+CNA744FpNwP3hhWLiIjMXsH7IMysGrgO+FZWsQM/MLPtZrapMJGJiCxuxfDAoDcBP5/SvHSlu3ea2VLgITN7NqiRnCRIIJsA2tr0oBYRkblS8BoEcBNTmpfcvTP4egz4NnD5dCe7+93u3u7u7a2traEGKiKymBQ0QZhZA3AV8N2sshozq8u8B64Bni5MhCIii1eYw1zvBa4GWsysA/gIUA7g7ncFh/0e8AN3H8k6dRnwbTPLxPdVd/9+WHGKiEhuoSUId785j2PuIT0cNrtsD3BJOFGJiEi+iqEPQkREipAShIiI5KQEISIiOSlBiIhITkoQIiKSkxKEiIjkpAQhIiI5KUGIiEhOShAiIpKTEoSIiOSkBCEiIjkpQYiISE5KECIikpMShIiI5KQEISIiOSlBiIhITkoQIiKSkxKEiIjkFFqCMLPNZnbMzJ6eZv/VZjZgZjuC1x1Z+64zs+fMbLeZ3R5WjCIiMr0waxD3ANfNcMxP3X1j8LoTwMyiwGeBNwIXADeb2QUhxikiIjmEliDc/RGg9zROvRzY7e573H0SuA+4cU6DExGRGRW6D+LVZvaEmX3PzC4MylYBB7OO6QjKRERkHpUV8NqPA2vdfdjMrge+A2yY7YeY2SZgE0BbW9vcRigisogVrAbh7oPuPhy83wKUm1kL0AmsyTp0dVA23efc7e7t7t7e2toaaswiIotJwRKEmS03MwveXx7E0gNsBTaY2XozqwBuAh4oVJwiuUwmUjzZ0c/e7pFChyISmtCamMzsXuBqoMXMOoCPAOUA7n4X8AfAe80sAYwBN7m7Awkzuw14EIgCm919Z1hxisxW19AEdz38ImPxJJXlEf7ymnOprihka61IOEL7qXb3m2fY/xngM9Ps2wJsCSMukTP1893dxJMp3vyKVdz/eCePPN/FdRetKHRYInOu0KOYRErKeDzJjoP9XLK6kcvWLuGSNY384sUeBsfihQ5NZM4pQYjMwuMH+phMprjirCUAXH1uK4mUs+vwYIEjE5l7ShAis/DrA/2saqxidVM1AK21Meory9jXo85qWXiUIETyNB5Pcqh/jHOX1x0vMzPWNtewr3uE9BgLkYVDCUIkTwd6R3FgXXPNCeXrWmoYHE/QN6p+CFlYlCBE8rS3e4SIQduS6hPK1zWnt9XMJAuNEoRInvZ1j7CqsYqKshN/bZbVV1JZHmGfJs3JAqMEIZKHeDJFR98Y61tqTtoXMWPtkhoO9I4WIDKR8ChBiOThYN8oSfeT+h8yljdU0jM8STKljmpZOJQgRPJwqG8MgNVT+h8yWmtjJN3pG5mcz7BEQqUEIZKHI4Pj1MXKqI3lXp2mtS4GQNfwxHyGJRIqJQiRPBweGGd5Q+W0+48niCElCFk4lCBEZhBPpjg2NMGKUySIyvIodZVlHFOCkAVECUJkBnu6Rkim/JQ1CEj3Q3QNjc9TVCLhU4IQmcEzwUJ8yxuqTnlca12MruEJLbkhC4YShMgMnjk8SDRitNbGTnlca12M8XiK7mGNZJKFQQlCZAbPHBliWV2MaMROeVymo/rFruH5CEskdEoQIjN44egQy+pP3f8A0BLUMPScalkolCBETmE8nuTwwDjNtRUzHltfWU7EoDOYVCdS6kJLEGa22cyOmdnT0+x/q5k9aWZPmdkvzOySrH37gvIdZrYtrBhFZrK/J72+UvMM/Q8A0YhRX1VOR5/WZJKFIcwaxD3AdafYvxe4yt0vBj4G3D1l/+vcfaO7t4cUn8iMMkt4N9fMXIMAaKyqoLNfNQhZGHKvGzAH3P0RM1t3iv2/yNp8FFgdViwipyuzhHdzzcw1CICm6nI1McmCUSx9EO8Cvpe17cAPzGy7mW061YlmtsnMtpnZtq6urlCDlMVnX88IS2oqqKqI5nV8Y3UFRwbHiSdTIUcmEr7QahD5MrPXkU4QV2YVX+nunWa2FHjIzJ5190dyne/udxM0T7W3t2uGksypfd2jx58Yl4+m6nJSDp9/eA9LspqlbrmiLYzwREJV0BqEmb0c+FfgRnfvyZS7e2fw9RjwbeDywkQoi92+npFpnwGRS2N1Oin0j2qynJS+giUIM2sD7gfe7u7PZ5XXmFld5j1wDZBzJJRImDJDXNfleIrcdBqrywHoH42HFZbIvAmticnM7gWuBlrMrAP4CFAO4O53AXcAzcDnzAwgEYxYWgZ8OygrA77q7t8PK06R6WSGuK5trmZkIpnXOQ1V6QTRN6YahJS+MEcx3TzD/ncD785Rvge45OQzROZXZojr+pYanu4czOuc8miEusoy1SBkQSiWUUwiRSczxHXtLPogABqryulTH4QsAEoQItPY1zPKkpqK481G+WqsrlANQhYEJQiRaezrHmHtLIa4ZjRWlTM4FtdzIaTkKUGITGN/zwjrZ9m8BFBXVU4i5YzHNVlOSpsShEgO4/EkhwbGZ93/AFBXmR77MTiuZiYpbXklCDO738x+28yUUGRRONCbHuK6rmX2TUz1lek+CyUIKXX5/of/OeAW4AUz+4SZnRtiTCIFl3noz2xmUWfUBzWIobHEnMYkMt/yShDu/kN3fyvwCmAf8MPgGQ7vNLPZDfEQKQH7gzkQs5lFnVGnGoQsEHk3GZlZM/CHpCe3/Rr4NOmE8VAokYkU0N7u0xviClBRFqGyPMLQuGoQUtrymkltZt8GzgW+DLzJ3Q8Hu76mJ77JQrS/5/SGuGbUVZarBiElL9+lNr7g7luyC8ws5u4TeuKbLET7ukd41VnNp31+fWWZahBS8vJtYvpfOcp+OZeBiBSLMxnimlGvGoQsAKesQZjZcmAVUGVmlwIW7KoHTr/+LVLE/uUnLwJwqH+Mrz524LQ+o
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
}
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df = df[(df['price'] > 1000000) & (df['price'] < 10000000)]"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "yTsj5H6er_E4"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"1.050000e+06"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "9HUIrkcm2Fjb",
|
|||
|
"outputId": "c12b60c2-8596-4670-bb4a-b6e905d50d46",
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
}
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"1050000.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 129
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df['price'].describe()"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "XExt2svC1_02",
|
|||
|
"outputId": "cf147a42-b740-455b-c75f-51fc365f9a94"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"count 9.210000e+02\n",
|
|||
|
"mean 4.081383e+06\n",
|
|||
|
"std 1.286029e+06\n",
|
|||
|
"min 1.050000e+06\n",
|
|||
|
"25% 3.400000e+06\n",
|
|||
|
"50% 3.980000e+06\n",
|
|||
|
"75% 4.500000e+06\n",
|
|||
|
"max 9.750000e+06\n",
|
|||
|
"Name: price, dtype: float64"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 128
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import StratifiedKFold\n",
|
|||
|
"from catboost import CatBoostRegressor, Pool"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "HgnofJwbrFA0"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"regression_params = dict(\n",
|
|||
|
" iterations=5000, \n",
|
|||
|
" learning_rate=0.01,\n",
|
|||
|
" verbose=100,\n",
|
|||
|
" early_stopping_rounds=500,\n",
|
|||
|
" loss_function='RMSE',\n",
|
|||
|
" eval_metric='MAE',\n",
|
|||
|
" task_type='CPU',\n",
|
|||
|
")"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "GhbyXwJAsHdi"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0xBACED)\n",
|
|||
|
"\n",
|
|||
|
"models = []\n",
|
|||
|
"preds = []\n",
|
|||
|
"data = []\n",
|
|||
|
"for fold, (train_index, val_index) in enumerate(kfold.split(df, df['price'])):\n",
|
|||
|
" df_train = df.iloc[train_index]\n",
|
|||
|
" df_val = df.iloc[val_index]\n",
|
|||
|
"\n",
|
|||
|
" X = df_train.drop(columns=['price', 'link'])\n",
|
|||
|
" y = np.log(df_train['price'].values)\n",
|
|||
|
"\n",
|
|||
|
" X_val = df_val.drop(columns=['price', 'link'])\n",
|
|||
|
" y_val = np.log(df_val['price'].values)\n",
|
|||
|
"\n",
|
|||
|
" model = CatBoostRegressor(**regression_params).fit(X, y, eval_set=(X_val, y_val))\n",
|
|||
|
" preds.append(np.exp(model.predict(X_val)))\n",
|
|||
|
" data.append(df_val)\n",
|
|||
|
"\n",
|
|||
|
" models.append(model)"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "EMHyD_A0rx9v",
|
|||
|
"outputId": "bffcf7c6-b68b-4c06-bd6c-5adaa0ce9136"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stdout",
|
|||
|
"text": [
|
|||
|
"0:\tlearn: 0.2087847\ttest: 0.2202941\tbest: 0.2202941 (0)\ttotal: 4.76ms\tremaining: 23.8s\n",
|
|||
|
"100:\tlearn: 0.1821797\ttest: 0.2034064\tbest: 0.2034064 (100)\ttotal: 101ms\tremaining: 4.89s\n",
|
|||
|
"200:\tlearn: 0.1729615\ttest: 0.2031829\tbest: 0.2026452 (132)\ttotal: 233ms\tremaining: 5.57s\n",
|
|||
|
"300:\tlearn: 0.1675100\ttest: 0.2046471\tbest: 0.2026452 (132)\ttotal: 344ms\tremaining: 5.37s\n",
|
|||
|
"400:\tlearn: 0.1635192\ttest: 0.2059701\tbest: 0.2026452 (132)\ttotal: 441ms\tremaining: 5.05s\n",
|
|||
|
"500:\tlearn: 0.1599954\ttest: 0.2067372\tbest: 0.2026452 (132)\ttotal: 558ms\tremaining: 5.01s\n",
|
|||
|
"600:\tlearn: 0.1564030\ttest: 0.2074788\tbest: 0.2026452 (132)\ttotal: 663ms\tremaining: 4.85s\n",
|
|||
|
"Stopped by overfitting detector (500 iterations wait)\n",
|
|||
|
"\n",
|
|||
|
"bestTest = 0.2026452004\n",
|
|||
|
"bestIteration = 132\n",
|
|||
|
"\n",
|
|||
|
"Shrink model to first 133 iterations.\n",
|
|||
|
"0:\tlearn: 0.2081645\ttest: 0.2234853\tbest: 0.2234853 (0)\ttotal: 1.06ms\tremaining: 5.32s\n",
|
|||
|
"100:\tlearn: 0.1830416\ttest: 0.1983601\tbest: 0.1983601 (100)\ttotal: 89.7ms\tremaining: 4.35s\n",
|
|||
|
"200:\tlearn: 0.1764459\ttest: 0.1906699\tbest: 0.1906699 (200)\ttotal: 202ms\tremaining: 4.83s\n",
|
|||
|
"300:\tlearn: 0.1725523\ttest: 0.1887852\tbest: 0.1886834 (284)\ttotal: 307ms\tremaining: 4.79s\n",
|
|||
|
"400:\tlearn: 0.1695154\ttest: 0.1877802\tbest: 0.1877044 (384)\ttotal: 441ms\tremaining: 5.06s\n",
|
|||
|
"500:\tlearn: 0.1667373\ttest: 0.1876195\tbest: 0.1876195 (500)\ttotal: 536ms\tremaining: 4.81s\n",
|
|||
|
"600:\tlearn: 0.1635846\ttest: 0.1881362\tbest: 0.1876075 (503)\ttotal: 679ms\tremaining: 4.97s\n",
|
|||
|
"700:\tlearn: 0.1607667\ttest: 0.1877806\tbest: 0.1876075 (503)\ttotal: 771ms\tremaining: 4.73s\n",
|
|||
|
"800:\tlearn: 0.1580077\ttest: 0.1876148\tbest: 0.1875523 (794)\ttotal: 882ms\tremaining: 4.62s\n",
|
|||
|
"900:\tlearn: 0.1557013\ttest: 0.1875733\tbest: 0.1875069 (822)\ttotal: 971ms\tremaining: 4.42s\n",
|
|||
|
"1000:\tlearn: 0.1533426\ttest: 0.1880110\tbest: 0.1875069 (822)\ttotal: 1.1s\tremaining: 4.38s\n",
|
|||
|
"1100:\tlearn: 0.1513860\ttest: 0.1886087\tbest: 0.1875069 (822)\ttotal: 1.19s\tremaining: 4.21s\n",
|
|||
|
"1200:\tlearn: 0.1488173\ttest: 0.1896967\tbest: 0.1875069 (822)\ttotal: 1.3s\tremaining: 4.1s\n",
|
|||
|
"1300:\tlearn: 0.1461068\ttest: 0.1905671\tbest: 0.1875069 (822)\ttotal: 1.4s\tremaining: 3.98s\n",
|
|||
|
"Stopped by overfitting detector (500 iterations wait)\n",
|
|||
|
"\n",
|
|||
|
"bestTest = 0.1875069368\n",
|
|||
|
"bestIteration = 822\n",
|
|||
|
"\n",
|
|||
|
"Shrink model to first 823 iterations.\n",
|
|||
|
"0:\tlearn: 0.2127787\ttest: 0.2065350\tbest: 0.2065350 (0)\ttotal: 1.04ms\tremaining: 5.21s\n",
|
|||
|
"100:\tlearn: 0.1867680\ttest: 0.1874736\tbest: 0.1874736 (100)\ttotal: 112ms\tremaining: 5.41s\n",
|
|||
|
"200:\tlearn: 0.1777852\ttest: 0.1877269\tbest: 0.1866778 (123)\ttotal: 226ms\tremaining: 5.41s\n",
|
|||
|
"300:\tlearn: 0.1732305\ttest: 0.1892600\tbest: 0.1866778 (123)\ttotal: 321ms\tremaining: 5.01s\n",
|
|||
|
"400:\tlearn: 0.1700595\ttest: 0.1905574\tbest: 0.1866778 (123)\ttotal: 410ms\tremaining: 4.7s\n",
|
|||
|
"500:\tlearn: 0.1671602\ttest: 0.1912499\tbest: 0.1866778 (123)\ttotal: 501ms\tremaining: 4.5s\n",
|
|||
|
"600:\tlearn: 0.1643526\ttest: 0.1915617\tbest: 0.1866778 (123)\ttotal: 601ms\tremaining: 4.4s\n",
|
|||
|
"Stopped by overfitting detector (500 iterations wait)\n",
|
|||
|
"\n",
|
|||
|
"bestTest = 0.1866777773\n",
|
|||
|
"bestIteration = 123\n",
|
|||
|
"\n",
|
|||
|
"Shrink model to first 124 iterations.\n",
|
|||
|
"0:\tlearn: 0.2134981\ttest: 0.2029683\tbest: 0.2029683 (0)\ttotal: 1.34ms\tremaining: 6.7s\n",
|
|||
|
"100:\tlearn: 0.1884117\ttest: 0.1792734\tbest: 0.1792734 (100)\ttotal: 105ms\tremaining: 5.08s\n",
|
|||
|
"200:\tlearn: 0.1802974\ttest: 0.1747837\tbest: 0.1747608 (188)\ttotal: 219ms\tremaining: 5.24s\n",
|
|||
|
"300:\tlearn: 0.1755450\ttest: 0.1743983\tbest: 0.1743297 (292)\ttotal: 320ms\tremaining: 5s\n",
|
|||
|
"400:\tlearn: 0.1714775\ttest: 0.1744935\tbest: 0.1742771 (326)\ttotal: 422ms\tremaining: 4.84s\n",
|
|||
|
"500:\tlearn: 0.1679037\ttest: 0.1749257\tbest: 0.1742771 (326)\ttotal: 516ms\tremaining: 4.63s\n",
|
|||
|
"600:\tlearn: 0.1644564\ttest: 0.1756507\tbest: 0.1742771 (326)\ttotal: 627ms\tremaining: 4.59s\n",
|
|||
|
"700:\tlearn: 0.1610518\ttest: 0.1765478\tbest: 0.1742771 (326)\ttotal: 738ms\tremaining: 4.52s\n",
|
|||
|
"800:\tlearn: 0.1582828\ttest: 0.1772818\tbest: 0.1742771 (326)\ttotal: 839ms\tremaining: 4.4s\n",
|
|||
|
"Stopped by overfitting detector (500 iterations wait)\n",
|
|||
|
"\n",
|
|||
|
"bestTest = 0.1742771224\n",
|
|||
|
"bestIteration = 326\n",
|
|||
|
"\n",
|
|||
|
"Shrink model to first 327 iterations.\n",
|
|||
|
"0:\tlearn: 0.2131309\ttest: 0.2034499\tbest: 0.2034499 (0)\ttotal: 1.07ms\tremaining: 5.35s\n",
|
|||
|
"100:\tlearn: 0.1831531\ttest: 0.1942513\tbest: 0.1940467 (92)\ttotal: 89.5ms\tremaining: 4.34s\n",
|
|||
|
"200:\tlearn: 0.1743874\ttest: 0.1960204\tbest: 0.1939128 (119)\ttotal: 221ms\tremaining: 5.27s\n",
|
|||
|
"300:\tlearn: 0.1696565\ttest: 0.1983809\tbest: 0.1939128 (119)\ttotal: 319ms\tremaining: 4.98s\n",
|
|||
|
"400:\tlearn: 0.1659821\ttest: 0.1998092\tbest: 0.1939128 (119)\ttotal: 418ms\tremaining: 4.79s\n",
|
|||
|
"500:\tlearn: 0.1626480\ttest: 0.2014341\tbest: 0.1939128 (119)\ttotal: 514ms\tremaining: 4.62s\n",
|
|||
|
"600:\tlearn: 0.1593397\ttest: 0.2024361\tbest: 0.1939128 (119)\ttotal: 601ms\tremaining: 4.4s\n",
|
|||
|
"Stopped by overfitting detector (500 iterations wait)\n",
|
|||
|
"\n",
|
|||
|
"bestTest = 0.1939127814\n",
|
|||
|
"bestIteration = 119\n",
|
|||
|
"\n",
|
|||
|
"Shrink model to first 120 iterations.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import pickle \n",
|
|||
|
"\n",
|
|||
|
"with open('models_vtorichka.pkl', 'wb') as f:\n",
|
|||
|
" pickle.dump(models, f)"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "LuUXVv6wsto7"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import uuid"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "88sjMEUnvS16"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df['id'] = [str(uuid.uuid4()) for i in range(df.shape[0])]"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "V8PatXqMv12q",
|
|||
|
"outputId": "23782a79-969e-4607-9f18-46b86e8d0d01"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"<ipython-input-66-26f511121cc1>:1: SettingWithCopyWarning: \n",
|
|||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|||
|
"\n",
|
|||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|||
|
" df['id'] = [str(uuid.uuid4()) for i in range(df.shape[0])]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data = pd.concat(data)"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "G81m40Zev9Jj"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data['pred_price'] = np.concatenate(np.array(preds))"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "-xH6NW7rwDRg",
|
|||
|
"outputId": "42bdd58d-730d-49e8-c958-ed86a0a714f1"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"<ipython-input-98-34150432fa31>:1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
|
|||
|
" full_data['pred_price'] = np.concatenate(np.array(preds))\n"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data['diff'] = (full_data['pred_price']-full_data['price'])/(full_data['price']*100)*10000"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "8Q15MWTjxbVA"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"sns.distplot(full_data['diff'])"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/",
|
|||
|
"height": 351
|
|||
|
},
|
|||
|
"id": "AhY2fdDaxklU",
|
|||
|
"outputId": "c4343704-b7b6-4e19-ac7f-0c044be76a6e"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
|||
|
" warnings.warn(msg, FutureWarning)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7f6c3ad69460>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 106
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "display_data",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEGCAYAAABy53LJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de3xcd3nv+88zM9LoLlvX+C7bcZw4CQmJk9BAUiA7EChguiGHhLSEfbIJPSWv3ZbdvU+6e+Bw0vbsze4ptBTaXW5tCIQkDaW4EAgEQiFN4/hCiGM7tuWb4qtkyZZ1l2bmOX/MGldRJGvG1sya0Xzfr9dYa37zW2ueGY/m0e+yfsvcHRERkWxFwg5ARERKixKHiIjkRIlDRERyosQhIiI5UeIQEZGcxMIOoBBaWlq8o6Mj7DBERErK1q1bT7p769TyskgcHR0dbNmyJewwRERKipkdmq48r11VZnabme02s04zu3+ax+Nm9mjw+CYz6wjKbzWzrWa2Pfj51kn7/DQ45gvBrS2fr0FERF4tby0OM4sCXwBuBQ4Dm81so7vvnFTtHuCUu19sZncAnwY+AJwE3u3uR83sCuBJYMmk/e5ydzUhRERCkM8Wx/VAp7vvd/dx4BFgw5Q6G4AHg+3HgVvMzNz9F+5+NCjfAVSbWTyPsYqISJbymTiWAK9Mun+YV7caXlXH3RNAP9A8pc77gG3uPjap7G+DbqpPmJlN9+Rmdq+ZbTGzLT09PRfyOkREZJKino5rZpeT7r766KTiu9z9SuCm4Pab0+3r7l909/Xuvr619TWTAkRE5DzlM3EcAZZNur80KJu2jpnFgEagN7i/FPg28CF335fZwd2PBD8HgIdJd4mJiEiB5DNxbAbWmNlKM6sE7gA2TqmzEbg72H4/8BN3dzNbAHwPuN/d/yVT2cxiZtYSbFcA7wJeyuNrEBGRKfKWOIIxi/tIz4jaBTzm7jvM7AEze09Q7StAs5l1Ah8HMlN27wMuBj45ZdptHHjSzF4EXiDdYvlSvl6DiIi8lpXD9TjWr1/vOgFQRCQ3ZrbV3ddPLS+LM8cl/x7e1PWasg/esDyESEQk34p6VpWIiBQfJQ4REcmJEoeIiOREiUNERHKixCEiIjlR4hARkZwocYiISE6UOEREJCdKHCIikhMlDhERyYkSh4iI5ESJQ0REcqLEISIiOVHiEBGRnChxiIhITpQ4REQkJ0ocIiKSEyUOERHJiRKHiIjkRIlDRERyosQhIiI5UeIQEZGcxMIOQErLw5u6wg5BREKmFoeIiOREiUNERHKixCEiIjlR4hARkZwocYiISE6UOEREJCdKHCIikhMlDhERyYkSh4iI5ESJQ0REcpLXxGFmt5nZbjPrNLP7p3k8bmaPBo9vMrOOoPxWM9tqZtuDn2+dtM+1QXmnmX3OzCyfr0FERF4tb4nDzKLAF4B3AOuAO81s3ZRq9wCn3P1i4LPAp4Pyk8C73f1K4G7goUn7/DXwEWBNcLstX69BREReK58tjuuBTnff7+7jwCPAhil1NgAPBtuPA7eYmbn7L9z9aFC+A6gOWieLgAZ3f87dHfga8N48vgYREZkin4ljCfDKpPuHg7Jp67h7AugHmqfUeR+wzd3HgvqHZzkmAGZ2r5ltMbMtPT095/0iRETk1Yp6cNzMLifdffXRXPd19y+6+3p3X9/a2jr3wclr9A6OselALxPJVNihiEge5fN6HEeAZZPuLw3Kpqtz2MxiQCPQC2BmS4FvAx9y932T6i+d5ZgSgh/uPM7P9vSQcti0v4+7blgedkgikif5bHFsBtaY2UozqwTuADZOqbOR9OA3wPuBn7i7m9kC4HvA/e7+L5nK7n4MOGNmbwhmU30I+E4eX4Nk4cDJIX66u4crljTygfXL6B+Z4Jubu0gPQ4nIfJO3xBGMWdwHPAnsAh5z9x1m9oCZvSeo9hWg2cw6gY8DmSm79wEXA580sxeCW1vw2G8DXwY6gX3A9/P1GmR2KXee2H6MhqoY//71S7lq2QLeeeUijp4e5ce7usMOT0TyIK+XjnX3J4AnppR9ctL2KHD7NPv9MfDHMxxzC3DF3EYq52vn0TMcOT3C+69dSmUs/XfI1csW8PTubv78x3u45bI2dKqNyPxS1IPjUvx+8cppGqpiXL1swdmyaMR4y9pWXjpyhucP9IUYnYjkgxKHnLfRiSR7TwxwxZJGIlNaFVcsaaQyFuEHO46HFJ2I5IsSh5y3l4+fIZFyrlzS+JrH4rEoN69p4Yc7TmiQXGSeyesYh8xv24+coaEqxrKmmmkfb6iq4MjpEf7sh3tYvKAagA9qmq5IyVOLQ85LIpli74kBLl/82m6qjEsXNWDAjqNnChuciOSVEoecl8OnRkiknNWtdTPWqYvHWNFcw54TAwWMTETyTYlDzsuB3iEAOpqn76bKWNlSy7H+EcYSyUKEJSIFoMQh5+XgySHaG+LUxM89TNbRXEvKoatvuECRiUi+KXFIzpIp51DvMB3NtbPWXd5UgwEHTypxiMwXShySs2P9I4wnU6xsmT1xxCuiLF5QzcGga0tESp8Sh+Ts4MnM+MbsiSNdr4ZX+oZJpLTcush8oMQhOTt8eoTG6goaqiuyqr+iuZZEyjl6ejTPkYlIIShxSM6Onh49e0JfNjInCB45pXEOkflAiUNyMjaRpHdwjCULqrLep6EqRm08xhG1OETmBSUOycmx/lEccmpxmBlLFlRx9PRI/gITkYJR4pCcHO1Pf/nnkjgy9bsHRhmd0ImAIqVOiUNycvT0CPXxGA1V2Q2MZyxurCbl8PJxLT8iUuqUOCQnR06P5NzaAFiyML3P9iP9cx2SiBSYEodkbXQiSc/AGItzGBjPWFBdQXVFlB1KHCIlT4lDsra/Z4iUQ3tD7okjPUBezUtHlThESp0Sh2Qtszz6+SQOgIsaq9h7YpBkSlcEFCllShyStT0nBogYNNdVntf+7Q1xxhIpDmndKpGSpsQhWdtzYpCWujixyPl9bDItld2aWSVS0nTNccna3u4B2s6zmwqgrb4KA/5+62FODU+cLdd1yEVKi1ockpWR8SRdfcO018fP+xiVsQhNtZWcOKOlR0RKmRKHZKWzexB3LqjFAenuKiUOkdKmxCFZOTuj6gJaHJCeWdU7OM5EUtfmEClVShySlb3dg1REjea6C0sc7Q1VONA9MDY3gYlIwSlxSFb29QyyormWaMQu6DiZFou6q0RKlxKHZGV/zyCrsrjG+Gya6+JEDHrU4hApWUocMqtEMkVX3zCrWusu+FjRiNFcG1fiEClhShwyq1dOjTCRdFa1XniLA6C1XolDpJQpccisDpwcBGD1HCaO3qExrVklUqLymjjM7DYz221mnWZ2/zSPx83s0eDxTWbWEZQ3m9nTZjZoZp+fss9Pg2O+ENza8vkaJL0qLsCqlgvvqgJoq4+TcugdUqtDpBTlLXGYWRT4AvAOYB1wp5mtm1LtHuCUu18MfBb4dFA+CnwC+P0ZDn+Xu18d3LrnPnqZbF/PEAtrKlhYe36LG07VGsysUneVSGnKZ4vjeqDT3fe7+zjwCLBhSp0NwIPB9uPALWZm7j7k7s+QTiASsv09g3MyMJ7RWqfEIVLK8pk4lgCvTLp/OCibto67J4B+oDmLY/9t0E31CTOb9sQCM7vXzLaY2Zaenp7co5ez9p8cYuUcTMXNiFdEaayuUOIQKVGlODh+l7tfCdwU3H5zukru/kV3X+/u61tbWwsa4HwyMDpBz8DYnM2oymiti+vscZESlc/EcQRYNun+0qBs2jpmFgMagd5zHdTdjwQ/B4CHSXeJSZ4c6h0GYGXzHCeO+jg9g2O4a2aVSKnJKnGY2T+Y2a+ZWS6JZjOwxsxWmlklcAewcUqdjcDdwfb7gZ/4Ob5JzCxmZi3BdgXwLuClHGKSHHX1pRPH8uaaOT1ua32c8USKM6OJOT2uiORfthdy+ivgPwCfM7O/B/7W3Xefawd3T5jZfcCTQBT4qrvvMLMHgC3uvhH4CvCQmXUCfaSTCwBmdhBoACrN7L3A24BDw
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
}
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data = full_data[(full_data['diff'] > -40) & (full_data['diff'] < 40)]"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "LVqw0yno1otY"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"sns.distplot(full_data['diff'])"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/",
|
|||
|
"height": 351
|
|||
|
},
|
|||
|
"id": "YeVaBMq913Xg",
|
|||
|
"outputId": "c6c89ee9-0b4f-4bf4-86f7-52f2940411dc"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
|
|||
|
" warnings.warn(msg, FutureWarning)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<matplotlib.axes._subplots.AxesSubplot at 0x7f6c3ad75d90>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 108
|
|||
|
},
|
|||
|
{
|
|||
|
"output_type": "display_data",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 432x288 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEGCAYAAABy53LJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dd3xc5Zn3/8+lUe/dsiVbknvFxpZtESB0MKGYhGY6DxDCD9jkSYcUNuFhd0M2CykQdinJUmKMMRgMMZgOJrhI7g3bsqzqpmZJtqw61++PGRMhZFmyNTqamev9eumlmTPnjL7HlnTp3Pd97ltUFWOMMaa3QpwOYIwxxr9Y4TDGGNMnVjiMMcb0iRUOY4wxfWKFwxhjTJ+EOh1gIKSmpmpOTo7TMYwxxq+sWbOmWlXTum4PisKRk5NDYWGh0zGMMcaviEhpd9utqcoYY0yfWOEwxhjTJ1Y4jDHG9IkVDmOMMX1ihcMYY0yfWOEwxhjTJ1Y4jDHG9IkVDmOMMX1ihcMYY0yfBMWd48YMhPmryvp8zPWzR/ggiTG+ZVccxhhj+sQKhzHGmD6xwmGMMaZPrHAYY4zpEyscxhhj+sQKhzHGmD6xwmGMMaZPrHAYY4zpEyscxhhj+sQKhzHGmD6xwmGMMaZPrHAYY4zpEyscxhhj+sQKhzHGmD6xwmGMMaZPfFo4RGSOiGwXkSIRua+b1yNE5CXv66tEJMe7/QIRWSMim7yfz+10zEfe91zv/Uj35TkYY4z5Mp8t5CQiLuBx4AKgAigQkSWqurXTbrcDdao6WkTmAQ8D1wLVwGWqukdEJgPLgMxOx92gqoW+ym5Mf+pwK9v2NrBtbwOltU00HGkjJERIjApj54FGrpyexeTMBKdjGtNrvlwBcBZQpKrFACKyAJgLdC4cc4FfeR8vAh4TEVHVdZ322QJEiUiEqrb4MK8x/cqtytrSOj7cfoC6pjaiwlyMSoth0tB43KpUHWrhb6vK+Os/Srho0hB+fflkMhIinY5tzHH5snBkAuWdnlcAs4+1j6q2i0g9kILniuOoK4G1XYrGX0WkA3gFeEhVtesXF5E7gTsBRoyw5TnNwKpubGHhmnIq6o4wPCmKS6YMY/zQOEJEvrTfpVOH8tdPS/jvj3dx0e8/4b+unsr5E4c4lNqY3hnUneMiMglP89V3Om2+QVWnAGd6P27q7lhVfVJV81Q1Ly0tzfdhjQHUe5Xx2IdF1Bxq5eoZWdx11igmDov/StEAiI8M43vnj2Hp985kRHI0dz5fyN9WlTqQ3Jje8+UVRyUwvNPzLO+27vapEJFQIAGoARCRLGAxcLOq7jp6gKpWej83ish8PE1iz/nqJIzprdZ2N6+tr2R9+UFyU2O4Jm84CVFhPR4zf1XZF4+vnJ5Fa7ubny/ezKriWvJHpnR7zPWz7QraOMuXVxwFwBgRyRWRcGAesKTLPkuAW7yPrwI+UFUVkUTg78B9qvqPozuLSKiIpHofhwGXApt9eA7G9Mq++maeXL6LDeUHOX9COrefkXvcotFVeGgIN+ZnMz4jjjc27GFD+UEfpTXm5PiscKhqO3AvnhFR24CFqrpFRB4Ukcu9uz0DpIhIEfAD4OiQ3XuB0cADXYbdRgDLRGQjsB7PFctTvjoHY3pjY8VBLn/sU2oOtXLTadmcO35It81SveEKEa6bNYLslBheWVtBeW1TP6c15uRJN/3KAScvL08LC230rul/727dz7+8uJbU2Ai+NT2LjPj+GRV1uKWdP39URIdbueec0cRF/vPqxZqqzEARkTWqmtd1uy/7OIwZFDr3I/RWb345v1RQxv2vbmJKViLP3JLHO1v2n0i8bsVEhHJjfjZPfLSLV9dWcvNp2cgJXsUY098G9agqYwYjVeXxD4v46SubOGNMGvPvmE1qbES/f52hCVFcPDmD7fsbWbW7tt/f35gTZYXDmD5wu5Vfv7GV/1y2nW+emskzt+QRE+G7C/f8kSmMHRLLW5v3cqCx2Wdfx5i+sMJhTC+1d7j5/sL1/O9nJdxxRi7/dfVUwly+/RESEb41PYswVwgLC8tpd7t9+vWM6Q0rHMb0Qodb+dHLG3h9/R5+Mmccv7h0IiEhA9PnEB8ZxrdOzWTPwWY+2VE1IF/TmJ5Y4TDmODrcyo8XbeC19Xv48UXjuPvs0QOeYeKwBKZkJvDR9ipKqg8P+Nc3pjMrHMb0QFX5xWubeXVtJT+4YCz3nDPwReOoS6YMxRUi/PL1zQTDMHozeFnhMKYHTy0v5sXVZdx99ii+e94YR7PER4Vx4cQhLN9ZzRsb9zqaxQQ3KxzGHMN7W/fzH299ziWnDOVHF45zOg4As0emcEpWAg++sZX6I21OxzFBygqHMd3YtreB7y1Yx5TMBH531dQB6wg/nhAR/v2bU6g93MIf3tvpdBwTpKxwGNNFc1sH336ukNjIUJ66OY+ocJfTkb5kcmYC184cznMrSiiuOuR0HBOErHAY04mq8tr6SvbWN/PnG2YwpJ/mnupvP7hgHJFhLv596edORzFByAqHMZ1sqDjIxop6vn/+GGZkJzkd55jS4iK4+5xRvLdtP58VVR//AGP6kRUOY7wOtbTzxoa9jEiO5v9z4F6Nvrrt9FyykqJ48M2tdLhteK4ZOFY4jPF6Y8MeWjvcfOvUTFyDpDO8J5FhLu6/eAKf72tk0Zpyp+OYIGKFwxig6MAhNlXWc864NNIHab9Gd74xJYNpwxP5w3s7aW7rcDqOCRJWOEzQ63Arb27cQ3JMOGeOSXM6Tp+ICD+5aBx76ptPaN0RY06ELeRkgt6q3TUcaGzhpvxsn8922x+6KxCj0mL4r3e2IwIRoV8dPmyrBpr+NPh/Sozxoea2Dj74/AAj02IYnxHndJwTduHEDA63dvDZrhqno5ggYIXDBLVPi6ppau1gzqQMv16adXhyNBOGxrN8ZxVNre1OxzEBzpqqTNBqamnn06JqJmcmkJUU/aXX/LG/4IIJQ/jTBw18urOaCydlOB3HBDC74jBB67PiGlrb3Zw3Pt3pKP0iIyGSSZkJrCiu4UirjbAyvmOFwwSllrYOVuyqYeLQ+EE7rciJOHtsGi3tblYUW1+H8R0rHCYorS6p5UhbB2eN9a/ht8czLDGK8Rlx/KOompZ2u+owvmGFwwSdtg43n+6sZnRaLMOTo49/gJ85e1w6R9o6WL271ukoJkBZ4TBBZ21ZHY0t7Zw1LrCuNo4akRzNyLQYPt1ZTVuH2+k4JgBZ4TBBpcOtfLKjiuFJUYxMjXE6js+cMy6dxpZ21pTWOR3FBCArHCaobNvbQF1TG18fm+bX920cz8jUGLKSovhHUTVutZlzTf+ywmGCyoriGhKjw5gwNN7pKD4lIpwxOpWaw61s39fodBwTYHxaOERkjohsF5EiEbmvm9cjROQl7+urRCTHu/0CEVkjIpu8n8/tdMwM7/YiEfmjBPKfjaZf7a0/wu7qw+TnphASBN82k4YlkBAVxqe20JPpZz4rHCLiAh4HLgYmAteJyMQuu90O1KnqaOBR4GHv9mrgMlWdAtwCPN/pmCeAbwNjvB9zfHUOJrCsLK4hzCXk5Qzelf36kytE+NqoFHZXH2ZzZb3TcUwA8eUVxyygSFWLVbUVWADM7bLPXOBZ7+NFwHkiIqq6TlX3eLdvAaK8VydDgXhVXamqCjwHXOHDczABoqm1nfXlB5malUh0ePDMtJOXnUy4K4S/fLrb6SgmgPiycGQCnZclq/Bu63YfVW0H6oGULvtcCaxV1Rbv/hXHeU8AROROESkUkcKqqqoTPgkTGApL6mjrUE4b1fXbK7BFhbuYkZPEGxv3cKCh2ek4JkAM6s5xEZmEp/nqO309VlWfVNU8Vc1LSwvM8fqmd9yqrNpdQ05KDEMTopyOM+C+NjKFdrfy3IpSp6OYAOHLwlEJDO/0PMu7rdt9RCQUSABqvM+zgMXAzaq6q9P+Wcd5T2O+ZOf+Ruqa2oLuauOolNgIzhufzoKCMlrb7YZAc/J8WTgKgDEikisi4cA8YEmXfZbg6fwGuAr4QFVVRBKBv
|
|||
|
},
|
|||
|
"metadata": {
|
|||
|
"needs_background": "light"
|
|||
|
}
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"from sklearn.metrics import mean_absolute_percentage_error\n",
|
|||
|
"1-mean_absolute_percentage_error(full_data['price'], full_data['pred_price'])"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "Eo2R38Mo14Rk",
|
|||
|
"outputId": "3bfa5396-5357-4d10-96c4-31df7e9cd2cc"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0.8755430954272614"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 110
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"marker = []\n",
|
|||
|
"for diff in full_data['diff']:\n",
|
|||
|
" if diff >= 3:\n",
|
|||
|
" marker.append('overpriced')\n",
|
|||
|
" elif diff <= -3:\n",
|
|||
|
" marker.append('underpriced')\n",
|
|||
|
" else:\n",
|
|||
|
" marker.append('riskey')"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "iuDeTnh72Bpi"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data['marker'] = marker"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/"
|
|||
|
},
|
|||
|
"id": "nuCT6GnB22If",
|
|||
|
"outputId": "6097b72e-dcaa-4eac-c497-d8f6c3d8645c"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "stream",
|
|||
|
"name": "stderr",
|
|||
|
"text": [
|
|||
|
"<ipython-input-113-c38300c6d6bf>:1: SettingWithCopyWarning: \n",
|
|||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|||
|
"\n",
|
|||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|||
|
" full_data['marker'] = marker\n"
|
|||
|
]
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data = full_data[~full_data['marker'].isin(['riskey'])]"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "i97F-t_Q28x7"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data.to_csv('vtorichka_final.csv', index=False)"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"id": "ad4O7c_929iE"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"full_data"
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"colab": {
|
|||
|
"base_uri": "https://localhost:8080/",
|
|||
|
"height": 487
|
|||
|
},
|
|||
|
"id": "qNGLilGP3Eoq",
|
|||
|
"outputId": "c8887e92-129b-4374-dfd5-c694eb0179b7"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"output_type": "execute_result",
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" link floor rooms_count \\\n",
|
|||
|
"1 https://krasnodar.cian.ru/sale/flat/279661943/ 20 1 \n",
|
|||
|
"20 https://krasnodar.cian.ru/sale/flat/283765339/ 5 1 \n",
|
|||
|
"58 https://krasnodar.cian.ru/sale/flat/280008092/ 10 2 \n",
|
|||
|
"72 https://krasnodar.cian.ru/sale/flat/283282828/ 6 2 \n",
|
|||
|
"73 https://krasnodar.cian.ru/sale/flat/283383338/ 6 1 \n",
|
|||
|
".. ... ... ... \n",
|
|||
|
"104 https://krasnodar.cian.ru/sale/flat/277400873/ 5 2 \n",
|
|||
|
"106 https://krasnodar.cian.ru/sale/flat/279004402/ 3 1 \n",
|
|||
|
"112 https://krasnodar.cian.ru/sale/flat/280209896/ 15 1 \n",
|
|||
|
"118 https://krasnodar.cian.ru/sale/flat/273790417/ 2 1 \n",
|
|||
|
"130 https://krasnodar.cian.ru/sale/flat/257283764/ 4 2 \n",
|
|||
|
"\n",
|
|||
|
" total_meters price year_of_construction living_meters \\\n",
|
|||
|
"1 43.0 4950000 2021 18.2 \n",
|
|||
|
"20 40.0 4100000 2006 22.9 \n",
|
|||
|
"58 61.2 7150000 2017 11.5 \n",
|
|||
|
"72 63.4 4530000 2010 31.5 \n",
|
|||
|
"73 41.5 3500000 2017 19.7 \n",
|
|||
|
".. ... ... ... ... \n",
|
|||
|
"104 50.0 4900000 1965 28.0 \n",
|
|||
|
"106 39.1 4900000 2017 18.0 \n",
|
|||
|
"112 40.5 4900000 2018 16.0 \n",
|
|||
|
"118 41.7 4900000 2012 22.0 \n",
|
|||
|
"130 60.0 5800000 2014 34.0 \n",
|
|||
|
"\n",
|
|||
|
" kitchen_meters pred_price diff marker \n",
|
|||
|
"1 14.8 4.283516e+06 -13.464333 underpriced \n",
|
|||
|
"20 9.7 3.961314e+06 -3.382591 underpriced \n",
|
|||
|
"58 11.2 4.384590e+06 -38.677063 underpriced \n",
|
|||
|
"72 16.8 4.974593e+06 9.814404 overpriced \n",
|
|||
|
"73 10.8 4.035056e+06 15.287324 overpriced \n",
|
|||
|
".. ... ... ... ... \n",
|
|||
|
"104 -1.0 4.098767e+06 -16.351694 underpriced \n",
|
|||
|
"106 10.0 3.849727e+06 -21.434151 underpriced \n",
|
|||
|
"112 13.0 4.065052e+06 -17.039765 underpriced \n",
|
|||
|
"118 10.0 3.836344e+06 -21.707273 underpriced \n",
|
|||
|
"130 10.0 4.615458e+06 -20.423139 underpriced \n",
|
|||
|
"\n",
|
|||
|
"[682 rows x 11 columns]"
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"\n",
|
|||
|
" <div id=\"df-a6cf0fdf-dab1-46b9-bd08-d0381acd0037\">\n",
|
|||
|
" <div class=\"colab-df-container\">\n",
|
|||
|
" <div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>link</th>\n",
|
|||
|
" <th>floor</th>\n",
|
|||
|
" <th>rooms_count</th>\n",
|
|||
|
" <th>total_meters</th>\n",
|
|||
|
" <th>price</th>\n",
|
|||
|
" <th>year_of_construction</th>\n",
|
|||
|
" <th>living_meters</th>\n",
|
|||
|
" <th>kitchen_meters</th>\n",
|
|||
|
" <th>pred_price</th>\n",
|
|||
|
" <th>diff</th>\n",
|
|||
|
" <th>marker</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/279661943/</td>\n",
|
|||
|
" <td>20</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>43.0</td>\n",
|
|||
|
" <td>4950000</td>\n",
|
|||
|
" <td>2021</td>\n",
|
|||
|
" <td>18.2</td>\n",
|
|||
|
" <td>14.8</td>\n",
|
|||
|
" <td>4.283516e+06</td>\n",
|
|||
|
" <td>-13.464333</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/283765339/</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>4100000</td>\n",
|
|||
|
" <td>2006</td>\n",
|
|||
|
" <td>22.9</td>\n",
|
|||
|
" <td>9.7</td>\n",
|
|||
|
" <td>3.961314e+06</td>\n",
|
|||
|
" <td>-3.382591</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>58</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/280008092/</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>61.2</td>\n",
|
|||
|
" <td>7150000</td>\n",
|
|||
|
" <td>2017</td>\n",
|
|||
|
" <td>11.5</td>\n",
|
|||
|
" <td>11.2</td>\n",
|
|||
|
" <td>4.384590e+06</td>\n",
|
|||
|
" <td>-38.677063</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>72</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/283282828/</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>63.4</td>\n",
|
|||
|
" <td>4530000</td>\n",
|
|||
|
" <td>2010</td>\n",
|
|||
|
" <td>31.5</td>\n",
|
|||
|
" <td>16.8</td>\n",
|
|||
|
" <td>4.974593e+06</td>\n",
|
|||
|
" <td>9.814404</td>\n",
|
|||
|
" <td>overpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>73</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/283383338/</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>41.5</td>\n",
|
|||
|
" <td>3500000</td>\n",
|
|||
|
" <td>2017</td>\n",
|
|||
|
" <td>19.7</td>\n",
|
|||
|
" <td>10.8</td>\n",
|
|||
|
" <td>4.035056e+06</td>\n",
|
|||
|
" <td>15.287324</td>\n",
|
|||
|
" <td>overpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>104</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/277400873/</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>50.0</td>\n",
|
|||
|
" <td>4900000</td>\n",
|
|||
|
" <td>1965</td>\n",
|
|||
|
" <td>28.0</td>\n",
|
|||
|
" <td>-1.0</td>\n",
|
|||
|
" <td>4.098767e+06</td>\n",
|
|||
|
" <td>-16.351694</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>106</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/279004402/</td>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>39.1</td>\n",
|
|||
|
" <td>4900000</td>\n",
|
|||
|
" <td>2017</td>\n",
|
|||
|
" <td>18.0</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>3.849727e+06</td>\n",
|
|||
|
" <td>-21.434151</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>112</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/280209896/</td>\n",
|
|||
|
" <td>15</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>40.5</td>\n",
|
|||
|
" <td>4900000</td>\n",
|
|||
|
" <td>2018</td>\n",
|
|||
|
" <td>16.0</td>\n",
|
|||
|
" <td>13.0</td>\n",
|
|||
|
" <td>4.065052e+06</td>\n",
|
|||
|
" <td>-17.039765</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>118</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/273790417/</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>41.7</td>\n",
|
|||
|
" <td>4900000</td>\n",
|
|||
|
" <td>2012</td>\n",
|
|||
|
" <td>22.0</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>3.836344e+06</td>\n",
|
|||
|
" <td>-21.707273</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>130</th>\n",
|
|||
|
" <td>https://krasnodar.cian.ru/sale/flat/257283764/</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>60.0</td>\n",
|
|||
|
" <td>5800000</td>\n",
|
|||
|
" <td>2014</td>\n",
|
|||
|
" <td>34.0</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>4.615458e+06</td>\n",
|
|||
|
" <td>-20.423139</td>\n",
|
|||
|
" <td>underpriced</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>682 rows × 11 columns</p>\n",
|
|||
|
"</div>\n",
|
|||
|
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a6cf0fdf-dab1-46b9-bd08-d0381acd0037')\"\n",
|
|||
|
" title=\"Convert this dataframe to an interactive table.\"\n",
|
|||
|
" style=\"display:none;\">\n",
|
|||
|
" \n",
|
|||
|
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
|||
|
" width=\"24px\">\n",
|
|||
|
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
|
|||
|
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
|
|||
|
" </svg>\n",
|
|||
|
" </button>\n",
|
|||
|
" \n",
|
|||
|
" <style>\n",
|
|||
|
" .colab-df-container {\n",
|
|||
|
" display:flex;\n",
|
|||
|
" flex-wrap:wrap;\n",
|
|||
|
" gap: 12px;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .colab-df-convert {\n",
|
|||
|
" background-color: #E8F0FE;\n",
|
|||
|
" border: none;\n",
|
|||
|
" border-radius: 50%;\n",
|
|||
|
" cursor: pointer;\n",
|
|||
|
" display: none;\n",
|
|||
|
" fill: #1967D2;\n",
|
|||
|
" height: 32px;\n",
|
|||
|
" padding: 0 0 0 0;\n",
|
|||
|
" width: 32px;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .colab-df-convert:hover {\n",
|
|||
|
" background-color: #E2EBFA;\n",
|
|||
|
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
|||
|
" fill: #174EA6;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" [theme=dark] .colab-df-convert {\n",
|
|||
|
" background-color: #3B4455;\n",
|
|||
|
" fill: #D2E3FC;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" [theme=dark] .colab-df-convert:hover {\n",
|
|||
|
" background-color: #434B5C;\n",
|
|||
|
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
|||
|
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
|||
|
" fill: #FFFFFF;\n",
|
|||
|
" }\n",
|
|||
|
" </style>\n",
|
|||
|
"\n",
|
|||
|
" <script>\n",
|
|||
|
" const buttonEl =\n",
|
|||
|
" document.querySelector('#df-a6cf0fdf-dab1-46b9-bd08-d0381acd0037 button.colab-df-convert');\n",
|
|||
|
" buttonEl.style.display =\n",
|
|||
|
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
|||
|
"\n",
|
|||
|
" async function convertToInteractive(key) {\n",
|
|||
|
" const element = document.querySelector('#df-a6cf0fdf-dab1-46b9-bd08-d0381acd0037');\n",
|
|||
|
" const dataTable =\n",
|
|||
|
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
|||
|
" [key], {});\n",
|
|||
|
" if (!dataTable) return;\n",
|
|||
|
"\n",
|
|||
|
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
|||
|
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
|||
|
" + ' to learn more about interactive tables.';\n",
|
|||
|
" element.innerHTML = '';\n",
|
|||
|
" dataTable['output_type'] = 'display_data';\n",
|
|||
|
" await google.colab.output.renderOutput(dataTable, element);\n",
|
|||
|
" const docLink = document.createElement('div');\n",
|
|||
|
" docLink.innerHTML = docLinkHtml;\n",
|
|||
|
" element.appendChild(docLink);\n",
|
|||
|
" }\n",
|
|||
|
" </script>\n",
|
|||
|
" </div>\n",
|
|||
|
" </div>\n",
|
|||
|
" "
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"execution_count": 119
|
|||
|
}
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"source": [],
|
|||
|
"metadata": {
|
|||
|
"id": "b3tUDuIN3iFr"
|
|||
|
},
|
|||
|
"execution_count": null,
|
|||
|
"outputs": []
|
|||
|
}
|
|||
|
]
|
|||
|
}
|