Mirror of https://github.com/magnum-opus-nn-cp/ml.git (synced 2024-11-10 18:26:35 +03:00)

Commit e548bd4ad7: upload ml

README.md (new file, 10 lines)
@ -0,0 +1,10 @@
### Install the dependencies

`pip3 install -r req.txt`

### Training notebook locations

- `catboost-train.ipynb` - notebook for training the CatBoost model
- `bert_train.ipynb` - notebook for training the BERT model
- `nearest-search-train.ipynb` - notebook for training the nearest-neighbour search
- `tfidf-train.ipynb` - notebook for training the TF-IDF + random forest model

### Model inference

`uvicorn inference:app --reload --workers 1`

### The repository has too many large files, so the code together with the model weights is published on Google Drive: https://drive.google.com/drive/folders/1hnWKpZjtQLBbzAE9YsUW_4x-IEb3mFvg?usp=sharing
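For reference, a minimal client call against the running service might look like this. This is a sketch: the endpoint and payload shape come from `inference.py` in this commit; the host and port are uvicorn's defaults and are an assumption.

```python
import requests

# Assumes the service was started locally with:
#   uvicorn inference:app --reload --workers 1
resp = requests.post(
    "http://127.0.0.1:8000/predict",                     # uvicorn default host/port (assumption)
    json={"data": "Кредитный рейтинг обусловлен ..."},   # placeholder press-release text
)
print(resp.json())  # {'answer': ..., 'text': ..., 'longAnswer': ..., 'metric': ...}
```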
annoy_labels.pickle (new file, BIN)
Binary file not shown.
bert_train.ipynb (new file, 287 lines)
@ -0,0 +1,287 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "Requirement already satisfied: transformers in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (4.32.1)\n",
      "Requirement already satisfied: datasets in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (2.14.4)\n",
      "Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (2.1.0)\n",
      "Requirement already satisfied: evaluate in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (0.4.0)\n",
      "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (1.25.2)\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (3.12.3)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (0.16.4)\n",
      "Requirement already satisfied: packaging>=20.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (6.0.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (2023.8.8)\n",
      "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (2.31.0)\n",
      "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (0.13.3)\n",
      "Requirement already satisfied: safetensors>=0.3.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (0.3.3)\n",
      "Requirement already satisfied: tqdm>=4.27 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers) (4.66.1)\n",
      "Requirement already satisfied: pyarrow>=8.0.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (13.0.0)\n",
      "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (0.3.7)\n",
      "Requirement already satisfied: xxhash in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (3.3.0)\n",
      "Requirement already satisfied: multiprocess in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (0.70.15)\n",
      "Requirement already satisfied: fsspec[http]>=2021.11.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (2023.6.0)\n",
      "Requirement already satisfied: aiohttp in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from datasets) (3.8.5)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2023.3)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2023.3)\n",
      "Requirement already satisfied: responses<0.19 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from evaluate) (0.18.0)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (23.1.0)\n",
      "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (3.2.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (6.0.4)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (4.0.3)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.9.2)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.4.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from aiohttp->datasets) (1.3.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.15.1->transformers) (4.7.1)\n",
      "Requirement already satisfied: six>=1.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->transformers) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->transformers) (2.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->transformers) (2023.7.22)\n"
     ]
    }
   ],
   "source": [
    "!pip3 install transformers datasets pandas evaluate numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "label2id = {\n",
    "    'AAA(RU)': 0,\n",
    "    'AA(RU)': 1,\n",
    "    'A+(RU)': 2,\n",
    "    'A(RU)': 3,\n",
    "    'A-(RU)': 4,\n",
    "    'BBB+(RU)': 5,\n",
    "    'BBB(RU)': 6,\n",
    "    'AA+(RU)': 7,\n",
    "    'BBB-(RU)': 8,\n",
    "    'AA-(RU)': 9,\n",
    "    'BB+(RU)': 10,\n",
    "    'BB-(RU)': 11,\n",
    "    'B+(RU)': 12,\n",
    "    'BB(RU)': 13,\n",
    "    'B(RU)': 14,\n",
    "    'B-(RU)': 15,\n",
    "    'C(RU)': 16\n",
    "}\n",
    "id2label = {\n",
    "    0: 'AAA(RU)',\n",
    "    1: 'AA(RU)',\n",
    "    2: 'A+(RU)',\n",
    "    3: 'A(RU)',\n",
    "    4: 'A-(RU)',\n",
    "    5: 'BBB+(RU)',\n",
    "    6: 'BBB(RU)',\n",
    "    7: 'AA+(RU)',\n",
    "    8: 'BBB-(RU)',\n",
    "    9: 'AA-(RU)',\n",
    "    10: 'BB+(RU)',\n",
    "    11: 'BB-(RU)',\n",
    "    12: 'B+(RU)',\n",
    "    13: 'BB(RU)',\n",
    "    14: 'B(RU)',\n",
    "    15: 'B-(RU)',\n",
    "    16: 'C(RU)'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('dss.pickle', 'rb') as file:\n",
    "    data = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': 'Кредитный рейтинг АКБ Энергобанк ПАО далее Энергобанк Банк обусловлен удовлетворительным бизнеспрофилем в сочетании с сильной достаточностью капитала критическим рискпрофилем и удовлетворительной оценкой ликвидности и фондирования учитывающей концентрацию обязательств Банка на средствах крупнейших кредиторовОсновная деятельность Энергобанка сконцентрирована в Республике Татарстан далее РТ где он занимает устойчивые рыночные позиции занял шестое место по размеру активов на На российском банковском рынке Банк имеет невысокую долю на он занимал е место по величине собственного капитала и е место по размеру активовКлючевыми направлениями деятельности Банка являются кредитование предприятий агропромышленного комплекса строительства и торговли а также залоговое розничное кредитование Контролирующими собственниками Банка являются ИН Хайруллин и АН Хайруллин владеющие около акций через АО Эдельвейс КорпорейшнОценка бизнеспрофиля Банка отражает его относительно невысокую долю на российском рынке банковских услуг и выраженную региональную направленность его деятельности несмотря на планы по открытию отделений за пределами РТОперационный доход Банка характеризуется низким хотя и возрастающим уровнем диверсификации по итогам года значение индекса ХерфиндаляХиршмана составило и свидетельствовало о повышенной концентрации на кредитовании корпоративного сектора около операционного дохода Качество управления Банком оценивается АКРА как удовлетворительное и соответствующее среднему уровню в российском банковском секторе в целом Организационная структура Энергобанка соответствует масштабам и особенностям его бизнеса Структура собственности Банка прозрачна при этом отмечается связанность его операций с аффилированными с собственниками Банка компаниямиСтратегия Банка на период до конца года предполагает планомерное наращивание размера активов за',\n",
       " 'label': 11}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoModelForSequenceClassification\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    \"cointegrated/rubert-tiny\", num_labels=len(id2label.keys()), id2label=id2label, label2id=label2id\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"cointegrated/rubert-tiny\")\n",
    "def token(text):\n",
    "    return tokenizer(text['text'], padding=True, truncation=True, max_length=512, return_tensors='pt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 11500/11500 [00:04<00:00, 2842.10 examples/s]\n",
      "Map: 100%|██████████| 1845/1845 [00:00<00:00, 3125.02 examples/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import Dataset\n",
    "import pandas as pd\n",
    "from random import shuffle\n",
    "shuffle(data)\n",
    "train = data[:11500]\n",
    "test = data[11500:]\n",
    "train = Dataset.from_pandas(pd.DataFrame(data=train))\n",
    "test = Dataset.from_pandas(pd.DataFrame(data=test))\n",
    "tokenized_train = train.map(token, batched=True)\n",
    "tokenized_test = test.map(token, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "import numpy as np\n",
    "\n",
    "f1 = evaluate.load(\"f1\")\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return f1.compute(predictions=predictions, references=labels, average='macro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!\n",
      "PyTorch: setting up devices\n",
      "The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n"
     ]
    }
   ],
   "source": [
    "from transformers import TrainingArguments, Trainer, DataCollatorWithPadding\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=\"akra_model\",\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    num_train_epochs=10,\n",
    "    weight_decay=0.01,\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    load_best_model_at_end=True,\n",
    ")\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_train,\n",
    "    eval_dataset=tokenized_test,\n",
    "    tokenizer=tokenizer,\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
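A minimal sketch of loading a checkpoint produced by the `Trainer` above and classifying one press release. The checkpoint directory name is an assumption (the `TrainingArguments` write per-epoch checkpoints under `akra_model/`); `id2label` is stored in the model config by the training cell.

```python
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hypothetical path: substitute the actual checkpoint directory the Trainer wrote.
model = AutoModelForSequenceClassification.from_pretrained("akra_model/checkpoint")
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

text = "Кредитный рейтинг обусловлен ..."  # placeholder press-release text
enc = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    probs = torch.softmax(model(**enc).logits[0], dim=-1).numpy()
print(model.config.id2label[int(np.argmax(probs))], float(np.max(probs)))
```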
catboost-train.ipynb (new file, 161 lines)
@ -0,0 +1,161 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: catboost in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (1.2.1)\n",
      "Requirement already satisfied: graphviz in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (0.20.1)\n",
      "Requirement already satisfied: matplotlib in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (3.7.2)\n",
      "Requirement already satisfied: numpy>=1.16.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (1.25.2)\n",
      "Requirement already satisfied: pandas>=0.24 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (2.1.0)\n",
      "Requirement already satisfied: scipy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (1.11.2)\n",
      "Requirement already satisfied: plotly in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (5.16.1)\n",
      "Requirement already satisfied: six in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from catboost) (1.16.0)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2.8.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2023.3)\n",
      "Requirement already satisfied: tzdata>=2022.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2023.3)\n",
      "Requirement already satisfied: contourpy>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (1.1.0)\n",
      "Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (0.11.0)\n",
      "Requirement already satisfied: fonttools>=4.22.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (4.42.1)\n",
      "Requirement already satisfied: kiwisolver>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (1.4.5)\n",
      "Requirement already satisfied: packaging>=20.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (23.1)\n",
      "Requirement already satisfied: pillow>=6.2.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (10.0.0)\n",
      "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from matplotlib->catboost) (3.0.9)\n",
      "Requirement already satisfied: tenacity>=6.2.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from plotly->catboost) (8.2.3)\n"
     ]
    }
   ],
   "source": [
    "!pip3 install catboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('cb.pickle', 'rb') as file:\n",
    "    cb_dataset = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostClassifier(iterations=50000,\n",
    "                           learning_rate=1e-2,\n",
    "                           depth=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "label2id = {\n",
    "    'AAA(RU)': 0,\n",
    "    'AA(RU)': 1,\n",
    "    'A+(RU)': 2,\n",
    "    'A(RU)': 3,\n",
    "    'A-(RU)': 4,\n",
    "    'BBB+(RU)': 5,\n",
    "    'BBB(RU)': 6,\n",
    "    'AA+(RU)': 7,\n",
    "    'BBB-(RU)': 8,\n",
    "    'AA-(RU)': 9,\n",
    "    'BB+(RU)': 10,\n",
    "    'BB-(RU)': 11,\n",
    "    'B+(RU)': 12,\n",
    "    'BB(RU)': 13,\n",
    "    'B(RU)': 14,\n",
    "    'B-(RU)': 15,\n",
    "    'C(RU)': 16\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = []\n",
    "train_labels = []\n",
    "for i in cb_dataset[0:6500]:\n",
    "    train.append([label2id[i['outs'][0]['answer']], i['outs'][0]['metric'], i['outs'][1]['metric'], label2id[i['outs'][1]['answer']], i['outs'][2]['metric'], label2id[i['outs'][2]['answer']]])\n",
    "    if not isinstance(i['label'], int):\n",
    "        train_labels.append(label2id[i['label'] + '(RU)'])\n",
    "    else:\n",
    "        train_labels.append(i['label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = []\n",
    "test_labels = []\n",
    "for i in cb_dataset[6500:]:\n",
    "    test.append([label2id[i['outs'][0]['answer']], i['outs'][0]['metric'], i['outs'][1]['metric'], label2id[i['outs'][1]['answer']], i['outs'][2]['metric'], label2id[i['outs'][2]['answer']]])\n",
    "    if not isinstance(i['label'], int):\n",
    "        test_labels.append(label2id[i['label'] + '(RU)'])\n",
    "    else:\n",
    "        test_labels.append(i['label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(train, train_labels)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
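The six-dimensional feature rows built above stack the three base models' outputs (class id and confidence from TF-IDF, BERT, and the nearest-neighbour search), so the CatBoost model acts as a stacking meta-classifier. A sketch of scoring the hold-out split and saving the model once `fit` has finished; `sklearn` is assumed available (it is pinned in `req.txt`), and the file name `catboost` matches what `inference.py` loads:

```python
from sklearn.metrics import f1_score

preds = model.predict(test)  # one class id per row, shape (n, 1)
print("macro F1:", f1_score(test_labels, preds.ravel().astype(int), average="macro"))

# Persist under the name the inference service expects.
model.save_model("catboost")
```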
inference.py (new file, 529 lines)
@ -0,0 +1,529 @@
import re
import pickle
from collections import Counter
from math import inf

import numpy as np
import torch
from annoy import AnnoyIndex
from captum.attr import LayerIntegratedGradients
from catboost import CatBoostClassifier
from fastapi import FastAPI
from matplotlib.colors import LinearSegmentedColormap
from pydantic import BaseModel
from pymystem3 import Mystem
from sentence_transformers import SentenceTransformer
# Imported so the pickled TF-IDF vectorizer and random forest unpickle cleanly.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertConfig

# LaBSE sentence encoder used by the nearest-neighbour search.
sentence_model = SentenceTransformer('sentence-transformers/LaBSE')

# CatBoost meta-model that stacks the outputs of the three base models.
catboost = CatBoostClassifier().load_model('catboost')


def get_embs(text):
    return sentence_model.encode(text)


# White-to-green colormap for token highlighting.
cmap = LinearSegmentedColormap.from_list('rg', ["w", "g"], N=512)
mstm = Mystem()

with open('vectorizer.pickle', 'rb') as file:
    model_tfidf = pickle.load(file)

with open('tree.pickle', 'rb') as file:
    cls = pickle.load(file)


def resolve_text(tokens, text):
    # Wrap words whose lemmas carry high TF-IDF weight in <span> markup.
    words = text.split()
    tokens_values = list(map(lambda tok: tok[0], tokens))
    tokens_metrics = list(map(lambda tok: tok[1], tokens))
    resolved = []
    for word in words:
        try:
            if mstm.lemmatize(word)[0] in tokens_values:
                try:
                    value = tokens_metrics[tokens_values.index(mstm.lemmatize(word)[0])]
                    resolved.append(f'<span data-value="{(value - min(tokens_metrics)) / max(tokens_metrics)}">{word}</span>')
                except Exception:
                    resolved.append(word)
            else:
                resolved.append(word)
        except Exception:
            resolved.append(word)
    return ' '.join(resolved)


def process_classify(text):
    # TF-IDF + random forest prediction for one text chunk.
    if not len(text.replace(' ', '')):
        return {'ans': 0, 'text': ''}
    try:
        normalized = ''.join(mstm.lemmatize(text)[:-1])
    except Exception:
        return {'ans': 0, 'text': ''}
    tf_idfed = model_tfidf.transform(np.array([normalized]))[0]
    ans = cls.predict(tf_idfed)[0]
    return {'ans': ans, 'text': ""}


def process_embedding(text):
    # Same prediction as process_classify, plus token-importance markup.
    if not len(text.replace(' ', '')):
        return {'ans': 0, 'text': ''}
    try:
        normalized = ''.join(mstm.lemmatize(text)[:-1])
    except Exception:
        return {'ans': 0, 'text': ''}
    tf_idfed = model_tfidf.transform(np.array([normalized]))[0]
    values = [tf_idfed.todense()[0, i] for i in range(5000)]

    important_tokens = []
    for i, val in enumerate(values):
        if val > (np.min(values) + np.max(values)) / 3:
            important_tokens.append((val, i))
    tokens = model_tfidf.get_feature_names_out()
    tokens = list(map(lambda x: (tokens[x[1]], x[0]), reversed(sorted(important_tokens))))

    ans = cls.predict(tf_idfed)[0]
    text = resolve_text(tokens, text)
    return {'ans': ans, 'text': text}


label2id = {
    'AAA(RU)': 0,
    'AA(RU)': 1,
    'A+(RU)': 2,
    'A(RU)': 3,
    'A-(RU)': 4,
    'BBB+(RU)': 5,
    'BBB(RU)': 6,
    'AA+(RU)': 7,
    'BBB-(RU)': 8,
    'AA-(RU)': 9,
    'BB+(RU)': 10,
    'BB-(RU)': 11,
    'B+(RU)': 12,
    'BB(RU)': 13,
    'B(RU)': 14,
    'B-(RU)': 15,
    'C(RU)': 16
}
id2label = {
    0: 'AAA(RU)',
    1: 'AA(RU)',
    2: 'A+(RU)',
    3: 'A(RU)',
    4: 'A-(RU)',
    5: 'BBB+(RU)',
    6: 'BBB(RU)',
    7: 'AA+(RU)',
    8: 'BBB-(RU)',
    9: 'AA-(RU)',
    10: 'BB+(RU)',
    11: 'BB-(RU)',
    12: 'B+(RU)',
    13: 'BB(RU)',
    14: 'B(RU)',
    15: 'B-(RU)',
    16: 'C(RU)'
}


def get_distance(emb1, emb2):
    # Inverse squared distance between the rescaled embeddings; identical
    # vectors give inf, which get_nearest_value treats as an exact match.
    emb2 /= np.sum(emb2**2)
    emb1 /= np.sum(emb1**2)
    return 1 / abs(np.dot(emb2 - emb1, emb1 - emb2))


with open('new_embeddings.pickle', 'rb') as file:
    new_embeddings = pickle.load(file)

with open('annoy_labels.pickle', 'rb') as file:
    labels = pickle.load(file)

with open('n_labels.pickle', 'rb') as file:
    n_labels = pickle.load(file)

index = AnnoyIndex(768, 'angular')
index.load('nearest.annoy')


def get_nearest_value(embeddings):
    # Fetch the 20 nearest press releases and vote, weighting each
    # neighbour's label by its inverse distance.
    items = list(map(lambda x: (
        labels[x],
        get_distance(embeddings, new_embeddings[x]),
        list(n_labels)[x]
    ),
        index.get_nns_by_vector(embeddings, 20)
    ))
    weights = np.zeros(17)  # float accumulator; an int array would truncate the metrics
    refs = [[] for _ in range(17)]
    s = 0
    for item in items:
        if item[1] == inf:
            return id2label[item[0]], 100, [item[2]]
        s += item[1]
        weights[item[0]] += item[1]
        refs[item[0]].append(item[2])
    return id2label[np.argmax(weights)], (weights[np.argmax(weights)] / s) * 100, refs[np.argmax(weights)]


def to_rgb(vals):
    return f'rgb({int(vals[0]*255)}, {int(vals[1]*255)}, {int(vals[2]*255)})'


def from_abs_to_rgb(min, max, value):
    return to_rgb(cmap((value - min) / max))


def get_nns_tokens(encoding, attrs, predicted_id):
    # Ten highest-attribution positions, each with a 10-token context window.
    current_array = map(
        lambda x: (tokenizer.convert_ids_to_tokens(encoding['input_ids'][0][x[0]-5:x[0]+5]), x[1]),
        list(
            reversed(
                sorted(
                    enumerate(
                        attrs[0][predicted_id].numpy()
                    ),
                    key=lambda x: x[1]
                )
            )
        )[0:10]
    )
    return list(current_array)


def get_description_interpreting(attrs):
    # Package the attribution vector with its min/max for normalization.
    positive_weights = attrs
    return {
        'positive_weights': (
            positive_weights,
            {
                'min': np.min(positive_weights),
                'max': np.max(positive_weights)
            }
        ),
    }


def transform_token_ids(func_data, token_ids, word):
    # Wrap the word in <span> markup when its mean positive attribution is high enough.
    weights = [func_data['positive_weights'][0][i] for i in token_ids]
    wts = []
    for i in range(len(weights)):
        if weights[i] > 0:
            mn = max(func_data['positive_weights'][1]['min'], 0)
            mx = func_data['positive_weights'][1]['max']
            wts.append((weights[i] - mn) / mx)
    try:
        if sum(wts) / len(wts) >= 0.2:
            return f'<span data-value={sum(wts) / len(wts)}>{word}</span>'
    except Exception:
        pass
    return word


def build_text(tokens, func_data, current_text):
    # Map sub-word attributions back onto the whitespace-split source text.
    splitted_text = current_text.split()
    splitted_text_iterator = 0
    current_word = ''
    current_word_ids = []
    for i, tok in enumerate(tokens):
        decoded = tokenizer.convert_ids_to_tokens([tok])[0]
        if decoded == '[CLS]':
            continue
        if not len(current_word):
            current_word = decoded
            current_word_ids.append(i)
        elif decoded.startswith('##'):
            current_word += decoded[2:]
            current_word_ids.append(i)
        else:
            while clean(splitted_text[splitted_text_iterator]) != current_word:
                splitted_text_iterator += 1
            current_word = decoded
            splitted_text[splitted_text_iterator] = transform_token_ids(func_data, current_word_ids, splitted_text[splitted_text_iterator])
            current_word_ids = [i]  # start the id list for the word just begun
    return ' '.join(splitted_text)


def squad_pos_forward_func(inputs, token_type_ids=None, attention_mask=None, position=0):
    # Forward wrapper for LayerIntegratedGradients: returns the max logit.
    pred = predict(inputs.to(torch.long), token_type_ids.to(torch.long), attention_mask.to(torch.long))
    pred = pred[position]
    return pred.max(1).values


def predict(input_ids, token_type_ids, attention_mask):
    encoding = {
        'input_ids': input_ids.to(model.device),
        'token_type_ids': token_type_ids.to(model.device),
        'attention_mask': attention_mask.to(model.device)
    }
    outputs = model(**encoding)
    return outputs


# The original predict_press_release duplicated predict verbatim; kept as an alias.
predict_press_release = predict


def clean(text):
    # Keep only Cyrillic letters and single spaces.
    text = re.sub('[^а-яё ]', ' ', str(text).lower())
    text = re.sub(r" +", " ", text).strip()
    return text


def batch_tokenize(text):
    # Split the text into whitespace-aligned chunks of at most 512 BERT tokens.
    splitted_text = text.split()
    current_batch = splitted_text[0]
    batches = []
    for word in splitted_text[1:]:
        if len(tokenizer(current_batch + ' ' + word)['input_ids']) < 512:
            current_batch += ' ' + word
        else:
            batches.append({
                'text': current_batch
            })
            current_batch = word
    return batches + [{'text': current_batch}]


def token(text):
    return tokenizer(text['text'], padding=True, truncation=True, max_length=512, return_tensors='pt')


def tfidf_classify(data):
    # Chunk the text, classify each chunk, and majority-vote the label.
    if not len(data.data):
        return ''
    data = list(map(lambda x: x['text'], batch_tokenize(clean(data.data))))
    predicted_labels = []
    predicted_text = ""
    for item in data:
        predicted_labels.append(process_classify(item)['ans'])
    ans = Counter(predicted_labels).most_common()[0][0]
    score = len(list(filter(lambda x: x == ans, predicted_labels))) / len(predicted_labels)
    ans = id2label[ans]
    return {'answer': ans, 'text': predicted_text, 'metric': score, 'extendingLabels': list(map(lambda x: id2label[x], predicted_labels))}


def tfidf_embeddings(data):
    # Majority vote plus concatenated token-importance markup per chunk.
    if not len(data.data):
        return ''
    data = list(map(lambda x: x['text'], batch_tokenize(clean(data.data))))
    predicted_labels = []
    predicted_text = ""
    for item in data:
        ans = process_embedding(item)
        predicted_labels.append(ans['ans'])
        predicted_text += ans['text'] + ' '
    ans = Counter(predicted_labels).most_common()[0][0]
    return {'answer': id2label[ans], 'text': predicted_text}


def bert_classify(text_data):
    # Classify each 512-token chunk with BERT, average the softmax confidence
    # per label, and return the best-scoring label.
    data = clean(text_data)
    predicted = []
    text = ''
    batched = batch_tokenize(data)

    for b in batched:
        embs = token(b)
        answer = predict_press_release(
            embs['input_ids'], embs['token_type_ids'], embs['attention_mask']
        ).logits[0]
        answer = torch.softmax(answer, dim=-1).detach().numpy()
        answer_score = np.max(answer)
        predicted.append(
            [id2label[np.argmax(answer)],
             float(answer_score)]
        )
    ans = {'AA(RU)': [0]}  # default entry so the selection below always has a candidate
    for i in predicted:
        if i[0] not in ans.keys():
            ans.update({i[0]: [i[1]]})
        else:
            ans[i[0]].append(i[1])
    selected = 'AA(RU)'
    score = 0
    for candidate in ans.keys():
        if sum(ans[candidate]) / len(ans[candidate]) > score:
            score = sum(ans[candidate]) / len(ans[candidate])
            selected = candidate
        elif sum(ans[candidate]) / len(ans[candidate]) == score and len(ans[candidate]) > len(ans[selected]):
            selected = candidate
    return {
        'answer': selected,
        'text': text,
        'longAnswer': predicted,
        'metric': score
    }


def bert_embeddings(text_data):
    # BERT prediction plus Integrated-Gradients token highlighting.
    data = clean(text_data)
    predicted = []
    text = ''
    batched = batch_tokenize(data)
    for b in batched:
        embs = token(b)
        predicted.append(np.argmax(predict_press_release(embs['input_ids'], embs['token_type_ids'], embs['attention_mask']).logits.detach().numpy()[0]))
        # additional_forward_args must follow the forward signature:
        # (token_type_ids, attention_mask, position).
        attrs = lig.attribute(embs['input_ids'], additional_forward_args=(embs['token_type_ids'], embs['attention_mask'], 0))
        attrs = np.array([float(x.sum()) for x in attrs[0]])
        descr = get_description_interpreting(attrs)
        text += build_text(embs['input_ids'][0], descr, b['text']) + ' '
    return {'answer': id2label[Counter(predicted).most_common()[0][0]], 'text': text}


config = BertConfig.from_json_file("./akra_model/checkpoint/config.json")

model = AutoModelForSequenceClassification.from_pretrained(
    "./akra_model/checkpoint", config=config
)
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

lig = LayerIntegratedGradients(squad_pos_forward_func, model.bert.embeddings)

app = FastAPI()


class Predict(BaseModel):
    data: str


class ListPredict(BaseModel):
    data: list


@app.post('/predict')
def predict_(data: Predict):
    return bert_classify(data.data)


@app.post('/bert/process')
def predict_f(data: Predict):
    return bert_classify(data.data)


@app.get('/interpret')
def interpret():
    pass


@app.post('/tfidf/process')
def tfidf_res(data: Predict):
    return tfidf_classify(data)


@app.post('/tfidf/batch')
def tfidf_batch(data: ListPredict):
    res = []
    for item in data.data:
        res.append(tfidf_classify(Predict(data=item)))
    return res


@app.post('/bert/batch')
def bert_batch(data: ListPredict):
    res = []
    for item in data.data:  # iterate the payload list, not the model object
        res.append(bert_classify(item))
    return res


@app.post('/bert/describe')
def bert_describe(data: Predict):
    return bert_embeddings(data.data)


@app.post('/tfidf/describe')
def tfidf_describe(data: Predict):
    return tfidf_embeddings(data)


def get_nearest_service(data: Predict):
    # Vote over LaBSE + Annoy nearest neighbours, chunk by chunk.
    data = clean(data.data)
    batched = batch_tokenize(data)
    res = []
    scores = {}
    for key in id2label.values():
        scores.update({key: []})
    for batch in batched:
        features = list(get_nearest_value(get_embs(batch['text'])))
        features[1] /= 100  # rescale the vote share to [0, 1]
        scores[features[0]].append(features[1])
        res.append(
            {
                'text': batch['text'],
                'features': features
            }
        )
    mx = 0
    label = 'AA(RU)'
    for key in scores.keys():
        try:
            if (sum(scores[key]) / len(scores[key])) > mx:
                label = key
                mx = (sum(scores[key]) / len(scores[key]))
            if (sum(scores[key]) / len(scores[key])) == mx:
                if len(scores[key]) > len(scores[label]):
                    label = key
        except Exception:
            pass
    return {'detailed': res, 'metric': mx, 'answer': label}


@app.post('/nearest/nearest')
def process_text(data: Predict):
    return get_nearest_service(data)


@app.post('/catboost')
def catboost_process(data: Predict):
    # Run all three base models, then let the CatBoost meta-model combine them.
    tfidf = tfidf_classify(data)
    bert = bert_classify(data.data)
    nearest = get_nearest_service(data)

    inputs = [label2id[tfidf['answer']], tfidf['metric'], bert['metric'], label2id[bert['answer']], nearest['metric'], label2id[nearest['answer']]]
    catboost_answer = id2label[catboost.predict([inputs])[0][0]]
    return {
        'bert': bert,
        'tfidf': tfidf,
        'nearest': nearest,
        'total': catboost_answer
    }
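A usage sketch for the ensemble endpoint defined above, assuming the service runs on uvicorn's default host and port:

```python
import requests

payload = {"data": "Кредитный рейтинг обусловлен ..."}  # placeholder press-release text
result = requests.post("http://127.0.0.1:8000/catboost", json=payload).json()

# Each base model's verdict, then the stacked CatBoost decision.
print(result["bert"]["answer"], result["tfidf"]["answer"], result["nearest"]["answer"])
print("ensemble:", result["total"])
```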
nearest-search-train.ipynb (new file, 177 lines)
@ -0,0 +1,177 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "Requirement already satisfied: annoy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (1.17.3)\n",
      "Requirement already satisfied: sentence_transformers in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (2.2.2)\n",
      "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (4.32.1)\n",
      "Requirement already satisfied: tqdm in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (4.66.1)\n",
      "Requirement already satisfied: torch>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (2.0.1)\n",
      "Requirement already satisfied: torchvision in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (0.15.2)\n",
      "Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (1.25.2)\n",
      "Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (1.3.0)\n",
      "Requirement already satisfied: scipy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (1.11.2)\n",
      "Requirement already satisfied: nltk in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (3.8.1)\n",
      "Requirement already satisfied: sentencepiece in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (0.1.99)\n",
      "Requirement already satisfied: huggingface-hub>=0.4.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sentence_transformers) (0.16.4)\n",
      "Requirement already satisfied: filelock in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.3)\n",
      "Requirement already satisfied: fsspec in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n",
      "Requirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.31.0)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.7.1)\n",
      "Requirement already satisfied: packaging>=20.9 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (23.1)\n",
      "Requirement already satisfied: sympy in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from torch>=1.6.0->sentence_transformers) (1.12)\n",
      "Requirement already satisfied: networkx in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from torch>=1.6.0->sentence_transformers) (3.1)\n",
      "Requirement already satisfied: jinja2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from torch>=1.6.0->sentence_transformers) (3.1.2)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.8.8)\n",
      "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.13.3)\n",
      "Requirement already satisfied: safetensors>=0.3.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.3.3)\n",
      "Requirement already satisfied: click in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from nltk->sentence_transformers) (8.1.7)\n",
      "Requirement already satisfied: joblib in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from nltk->sentence_transformers) (1.3.2)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from scikit-learn->sentence_transformers) (3.2.0)\n",
      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from torchvision->sentence_transformers) (10.0.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from jinja2->torch>=1.6.0->sentence_transformers) (2.1.3)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.2.0)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2023.7.22)\n",
      "Requirement already satisfied: mpmath>=0.19 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from sympy->torch>=1.6.0->sentence_transformers) (1.3.0)\n"
     ]
    }
   ],
   "source": [
    "!pip3 install annoy sentence_transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "model = SentenceTransformer('sentence-transformers/LaBSE')\n",
    "\n",
    "def get_embs(text):\n",
    "    embeddings = model.encode(text)\n",
    "    return embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('n_labels.pickle', 'rb') as file:\n",
    "    n_labels = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('annoy_labels.pickle', 'rb') as file:\n",
    "    labels = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embs = []\n",
    "for text in n_labels:\n",
    "    embs.append(get_embs(text))\n",
    "    if len(embs) % 50 == 0:\n",
    "        print(len(embs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from annoy import AnnoyIndex\n",
    "index = AnnoyIndex(768, 'angular')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i, emb in enumerate(embs):\n",
    "    index.add_item(i, emb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index.build(20)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
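The notebook builds the Annoy index in memory, while `inference.py` loads it from disk as `nearest.annoy`. A minimal sketch of saving the built index and running a query; the 20-neighbour fetch mirrors the inference code, and the query text is a placeholder:

```python
# Persist the built index so inference.py can load it with index.load('nearest.annoy').
index.save("nearest.annoy")

# Embed a new text with the same LaBSE encoder and fetch its 20 nearest items.
query = get_embs("Кредитный рейтинг обусловлен ...")  # placeholder text
ids, dists = index.get_nns_by_vector(query, 20, include_distances=True)
for i, d in zip(ids, dists):
    print(labels[i], round(d, 3))
```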
req.txt (new file, 123 lines)
@ -0,0 +1,123 @@
accelerate==0.22.0
aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.5.0
annoy==1.17.3
anyio==3.7.1
appnope==0.1.3
asttokens==2.2.1
async-timeout==4.0.3
attrs==23.1.0
backcall==0.2.0
beautifulsoup4==4.12.2
captum==0.6.0
certifi==2023.7.22
charset-normalizer==3.2.0
click==8.1.7
comm==0.1.4
contourpy==1.1.0
cycler==0.11.0
datasets==2.14.4
debugpy==1.6.7.post1
decorator==5.1.1
dill==0.3.7
et-xmlfile==1.1.0
evaluate==0.4.0
exceptiongroup==1.1.3
executing==1.2.0
fastapi==0.103.1
filelock==3.12.3
fonttools==4.42.1
frozenlist==1.4.0
fsspec==2023.6.0
h11==0.14.0
httptools==0.6.0
huggingface-hub==0.16.4
icu==0.0.1
idna==3.4
ipykernel==6.25.1
ipython==7.34.0
jedi==0.19.0
Jinja2==3.1.2
joblib==1.3.2
jupyter_client==8.3.1
jupyter_core==5.3.1
kiwisolver==1.4.5
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline==0.1.6
Morfessor==2.0.6
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
nest-asyncio==1.5.7
networkx==3.1
nltk==3.8.1
numpy==1.25.2
openpyxl==3.1.2
outcome==1.2.0
packaging==23.1
pandas==2.1.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==10.0.0
platformdirs==3.10.0
polyglot==16.7.4
prompt-toolkit==3.0.39
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==13.0.0
pydantic==2.3.0
pydantic_core==2.6.3
Pygments==2.16.1
pymystem3==0.2.0
pyparsing==3.0.9
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
PyYAML==6.0.1
pyzmq==25.1.1
regex==2023.8.8
requests==2.31.0
responses==0.18.0
safetensors==0.3.3
scikit-learn==1.3.0
scipy==1.11.2
seaborn==0.12.2
selenium==4.12.0
sentence-transformers==2.2.2
sentencepiece==0.1.99
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.5
stack-data==0.6.2
starlette==0.27.0
sympy==1.12
threadpoolctl==3.2.0
tokenizers==0.13.3
torch==2.0.1
torchvision==0.15.2
tornado==6.3.3
tqdm==4.66.1
traitlets==5.9.0
transformers==4.32.1
transformers-interpret==0.10.0
transliterate==1.10.2
trio==0.22.2
trio-websocket==0.10.3
typing_extensions==4.7.1
tzdata==2023.3
undetected-chromedriver==3.5.3
urllib3==2.0.4
uvicorn==0.23.2
uvloop==0.17.0
watchfiles==0.20.0
wcwidth==0.2.6
websockets==11.0.3
wsproto==1.2.0
xxhash==3.3.0
yarl==1.9.2
208
tfidf-train.ipynb
Normal file
@@ -0,0 +1,208 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pymystem3 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (0.2.0)\n",
"Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (2.1.0)\n",
"Collecting sklearn\n",
" Downloading sklearn-0.0.post9.tar.gz (3.6 kB)\n",
" Installing build dependencies ... \u001b[?25ldone\n",
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25hRequirement already satisfied: requests in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pymystem3) (2.31.0)\n",
"Requirement already satisfied: numpy>=1.23.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (1.25.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2023.3)\n",
"Requirement already satisfied: tzdata>=2022.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from pandas) (2023.3)\n",
"Requirement already satisfied: six>=1.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->pymystem3) (3.2.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->pymystem3) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->pymystem3) (2.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from requests->pymystem3) (2023.7.22)\n",
"Building wheels for collected packages: sklearn\n",
" Building wheel for sklearn (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for sklearn: filename=sklearn-0.0.post9-py3-none-any.whl size=2952 sha256=de085da5188e0680130af47d37bf6a7803a4dbec121af8adf834ac3d03747231\n",
" Stored in directory: /Users/ilya/Library/Caches/pip/wheels/ef/63/d1/f1671e1e93b7ef4d35df483f9b2485e6dd21941da9a92296fb\n",
"Successfully built sklearn\n",
"Installing collected packages: sklearn\n",
"Successfully installed sklearn-0.0.post9\n"
]
}
],
"source": [
"!pip3 install pymystem3 pandas sklearn"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"with open('dss.pickle', 'rb') as file:\n",
" data = pickle.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"from pymystem3 import Mystem\n",
"\n",
"def clean(text):\n",
" text = re.sub('[^а-яё ]', ' ', str(text).lower())\n",
" text = re.sub(r\" +\", \" \", text).strip()\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"texts = list(map(lambda x: clean(x['text']), data))\n",
"labels = list(map(lambda x: x['label'], data))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"mstm = Mystem()\n",
"normalized = [''.join(mstm.lemmatize(t)[:-1]) for t in texts]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.DataFrame()\n",
"df['text'] = texts\n",
"df['norm'] = normalized\n",
"df['label'] = labels"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"train, test = train_test_split(df, test_size=0.1, random_state=42)\n",
"valid, test = train_test_split(test, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"model_tfidf = TfidfVectorizer(max_features=5000)\n",
"\n",
"train_tfidf = model_tfidf.fit_transform(train['norm'].values)\n",
"valid_tfidf = model_tfidf.transform(valid['norm'].values)\n",
"test_tfidf = model_tfidf.transform(test['norm'].values)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=42)</pre></div></div></div></div></div>"
],
"text/plain": [
"RandomForestClassifier(random_state=42)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"cls = RandomForestClassifier(random_state=42)\n",
"cls.fit(train_tfidf, train['label'].values)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.629399514876379"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics import f1_score\n",
"predictions = cls.predict(test_tfidf)\n",
"f1_score(predictions, test['label'].values, average='weighted')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
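,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal inference sketch, assuming the fitted `model_tfidf` and `cls` from the cells above; `predict_label` is a hypothetical helper, not part of the original notebook, and it repeats the train-time `clean` + `Mystem` preprocessing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helper: preprocess a raw text exactly as at train time,\n",
"# then classify it with the fitted TF-IDF + random forest.\n",
"def predict_label(raw_text):\n",
" norm = ''.join(mstm.lemmatize(clean(raw_text))[:-1])\n",
" features = model_tfidf.transform([norm])\n",
" return cls.predict(features)[0]"
]
}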
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}