From 0db047f50073b789e49addba77a3304332e0d843 Mon Sep 17 00:00:00 2001 From: Alexander Karpov <51019345+Alexander-D-Karpov@users.noreply.github.com> Date: Sun, 25 Jun 2023 08:46:14 +0300 Subject: [PATCH] Add files via upload --- ml/main.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 ml/main.ipynb diff --git a/ml/main.ipynb b/ml/main.ipynb new file mode 100644 index 0000000..dc5a8ff --- /dev/null +++ b/ml/main.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1qL6GhkjsjAgigfWs93Wssu-SUOsydEcE","authorship_tag":"ABX9TyOntCFm1VmRrGxQvsiDvlXi"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"id":"i0qW91AYZzrw"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["%cd /content/drive/MyDrive/AI/match_naming/\n","!ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jSQ27f-pZ1MS","executionInfo":{"status":"ok","timestamp":1687651393746,"user_tz":180,"elapsed":509,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"22cf9c02-98c0-46c1-b45d-efc157b90875"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/AI/match_naming\n","catboost_info data labeled\t mapping.json\tpython-poppler\n","checkpoints files main.ipynb mapping.pkl\ttrain.zip\n"]}]},{"cell_type":"code","source":["!pip install pdfminer.six"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"32w-7Y0_dtA1","executionInfo":{"status":"ok","timestamp":1687651413397,"user_tz":180,"elapsed":13218,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"d4a3c980-c76d-445a-eff4-2cf8c655a8b3"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Collecting pdfminer.six\n"," Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (2.0.12)\n","Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six) (41.0.1)\n","Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six) (1.15.1)\n","Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six) (2.21)\n","Installing collected packages: pdfminer.six\n","Successfully installed pdfminer.six-20221105\n"]}]},{"cell_type":"code","source":["import re\n","!pip install Levenshtein -qqq\n","from Levenshtein import distance\n","\n","def ngrams(text, n):\n"," words = re.findall(r'\\b\\w+\\b', text)\n"," return [' '.join(words[i:i+n]) for i in range(len(words)-(n-1))]\n","\n","def get_similar_strings(target_string, text, n, max_distance):\n"," # Create n-grams of the text\n"," text_ngrams = ngrams(text, n)\n","\n"," similar_strings = []\n"," for ngram in text_ngrams:\n"," # Calculate the Levenshtein distance\n"," lev_distance = distance(target_string, ngram)\n"," if lev_distance <= max_distance:\n"," similar_strings.append((ngram, lev_distance))\n","\n"," return similar_strings\n","\n","# Testing\n","target_string = \"Project X\"\n","text = \"This document pertains to project X. The information in this document is relevant to projects such as Project x, Project Y, and Project Z. However, please note that the data may not be accurate for projectx or project Y.\"\n","n = len(target_string.split())\n","max_distance = 2\n","similar_strings = get_similar_strings(target_string, text, n, max_distance)\n","print(similar_strings)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"r1G4qlX1MYk9","executionInfo":{"status":"ok","timestamp":1687589959350,"user_tz":180,"elapsed":10102,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"c7f96071-2529-4fb1-e4de-9caad67ff187"},"execution_count":4,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m172.5/172.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m45.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h[('project X', 1), ('Project x', 1), ('Project Y', 1), ('Project Z', 1), ('project Y', 2)]\n"]}]},{"cell_type":"code","source":["# 1. Название может быть в 1-2 строчках\n","# 2. Может быть в << >> | Но в кавычках может быть внутри названия названия\n","# 3. Всегда жирным\n","# 4. Titles can be images so we can OCR them\n","\n","# Мб учитывать длину строки\n","# Мб достать font size"],"metadata":{"id":"NhkgWKpzog8m"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Image instead of titile counter = 2"],"metadata":{"id":"LIhuWtEDsRcL"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Title in several lines counter = 3 | So we can join them by font size (but need to be accurate)"],"metadata":{"id":"PXYGB7QGsjyF"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Title may incldue Этап = 2"],"metadata":{"id":"Xg9YQ-8Rte4_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Whole file is a scan = 1"],"metadata":{"id":"60QKCEA7vo8E"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from pdfminer.high_level import extract_pages\n","import glob\n","files = glob.glob('data/project1/*.pdf')\n","files[0]\n","\n","for file in files:\n"," print(file)\n"," for page_layout in extract_pages(file):\n"," for element in page_layout:\n"," print(element)\n"," break\n"," print('\\n'*5)"],"metadata":{"id":"uUzMwhAeofa6"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# from pdfminer.pdfparser import PDFParser\n","# from pdfminer.pdfdocument import PDFDocument\n","# from pdfminer.pdftypes import PDFObjectNotFound\n","\n","\n","\n","# def extract(file_name):\n","# pages = []\n","# result = []\n","# fp = open(file_name, 'rb')\n","# parser = PDFParser(fp)\n","# doc = PDFDocument(parser, \"\")\n","# visited = set()\n","# for xref in doc.xrefs:\n","# # print('xref ', xref)\n","# for objid in xref.get_objids():\n","# if objid in visited: continue\n","# visited.add(objid)\n","\n","# obj = doc.getobj(objid)\n","# if obj is None: continue\n","# if isinstance(obj, dict):\n","# # 'Type' is PDFObjRef type\n","# if 'Type' in obj and obj['Type'].name == 'Page':\n","# pages.append(objid)\n","# elif 'C' in obj:\n","# result.append(obj['Rect'])\n","# return result\n","# extract(files[0])\n","\n","# for file in files:\n","# print(file)\n","# rects = extract(file)\n","# for rect in rects:\n","# extracted_text = extract_text_by_coordinates(file,rect,)\n","# print(extracted_text)\n","\n","# print()\n","\n","# from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter\n","# from pdfminer.converter import PDFPageAggregator\n","# from pdfminer.layout import LAParams, LTTextBox, LTTextBoxHorizontal\n","# from pdfminer.pdfpage import PDFPage\n","\n","# def extract_text_by_coordinates(pdf_path, coordinates):\n","# # Coordinates of the rectangle\n","# x1, y1, x2, y2 = coordinates\n","\n","# # Open the pdf file\n","# fp = open(pdf_path, 'rb')\n","\n","# # Create a PDF resource manager object that stores shared resources.\n","# rsrcmgr = PDFResourceManager()\n","\n","# # Create a PDF device object.\n","# device = PDFPageAggregator(rsrcmgr, laparams=LAParams())\n","\n","# # Create a PDF interpreter object.\n","# interpreter = PDFPageInterpreter(rsrcmgr, device)\n","\n","# # Process each page contained in the document.\n","# for page in PDFPage.get_pages(fp):\n","# interpreter.process_page(page)\n","\n","# # Obtain the LTPage object for the page.\n","# layout = device.get_result()\n","\n","# extracted_text = \"\"\n","# for element in layout:\n","# # print(element)\n","# if isinstance(element, LTTextBox) or isinstance(element, LTTextBoxHorizontal):\n","# x, y, text = element.bbox[0], element.bbox[1], element.get_text()\n","# t = 0.05\n","# if x1 - t <= x <= x2 + t and y1 - t <= y <= y2 + t:\n","# extracted_text += text\n","# break\n","\n","# return extracted_text\n","\n","# # Test the function with a PDF file and coordinates\n","# extracted_text = extract_text_by_coordinates(files[0], [97.20097, 533.4017, 540.5599, 565.0175],)\n","# print(extracted_text)\n"],"metadata":{"id":"AYOXmPk4NUda"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["def levenshtein_distance(str1, str2):\n"," size_x = len(str1) + 1\n"," size_y = len(str2) + 1\n"," matrix = [[0 for j in range(size_y)] for i in range(size_x)]\n","\n"," for x in range(size_x):\n"," matrix [x][0] = x\n"," for y in range(size_y):\n"," matrix [0][y] = y\n","\n"," for x in range(1, size_x):\n"," for y in range(1, size_y):\n"," if str1[x-1] == str2[y-1]:\n"," matrix [x][y] = min(\n"," matrix[x-1][y] + 1,\n"," matrix[x-1][y-1],\n"," matrix[x][y-1] + 1\n"," )\n"," else:\n"," matrix [x][y] = min(\n"," matrix[x-1][y] + 1,\n"," matrix[x-1][y-1] + 1,\n"," matrix[x][y-1] + 1\n"," )\n"," return matrix[size_x - 1][size_y - 1]\n","\n","# testing the function\n","print(levenshtein_distance(etalon, \"- «Капитальный ремонт автомобильной дороги Р-215 Астрахань-Кочубей- \\nКизляр-Махачкала, подъезд к г. Грозный на участке км 70+127- км 85+267, \\nЧеченская Республика»;\\n\".lower())) # should return 3"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":235},"id":"1Alk_XGJnhY5","executionInfo":{"status":"error","timestamp":1687571621744,"user_tz":180,"elapsed":19,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"076ef30f-9612-41ad-e309-d148d9baa21d"},"execution_count":3,"outputs":[{"output_type":"error","ename":"NameError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;31m# testing the function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlevenshtein_distance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0metalon\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"- «Капитальный ремонт автомобильной дороги Р-215 Астрахань-Кочубей- \\nКизляр-Махачкала, подъезд к г. Грозный на участке км 70+127- км 85+267, \\nЧеченская Республика»;\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# should return 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mNameError\u001b[0m: name 'etalon' is not defined"]}]},{"cell_type":"code","source":["# from pdfminer.high_level import extract_pages\n","# from pdfminer.layout import LTTextContainer\n","# from tqdm import tqdm\n","# import pandas as pd\n","\n","# import warnings\n","# warnings.filterwarnings(\"ignore\")\n","\n","# # etalon = \"Капитальный ремонт автомобильной дороги Р-215 Астрахань - Кочубей - Кизляр - Махачкала, подъезд к г. Грозный на участке км 70+127 – км 85+267, Чеченская Республика\".lower()\n","# import glob\n","# files = glob.glob('data/general/*.pdf')\n","\n","# corr_dict = {}\n","# for file in tqdm(files):\n","# print(file)\n","# etalon = input('Enter etalon: ').lower()\n","# corr_dict[file] = etalon\n","# print()\n","# # print('\\n'*5)\n","\n","# import pickle\n","# with open('mapping.pkl', 'wb') as f:\n","# pickle.dump(corr_dict, f)"],"metadata":{"id":"MAoVlLUkX9SY"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import pickle\n","with open('mapping.pkl', 'rb') as f:\n"," corr_dict = pickle.load(f)"],"metadata":{"id":"0wQaPG-IbJ2s","executionInfo":{"status":"ok","timestamp":1687571636402,"user_tz":180,"elapsed":432,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":4,"outputs":[]},{"cell_type":"code","source":["from pdfminer.high_level import extract_pages\n","from pdfminer.layout import LTTextContainer\n","from tqdm import tqdm\n","import pandas as pd\n","from pdfminer.layout import LTTextContainer, LTChar\n","import re\n","\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","\n","df = pd.DataFrame(columns=['text', 'label'])\n","# etalon = \"Капитальный ремонт автомобильной дороги Р-215 Астрахань - Кочубей - Кизляр - Махачкала, подъезд к г. Грозный на участке км 70+127 – км 85+267, Чеченская Республика\".lower()\n","import glob\n","# files = glob.glob('data/general/*.pdf')\n","# files[0]\n","files = [\n"," 'data/general/Раздел РИИ №3_ИЭИ-Фрагмент 1.00120-21_КМЭ-25202.pdf',\n"," 'data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-20_СГЭ-04495.pdf',\n"," 'data/general/3864 ИИ 4 К. Изм. 2.02608-19_ГГЭ-07898.pdf',\n"," 'data/general/Раздел ОД №4_Том 4.1.1_Изм.04.02480-21_ГГЭ-04081.pdf',\n"," 'data/general/Раздел ОД №4_Том 4.1.5.02480-21_ГГЭ-04081.pdf',\n"," 'data/general/896-0-ИЭИ-1 изм.2.01499-19_ГГЭ-17661.pdf',\n"," 'data/general/19-15_ОКЭФ_12-ПИР_ИПС-503-19-ИЭИ Изм.2.00484-21_РГЭ-25515.pdf',\n"," 'data/general/01_20_ГК-ПИР_2020-2001-ИЭИ_изм.2.00015-21_РГЭ-25886.pdf',\n"," 'data/general/Кузб-183267_КРАС–ИЭИ1_изм.6.00256-21_КРЭ-26756.pdf',\n"," 'data/general/02_2-15-20 ИЭИ_Изм.5.00158-21_ГГЭ-26767.pdf',\n"," 'data/general/ИЭИ Проектирование оснащения и оснащение ИТСОТБ Верхнего Самарского гидроузла.04955-22_ГГЭ-29629.pdf',\n"," 'data/general/Раздел ИИ 4-ИЭИ1-1-ИЗМ1.07970-22_ГГЭ-36050.pdf',\n"," 'data/general/4735.IV-ИЭИ_изм2.00341-20_ГГЭ-20462.pdf',\n"," 'data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf',\n"," 'data/general/Отчет ИИ №3_Книга №3_(ДПТПиИТ-2020-834-ИЭИ3)_Том 3.3 Фрагмент 1 Изм.1.07430-22_ГГЭ-35753.pdf',\n"," 'data/general/Раздел 1. Раздел II Часть 10_КНИ-ИЭИ изм.2.00127-20_ГГЭ-13675.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 5.7-1.5 Изм.2.00060-20_СПЭ-21903.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.1 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.2 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.3 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания Книга 1.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания Книга 2.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД №10-Том 10.138К-5-ИЭ-Книга2.00082-20_ГГЭ-06683.pdf',\n"," 'data/general/РС-19-1405-ИЭИ Инженерно-экологические изыскания часть 1 Изм.1 от 14.04.20.00008-20_СКЭ-22197.pdf',\n"," 'data/general/Том3ИЭИ.00321-20_СПЭ-22115.pdf',\n"," 'data/general/ТП 19.066.084-ИЭИ.00011-20_ЕГЭ-21673.pdf',\n"," 'data/general/Раздел ИИ № 4 Том 4.1.07125-22_ГГЭ-35572.pdf',\n"," 'data/general/Раздел ИИ № 4 Том 4.2.07125-22_ГГЭ-35572.pdf',\n"," 'data/general/Раздел ИИ ИЭИ4.1.06920-22_ГГЭ-35136.pdf',\n"," 'data/general/064-2011-Р.pdf']\n","# files = ['data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf']\n","corr_files = []\n","font_id = 0\n","for file in tqdm(files[::-1]):\n"," # print(file)\n"," etalon = corr_dict[file]\n"," # print(file)\n"," labels = []\n"," texts = []\n"," fonts = []\n"," squares = []\n"," ids = []\n","\n"," for page_layout in extract_pages(file):\n","\n"," for element in page_layout:\n"," if isinstance(element, LTTextContainer):\n"," is_bold = False\n"," text = element.get_text().replace('\\n', '').lower()\n"," if text.split() != [] and len(text) > 4:\n"," texts.append(text)\n"," # print(element)\n"," end = False\n"," for text_line in element:\n"," if end: break\n"," # print(character)\n"," for character in text_line:\n"," if isinstance(character, LTChar):\n"," # print(character.fontname.lower())\n"," if 'bold' in character.fontname.lower():\n"," fonts.append(1)\n"," elif 'italic' in character.fontname.lower():\n"," fonts.append(2)\n"," else:\n"," fonts.append(0)\n"," end = True\n"," break\n","\n"," if 1 not in labels:\n"," if levenshtein_distance(etalon, text) < len(etalon) / 5:\n"," labels.append(1)\n"," else:\n"," labels.append(0)\n"," else:\n"," labels.append(0)\n"," x1, y1, x2, y2 = element.bbox\n"," squares.append((int(x2) - int(x1)) * (int(y2) - int(y1)))\n","\n"," match = re.search(r'LTTextBoxHorizontal\\((\\d+)\\)', str(element))\n"," if match:\n"," id = int(match.group(1)) # extract the ID\n"," ids.append(id)\n"," break\n","\n"," if 1 not in labels:\n"," corr_files.append(file)\n"," temp_df = pd.DataFrame({\n"," 'text': texts,\n"," 'label': labels,\n"," 'font': fonts,\n"," 'file': file,\n"," 'squares': squares,\n"," 'ids': ids\n"," })\n"," # Append the temporary DataFrame to the master DataFrame\n"," df = df.append(temp_df, ignore_index=True)\n"," # print()\n"," # print('\\n'*5)\n",""],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MNqy0zzzzHIE","executionInfo":{"status":"ok","timestamp":1687574228154,"user_tz":180,"elapsed":11286,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"6aab29ba-3f46-4cb6-fa24-2a3e963ec3c8"},"execution_count":93,"outputs":[{"output_type":"stream","name":"stderr","text":["100%|██████████| 31/31 [00:10<00:00, 2.83it/s]\n"]}]},{"cell_type":"code","source":["from pdfminer.high_level import extract_pages\n","from pdfminer.layout import LTTextContainer, LTChar\n","\n","for file in corr_files:\n"," print(file)\n"," for page_layout in extract_pages(file):\n"," for element in page_layout:\n"," print(element)\n"," # if isinstance(element, LTTextContainer):\n"," # for text_line in element:\n"," # print(text_line)\n"," # for character in text_line:\n"," # if isinstance(character, LTChar):\n"," # print(character.fontname)\n"," # print(character.size)\n"," # break\n"," break\n"," print('\\n'*5)"],"metadata":{"id":"h2dfqBPvFv2Q"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"GwrozluYjNIx"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import json\n","from functools import partial\n","from typing import List\n","import glob\n","from tqdm import tqdm\n","\n","import matplotlib.pyplot as plt\n","import numpy as np\n","import pandas as pd\n","import seaborn as sns\n","!pip install Levenshtein -qqq\n","!pip install catboost transformers -qqq\n","from catboost import CatBoostClassifier, Pool\n","from catboost.utils import eval_metric\n","from scipy.spatial.distance import cosine, euclidean\n","from sklearn.metrics import pairwise_distances\n","from sklearn.model_selection import train_test_split"],"metadata":{"id":"gyFEMNwZIYzj","executionInfo":{"status":"ok","timestamp":1687590197791,"user_tz":180,"elapsed":35577,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"colab":{"base_uri":"https://localhost:8080/"},"outputId":"b2603b05-fbd6-4fd0-d085-8ee0f8882150"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.6/98.6 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m73.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m75.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m63.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}]},{"cell_type":"code","source":["df[df['file'] == 'data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-20_СГЭ-04495.pdf']\n","df[df['file'] == 'data/general/4735.IV-ИЭИ_изм2.00341-20_ГГЭ-20462.pdf']"],"metadata":{"id":"VtbCiv3yT9wo"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df.to_csv('labeled/up2.csv')"],"metadata":{"id":"H6IYOAPkiada","executionInfo":{"status":"ok","timestamp":1687575127311,"user_tz":180,"elapsed":254,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":136,"outputs":[]},{"cell_type":"code","source":["df = pd.read_csv('labeled/up2.csv').drop(columns=['Unnamed: 0'])"],"metadata":{"id":"Et3nX5JtkkKX","executionInfo":{"status":"ok","timestamp":1687577055077,"user_tz":180,"elapsed":5,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":177,"outputs":[]},{"cell_type":"code","source":["!ls"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"wK_1XaPAIfzt","executionInfo":{"status":"ok","timestamp":1687556746770,"user_tz":180,"elapsed":14,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"999d7136-8288-42c3-ca24-3110952e7b92"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["data files labeled main.ipynb python-poppler train.zip\n"]}]},{"cell_type":"code","source":["general = pd.read_csv('labeled/general.csv')\n","project1 = pd.read_csv('labeled/project1.csv')\n","project2 = pd.read_csv('labeled/project2.csv')"],"metadata":{"id":"t60cw5fnIeA_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df[df['font'] == 2].shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9BcbYYjZacbl","executionInfo":{"status":"ok","timestamp":1687577298817,"user_tz":180,"elapsed":308,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"4a081db9-894d-4f92-dc84-01acd1b729a1"},"execution_count":191,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(74, 12)"]},"metadata":{},"execution_count":191}]},{"cell_type":"code","source":["df[df['font'] == 1].shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bPS4YqBwaxXB","executionInfo":{"status":"ok","timestamp":1687577106704,"user_tz":180,"elapsed":5,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"fb305752-d173-4889-f685-e2cdd92969d6"},"execution_count":179,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(185, 12)"]},"metadata":{},"execution_count":179}]},{"cell_type":"code","source":["random_indices = df[df['label'] == 0][df['font'] == 0].sample(n=30).index\n","\n","# For those indices, set the value in the 'font' column to 1\n","df.loc[random_indices, 'font'] = 2"],"metadata":{"id":"oFdHMsRQaite","executionInfo":{"status":"ok","timestamp":1687577297370,"user_tz":180,"elapsed":250,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":190,"outputs":[]},{"cell_type":"code","source":["# df = pd.concat([general, project1, project2])\n","# df = pd.concat([general])\n","df['len_of_text'] = df['text'].apply(len)\n","# df['len_of_text'] = df['text'].apply(lambda x: len(x.split()))\n","\n","df['rank'] = df.groupby('file')['len_of_text'].rank(ascending=False, method='min').astype(int)\n","df['rank_squares'] = df.groupby('file')['squares'].rank(ascending=False, method='min').astype(int)\n","df['font'] = df['font'].astype(object) # Convert boolean to int for computation, True will be 1 and False will be 0\n","df['bold'] = (df['font'] == 1).astype(int)\n","df['bold_percentage'] = df.groupby('file')['font'].transform(lambda x: x.mean() * 100).astype(int)\n","# Assuming your DataFrame is named 'df' and the 'id' column is named 'ids'\n","df['id_percentage'] = df.groupby('file')['ids'].transform(lambda x: (x / x.max()) * 100).astype(int)\n","df['label'] = df['label'].astype(int)\n","\n","\n","\n","# 'id_percentage' column now represents the ratio of current id to max id for the same file, multiplied by 100\n","\n","# 'percentage_true' column now represents the percentage of true bools for each row that has the same file\n","\n","df.sample(10)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":878},"id":"-d45ky1DIsgS","executionInfo":{"status":"ok","timestamp":1687577606696,"user_tz":180,"elapsed":288,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"154b8cdf-21d1-459d-d0ee-634b95e332c9"},"execution_count":206,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" text label font \\\n","328 технический отчет по результатам инженерно-эко... 0 1.0 \n","144 часть 4. инженерно-экологические изыскания 0 1.0 \n","272 115/21 117/21 0 2.0 \n","60 дата 0 0.0 \n","263 том 0.4.1 0 1.0 \n","198 по результатам инженерно-экологических изыскан... 0 2.0 \n","32 проектно-изыскательский институт электрификаци... 0 2.0 \n","280 из № 1 503/1 503/2 2 3 503/3 0 2.0 \n","66 проектная документация 0 0.0 \n","292 москва 2018 0 1.0 \n","\n"," file squares ids \\\n","328 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 17550.0 1.0 \n","144 data/general/Раздел ПД N 1. Пояснительная запи... 6984.0 5.0 \n","272 data/general/01_20_ГК-ПИР_2020-2001-ИЭИ_изм.2.... 962.0 7.0 \n","60 data/general/Том3ИЭИ.00321-20_СПЭ-22115.pdf 160.0 8.0 \n","263 data/general/Кузб-183267_КРАС–ИЭИ1_изм.6.00256... 728.0 6.0 \n","198 data/general/Отчет ИИ №3_Книга №3_(ДПТПиИТ-202... 11016.0 4.0 \n","32 data/general/Раздел ИИ № 4 Том 4.1.07125-22_ГГ... 13545.0 0.0 \n","280 data/general/19-15_ОКЭФ_12-ПИР_ИПС-503-19-ИЭИ ... 3245.0 5.0 \n","66 data/general/РС-19-1405-ИЭИ Инженерно-экологич... 2884.0 4.0 \n","292 data/general/896-0-ИЭИ-1 изм.2.01499-19_ГГЭ-17... 936.0 8.0 \n","\n"," len_of_text rank rank_squares bold bold_percentage id_percentage \\\n","328 68 2 2 1 100 14 \n","144 43 4 4 1 100 41 \n","272 14 9 9 0 130 77 \n","60 5 9 10 0 60 88 \n","263 10 7 7 1 87 85 \n","198 87 4 3 0 88 50 \n","32 138 1 2 0 122 0 \n","280 33 3 5 0 111 62 \n","66 22 7 5 0 83 30 \n","292 12 7 7 1 88 100 \n","\n"," id_max \n","328 7 \n","144 12 \n","272 9 \n","60 9 \n","263 7 \n","198 8 \n","32 8 \n","280 8 \n","66 13 \n","292 8 "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textlabelfontfilesquaresidslen_of_textrankrank_squaresboldbold_percentageid_percentageid_max
328технический отчет по результатам инженерно-эко...01.0data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...17550.01.068221100147
144часть 4. инженерно-экологические изыскания01.0data/general/Раздел ПД N 1. Пояснительная запи...6984.05.0434411004112
272115/21 117/2102.0data/general/01_20_ГК-ПИР_2020-2001-ИЭИ_изм.2....962.07.014990130779
60дата00.0data/general/Том3ИЭИ.00321-20_СПЭ-22115.pdf160.08.05910060889
263том 0.4.101.0data/general/Кузб-183267_КРАС–ИЭИ1_изм.6.00256...728.06.01077187857
198по результатам инженерно-экологических изыскан...02.0data/general/Отчет ИИ №3_Книга №3_(ДПТПиИТ-202...11016.04.08743088508
32проектно-изыскательский институт электрификаци...02.0data/general/Раздел ИИ № 4 Том 4.1.07125-22_ГГ...13545.00.013812012208
280из № 1 503/1 503/2 2 3 503/302.0data/general/19-15_ОКЭФ_12-ПИР_ИПС-503-19-ИЭИ ...3245.05.033350111628
66проектная документация00.0data/general/РС-19-1405-ИЭИ Инженерно-экологич...2884.04.022750833013
292москва 201801.0data/general/896-0-ИЭИ-1 изм.2.01499-19_ГГЭ-17...936.08.012771881008
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":206}]},{"cell_type":"code","source":["train_df = df.drop(columns=[ 'text', 'squares', 'id_max', 'ids', 'len_of_text', 'bold'])\n","train_df.sample(5)"],"metadata":{"id":"_5RTc63nliju","executionInfo":{"status":"ok","timestamp":1687583220726,"user_tz":180,"elapsed":370,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"colab":{"base_uri":"https://localhost:8080/","height":337},"outputId":"cc9ef733-c018-4d1c-85fc-6d852fc8c61d"},"execution_count":248,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" label font file rank \\\n","134 0 2.0 data/general/Раздел ПД N 1. Пояснительная запи... 12 \n","43 0 0.0 data/general/ТП 19.066.084-ИЭИ.00011-20_ЕГЭ-21... 2 \n","310 0 2.0 data/general/Раздел ОД №4_Том 4.1.1_Изм.04.024... 12 \n","81 0 1.0 data/general/Раздел ПД №10-Том 10.138К-5-ИЭ-Кн... 8 \n","147 0 0.0 data/general/Раздел ПД N 1. Пояснительная запи... 12 \n","\n"," rank_squares bold_percentage id_percentage \n","134 12 115 66 \n","43 2 63 20 \n","310 14 100 63 \n","81 7 84 58 \n","147 12 100 66 "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
labelfontfilerankrank_squaresbold_percentageid_percentage
13402.0data/general/Раздел ПД N 1. Пояснительная запи...121211566
4300.0data/general/ТП 19.066.084-ИЭИ.00011-20_ЕГЭ-21...226320
31002.0data/general/Раздел ОД №4_Том 4.1.1_Изм.04.024...121410063
8101.0data/general/Раздел ПД №10-Том 10.138К-5-ИЭ-Кн...878458
14700.0data/general/Раздел ПД N 1. Пояснительная запи...121210066
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":248}]},{"cell_type":"code","source":["from sklearn.model_selection import StratifiedKFold, GroupKFold\n","\n","# kfold = GroupKFold(n_splits=5, shuffle=True, random_state=0xFACED)\n","kfold = GroupKFold(n_splits=5)\n","\n","metrics = []\n","models = []\n","fold_to_train = 0\n","for fold, (train_index, val_index) in enumerate(kfold.split(X=train_df,groups= train_df['file'])):\n"," # if fold == fold_to_train:\n"," X_train = train_df.iloc[train_index].drop(columns=['file'])\n"," X_val = train_df.iloc[val_index].drop(columns=['file'])\n"," print(X_train.columns, X_val.shape)\n","\n"," y_train = X_train[\"label\"]\n"," y_val = X_val[\"label\"]\n"," X_train = X_train.drop([\"label\"], axis=1)\n"," X_val = X_val.drop([\"label\"], axis=1)\n","\n"," train_pool = Pool(\n"," data=X_train,\n"," label=y_train,\n"," # text_features=text_features\n"," )\n"," eval_pool = Pool(\n"," data=X_val,\n"," label=y_val,\n"," # text_features=text_features\n"," )\n","\n"," model = CatBoostClassifier(eval_metric='F1')\n","\n"," model.fit(\n"," train_pool,\n"," eval_set=eval_pool,\n"," verbose=True,\n"," metric_period=100,\n"," use_best_model=True,\n"," early_stopping_rounds=200\n"," )\n","\n"," models.append(model)\n","\n"," # X_val[\"scores\"] = model.predict_proba(X_val[feats])[:, 1]\n","\n"," # y_val = features.iloc[val_index][[\"target\", \"variantid1\", \"variantid2\"]]\n"," # pr_auc_macro_metr = pr_auc_macro(\n"," # target_df=y_val,\n"," # predictions_df=X_val,\n"," # prec_level=0.75,\n"," # cat_column=\"cat3_grouped\"\n"," # )\n"," # print(f'Fold #{fold}: {pr_auc_macro_metr}')\n"," # metrics.append(pr_auc_macro_metr)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7n3EIEoIIc-v","outputId":"c9e162fc-c457-4127-9f4b-858bbac4f609","executionInfo":{"status":"ok","timestamp":1687583222238,"user_tz":180,"elapsed":1094,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":249,"outputs":[{"output_type":"stream","name":"stdout","text":["Index(['label', 'font', 'rank', 'rank_squares', 'bold_percentage',\n"," 'id_percentage'],\n"," dtype='object') (68, 6)\n","Learning rate set to 0.023016\n","0:\tlearn: 0.9545455\ttest: 0.9090909\tbest: 0.9090909 (0)\ttotal: 1.16ms\tremaining: 1.16s\n","100:\tlearn: 0.9545455\ttest: 0.9090909\tbest: 1.0000000 (2)\ttotal: 70.7ms\tremaining: 629ms\n"]},{"output_type":"stream","name":"stderr","text":["Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"]},{"output_type":"stream","name":"stdout","text":["200:\tlearn: 1.0000000\ttest: 0.9090909\tbest: 1.0000000 (2)\ttotal: 150ms\tremaining: 596ms\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 1\n","bestIteration = 2\n","\n","Shrink model to first 3 iterations.\n","Index(['label', 'font', 'rank', 'rank_squares', 'bold_percentage',\n"," 'id_percentage'],\n"," dtype='object') (73, 6)\n","Learning rate set to 0.022912\n","0:\tlearn: 0.9302326\ttest: 0.9230769\tbest: 0.9230769 (0)\ttotal: 846us\tremaining: 845ms\n","100:\tlearn: 0.9545455\ttest: 0.9230769\tbest: 1.0000000 (3)\ttotal: 77.1ms\tremaining: 686ms\n"]},{"output_type":"stream","name":"stderr","text":["Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"]},{"output_type":"stream","name":"stdout","text":["200:\tlearn: 1.0000000\ttest: 1.0000000\tbest: 1.0000000 (3)\ttotal: 157ms\tremaining: 626ms\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 1\n","bestIteration = 3\n","\n","Shrink model to first 4 iterations.\n","Index(['label', 'font', 'rank', 'rank_squares', 'bold_percentage',\n"," 'id_percentage'],\n"," dtype='object') (67, 6)\n","Learning rate set to 0.023037\n","0:\tlearn: 0.9565217\ttest: 0.8888889\tbest: 0.8888889 (0)\ttotal: 1.07ms\tremaining: 1.07s\n","100:\tlearn: 0.9787234\ttest: 0.8000000\tbest: 0.8888889 (0)\ttotal: 66.7ms\tremaining: 593ms\n"]},{"output_type":"stream","name":"stderr","text":["Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"]},{"output_type":"stream","name":"stdout","text":["200:\tlearn: 1.0000000\ttest: 0.8000000\tbest: 0.8888889 (0)\ttotal: 146ms\tremaining: 579ms\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 0.8888888889\n","bestIteration = 0\n","\n","Shrink model to first 1 iterations.\n","Index(['label', 'font', 'rank', 'rank_squares', 'bold_percentage',\n"," 'id_percentage'],\n"," dtype='object') (67, 6)\n","Learning rate set to 0.023037\n","0:\tlearn: 0.9545455\ttest: 0.9090909\tbest: 0.9090909 (0)\ttotal: 1.03ms\tremaining: 1.03s\n","100:\tlearn: 1.0000000\ttest: 0.9090909\tbest: 0.9090909 (0)\ttotal: 64.6ms\tremaining: 575ms\n"]},{"output_type":"stream","name":"stderr","text":["Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"]},{"output_type":"stream","name":"stdout","text":["200:\tlearn: 1.0000000\ttest: 0.9090909\tbest: 0.9090909 (0)\ttotal: 133ms\tremaining: 527ms\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 0.9090909091\n","bestIteration = 0\n","\n","Shrink model to first 1 iterations.\n","Index(['label', 'font', 'rank', 'rank_squares', 'bold_percentage',\n"," 'id_percentage'],\n"," dtype='object') (67, 6)\n","Learning rate set to 0.023037\n","0:\tlearn: 0.9302326\ttest: 1.0000000\tbest: 1.0000000 (0)\ttotal: 948us\tremaining: 947ms\n","100:\tlearn: 0.9545455\ttest: 1.0000000\tbest: 1.0000000 (0)\ttotal: 70.9ms\tremaining: 631ms\n"]},{"output_type":"stream","name":"stderr","text":["Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.\n"]},{"output_type":"stream","name":"stdout","text":["200:\tlearn: 1.0000000\ttest: 1.0000000\tbest: 1.0000000 (0)\ttotal: 144ms\tremaining: 572ms\n","Stopped by overfitting detector (200 iterations wait)\n","\n","bestTest = 1\n","bestIteration = 0\n","\n","Shrink model to first 1 iterations.\n"]}]},{"cell_type":"code","source":["train_df.iloc[val_index].file.unique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oMuy4qlPonDV","executionInfo":{"status":"ok","timestamp":1687563842571,"user_tz":180,"elapsed":5,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"1a6d28e2-1fd2-4af8-85e1-4f2402a2144a"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['data/general/ТП 19.066.084-ИЭИ.00011-20_ЕГЭ-21673.pdf',\n"," 'data/general/РС-19-1405-ИЭИ Инженерно-экологические изыскания часть 1 Изм.1 от 14.04.20.00008-20_СКЭ-22197.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания Книга 2.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.3 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/4735.IV-ИЭИ_изм2.00341-20_ГГЭ-20462.pdf',\n"," 'data/general/19-15_ОКЭФ_12-ПИР_ИПС-503-19-ИЭИ Изм.2.00484-21_РГЭ-25515.pdf',\n"," 'data/general/Раздел РИИ №3_ИЭИ-Фрагмент 1.00120-21_КМЭ-25202.pdf'],\n"," dtype=object)"]},"metadata":{},"execution_count":164}]},{"cell_type":"code","source":["# data/general/Раздел ИИ ИЭИ4.1.06920-22_ГГЭ-35136.pdf\n","# data/general/Раздел 1. Раздел II Часть 10_КНИ-ИЭИ изм.2.00127-20_ГГЭ-13675.pdf\n","# data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf\n","# data/general/Раздел ИИ 4-ИЭИ1-1-ИЗМ1.07970-22_ГГЭ-36050.pdf\n"],"metadata":{"id":"S6z8hnhcPNWk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["df[df['file'] == path]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":113},"id":"tuh_E8uARiJ-","executionInfo":{"status":"ok","timestamp":1687574423881,"user_tz":180,"elapsed":289,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"abc9cad1-7a83-4f17-e164-b27d30026265"},"execution_count":102,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Empty DataFrame\n","Columns: [text, label, font, file, squares, ids, len_of_text, rank, rank_squares, bold, bold_percentage, id_percentage]\n","Index: []"],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textlabelfontfilesquaresidslen_of_textrankrank_squaresboldbold_percentageid_percentage
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":102}]},{"cell_type":"code","source":["files = ['data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf']\n","# files = df.file.unique()\n","for path in tqdm(files):\n"," temp_df = df[df['file'] == path].reset_index(drop=True)\n"," test_pool = Pool(\n"," data=temp_df.drop(columns=['label', 'file', 'text', 'squares', 'ids', 'len_of_text', 'bold']),\n"," # text_features=text_features\n"," )\n"," preds = []\n"," for model in models:\n"," preds.append(model.predict_proba(test_pool)[:, 1])\n"," temp_df['pred'] = np.mean(preds, axis=0)\n"," if (temp_df.iloc[temp_df['pred'].idxmax(), 1]) == 0:\n"," print(path)\n"," # print(temp_df.iloc[temp_df['pred'].idxmax()])\n"," # break"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Aya5civBkcXn","executionInfo":{"status":"ok","timestamp":1687577359797,"user_tz":180,"elapsed":257,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"5a446ca3-eb92-4ea5-8246-2dffd0955045"},"execution_count":196,"outputs":[{"output_type":"stream","name":"stderr","text":["100%|██████████| 1/1 [00:00<00:00, 129.65it/s]"]},{"output_type":"stream","name":"stdout","text":["data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf\n"]},{"output_type":"stream","name":"stderr","text":["\n"]}]},{"cell_type":"code","source":["# italic, отчет, stupid, bad pdf reading"],"metadata":{"id":"cHQO2tfkQPnB","executionInfo":{"status":"ok","timestamp":1687577360203,"user_tz":180,"elapsed":6,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":197,"outputs":[]},{"cell_type":"code","source":["temp_df"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":975},"id":"s-aXhgnwPTor","executionInfo":{"status":"ok","timestamp":1687577360529,"user_tz":180,"elapsed":9,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"05b94f06-f1a9-494e-fa46-8f063fd235f4"},"execution_count":198,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" text label font \\\n","0 общество с ограниченной ответственностью научн... 0 2.0 \n","1 (ооо нпп «изыскатель») 0 0.0 \n","2 регистрационный номер в реестре членов сро №13... 0 2.0 \n","3 заказчик – ооо «пермтотинефть» 0 0.0 \n","4 «строительство и обустройство скважин куста № ... 1 1.0 \n","5 технический отчет по результатам инженерно-гео... 0 2.0 \n","6 приложение н.2 к разделу 1 «пояснительная запи... 0 0.0 \n","7 2019/021-игди2 0 0.0 \n","8 том 1.2 0 2.0 \n","9 изм. № док. подп. 0 0.0 \n","10 дата 0 0.0 \n","11 березники, 2019 0 1.0 \n","\n"," file squares ids \\\n","0 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 12512.0 0.0 \n","1 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 2296.0 1.0 \n","2 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 5754.0 2.0 \n","3 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 3122.0 3.0 \n","4 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 13410.0 4.0 \n","5 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 20069.0 5.0 \n","6 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 10488.0 6.0 \n","7 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 1484.0 7.0 \n","8 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 686.0 8.0 \n","9 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 1332.0 9.0 \n","10 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 324.0 10.0 \n","11 data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf 1428.0 11.0 \n","\n"," len_of_text rank rank_squares bold bold_percentage id_percentage \\\n","0 91 2 3 0 33 0 \n","1 23 7 7 0 33 9 \n","2 64 5 5 0 33 18 \n","3 31 6 6 0 33 27 \n","4 79 3 2 1 33 36 \n","5 94 1 1 0 33 45 \n","6 70 4 4 0 33 54 \n","7 15 9 8 0 33 63 \n","8 8 11 11 0 33 72 \n","9 20 8 10 0 33 81 \n","10 5 12 12 0 33 90 \n","11 15 9 9 1 33 100 \n","\n"," pred \n","0 0.422350 \n","1 0.424133 \n","2 0.429294 \n","3 0.420076 \n","4 0.429754 \n","5 0.481646 \n","6 0.419379 \n","7 0.405708 \n","8 0.419875 \n","9 0.405732 \n","10 0.411569 \n","11 0.409215 "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textlabelfontfilesquaresidslen_of_textrankrank_squaresboldbold_percentageid_percentagepred
0общество с ограниченной ответственностью научн...02.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf12512.00.0912303300.422350
1(ооо нпп «изыскатель»)00.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf2296.01.0237703390.424133
2регистрационный номер в реестре членов сро №13...02.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf5754.02.06455033180.429294
3заказчик – ооо «пермтотинефть»00.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf3122.03.03166033270.420076
4«строительство и обустройство скважин куста № ...11.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf13410.04.07932133360.429754
5технический отчет по результатам инженерно-гео...02.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf20069.05.09411033450.481646
6приложение н.2 к разделу 1 «пояснительная запи...00.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf10488.06.07044033540.419379
72019/021-игди200.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf1484.07.01598033630.405708
8том 1.202.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf686.08.081111033720.419875
9изм. № док. подп.00.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf1332.09.020810033810.405732
10дата00.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf324.010.051212033900.411569
11березники, 201901.0data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf1428.011.015991331000.409215
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":198}]},{"cell_type":"code","source":["files = [\n"," 'data/general/Раздел РИИ №3_ИЭИ-Фрагмент 1.00120-21_КМЭ-25202.pdf',\n"," 'data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-20_СГЭ-04495.pdf',\n"," 'data/general/3864 ИИ 4 К. Изм. 2.02608-19_ГГЭ-07898.pdf',\n"," 'data/general/Раздел ОД №4_Том 4.1.1_Изм.04.02480-21_ГГЭ-04081.pdf',\n"," 'data/general/Раздел ОД №4_Том 4.1.5.02480-21_ГГЭ-04081.pdf',\n"," 'data/general/896-0-ИЭИ-1 изм.2.01499-19_ГГЭ-17661.pdf',\n"," 'data/general/19-15_ОКЭФ_12-ПИР_ИПС-503-19-ИЭИ Изм.2.00484-21_РГЭ-25515.pdf',\n"," 'data/general/01_20_ГК-ПИР_2020-2001-ИЭИ_изм.2.00015-21_РГЭ-25886.pdf',\n"," 'data/general/Кузб-183267_КРАС–ИЭИ1_изм.6.00256-21_КРЭ-26756.pdf',\n"," 'data/general/02_2-15-20 ИЭИ_Изм.5.00158-21_ГГЭ-26767.pdf',\n"," 'data/general/ИЭИ Проектирование оснащения и оснащение ИТСОТБ Верхнего Самарского гидроузла.04955-22_ГГЭ-29629.pdf',\n"," 'data/general/Раздел ИИ 4-ИЭИ1-1-ИЗМ1.07970-22_ГГЭ-36050.pdf',\n"," 'data/general/4735.IV-ИЭИ_изм2.00341-20_ГГЭ-20462.pdf',\n"," 'data/general/╨в╨╛╨╝ 1.2.00001-21_╨Х╨У╨н-26404.pdf',\n"," 'data/general/Отчет ИИ №3_Книга №3_(ДПТПиИТ-2020-834-ИЭИ3)_Том 3.3 Фрагмент 1 Изм.1.07430-22_ГГЭ-35753.pdf',\n"," 'data/general/Раздел 1. Раздел II Часть 10_КНИ-ИЭИ изм.2.00127-20_ГГЭ-13675.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 5.7-1.5 Изм.2.00060-20_СПЭ-21903.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.1 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.2 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N 1. Пояснительная записка Том 6.2-1.4.3 Изм.1.00072-20_СПЭ-21948.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания Книга 1.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания Книга 2.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД N1 Экологические изыскания.00052-20_СПЭ-18764.pdf',\n"," 'data/general/Раздел ПД №10-Том 10.138К-5-ИЭ-Книга2.00082-20_ГГЭ-06683.pdf',\n"," 'data/general/РС-19-1405-ИЭИ Инженерно-экологические изыскания часть 1 Изм.1 от 14.04.20.00008-20_СКЭ-22197.pdf',\n"," 'data/general/Том3ИЭИ.00321-20_СПЭ-22115.pdf',\n"," 'data/general/ТП 19.066.084-ИЭИ.00011-20_ЕГЭ-21673.pdf',\n"," 'data/general/Раздел ИИ № 4 Том 4.1.07125-22_ГГЭ-35572.pdf',\n"," 'data/general/Раздел ИИ № 4 Том 4.2.07125-22_ГГЭ-35572.pdf',\n"," 'data/general/Раздел ИИ ИЭИ4.1.06920-22_ГГЭ-35136.pdf',\n"," 'data/general/064-2011-Р.pdf']"],"metadata":{"id":"MM-HrUjUSagU","executionInfo":{"status":"ok","timestamp":1687657609906,"user_tz":180,"elapsed":7,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":92,"outputs":[]},{"cell_type":"code","source":["import json\n","from functools import partial\n","from typing import List\n","import glob\n","from tqdm import tqdm\n","\n","import matplotlib.pyplot as plt\n","import numpy as np\n","import pandas as pd\n","import seaborn as sns\n","!pip install Levenshtein -qqq\n","!pip install catboost transformers -qqq\n","from catboost import CatBoostClassifier, Pool\n","from catboost.utils import eval_metric\n","from scipy.spatial.distance import cosine, euclidean\n","from sklearn.metrics import pairwise_distances\n","from sklearn.model_selection import train_test_split"],"metadata":{"id":"DWU2XBJGfimg","executionInfo":{"status":"ok","timestamp":1687657622065,"user_tz":180,"elapsed":10133,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":93,"outputs":[]},{"cell_type":"code","source":["from pdfminer.high_level import extract_pages\n","from pdfminer.layout import LTTextContainer\n","from tqdm import tqdm\n","import pandas as pd\n","from pdfminer.layout import LTTextContainer, LTChar\n","import re\n","import glob\n","import warnings\n","warnings.filterwarnings(\"ignore\")\n","\n","\n","def extract_test_features(file):\n"," print(file)\n"," texts = []\n"," fonts = []\n"," squares = []\n"," ids = []\n"," coords = []\n"," relative_coords = []\n"," for page_layout in extract_pages(file):\n"," _x1, _y1, _x2, _y2 = page_layout.bbox\n"," for i, element in enumerate(page_layout):\n"," if isinstance(element, LTTextContainer):\n"," text = element.get_text().replace('\\n', '')\n","\n"," if '(cid:' in text:\n"," return [], 'Files encoding is corrupted'\n","\n"," if text.split() != [] and len(text) > 4:\n"," texts.append(text)\n","\n"," end = False\n"," for text_line in element:\n"," if end: break\n"," for character in text_line:\n"," if isinstance(character, LTChar):\n"," if 'bold' in character.fontname.lower():\n"," fonts.append(1)\n"," elif 'italic' in character.fontname.lower():\n"," fonts.append(2)\n"," else:\n"," fonts.append(0)\n"," end = True\n"," break\n","\n"," x1, y1, x2, y2 = element.bbox\n"," coords.append([x1, y1, x2, y2])\n"," relative_coords.append([x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2])\n","\n"," squares.append((int(x2) - int(x1)) * (int(y2) - int(y1)))\n","\n"," match = re.search(r'LTTextBoxHorizontal\\((\\d+)\\)', str(element))\n"," if match:\n"," id = int(match.group(1))\n"," ids.append(id)\n"," break\n","\n"," if not texts: return [], 'File consists of scans'\n"," if len(texts) < 3: return [], 'Main page consists of scan'\n"," if len(texts) > 25: return [], 'Seems like incorrect reading'\n","\n"," test_df = pd.DataFrame({\n"," 'text': texts,\n"," 'font': fonts,\n"," 'file': file,\n"," 'squares': squares,\n"," 'ids': ids,\n"," 'coords': coords,\n"," 'relative_coords': relative_coords\n"," })\n"," return test_df, True\n","\n","\n","def create_test_features(df):\n"," df['len_of_text'] = df['text'].apply(len)\n"," # df['len_of_text'] = df['text'].apply(lambda x: len(x.split()))\n","\n"," df['rank'] = df.groupby('file')['len_of_text'].rank(ascending=False, method='min').astype(int)\n"," df['rank_squares'] = df.groupby('file')['squares'].rank(ascending=False, method='min').astype(int)\n"," df['font'] = df['font'].astype(object) # Convert boolean to int for computation, True will be 1 and False will be 0\n"," df['bold'] = (df['font'] == 1).astype(int)\n"," df['bold_percentage'] = df.groupby('file')['font'].transform(lambda x: x.mean() * 100).astype(int)\n"," df['id_percentage'] = df.groupby('file')['ids'].transform(lambda x: (x / x.max()) * 100).astype(int)\n","\n"," return df\n","\n","# test_df, result = extract_test_features(corrupted_files[5])\n","# if test_df:\n","# test_df = create_test_features(test_df)\n","# else:\n","# print(result)"],"metadata":{"id":"vOyVq3qPfqcH","executionInfo":{"status":"ok","timestamp":1687657769453,"user_tz":180,"elapsed":879,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":95,"outputs":[]},{"cell_type":"code","source":["def extract_test_features(file):\n"," print(file)\n"," texts = []\n"," fonts = []\n"," squares = []\n"," ids = []\n"," coords = []\n"," relative_coords = []\n"," for page_layout in extract_pages(file):\n"," _x1, _y1, _x2, _y2 = page_layout.bbox\n"," for i, element in enumerate(page_layout):\n"," if isinstance(element, LTTextContainer):\n"," text = element.get_text().replace('\\n', '')\n","\n"," if '(cid:' in text:\n"," return [], 'Files encoding is corrupted'\n","\n"," if text.split() != [] and len(text) > 4:\n"," texts.append(text)\n","\n"," end = False\n"," for text_line in element:\n"," if end: break\n"," for character in text_line:\n"," if isinstance(character, LTChar):\n"," if 'bold' in character.fontname.lower():\n"," fonts.append(1)\n"," elif 'italic' in character.fontname.lower():\n"," fonts.append(2)\n"," else:\n"," fonts.append(0)\n"," end = True\n"," break\n","\n"," x1, y1, x2, y2 = element.bbox\n"," coords.append([x1, y1, x2, y2])\n"," relative_coords.append([x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2])\n","\n"," squares.append((int(x2) - int(x1)) * (int(y2) - int(y1)))\n","\n"," match = re.search(r'LTTextBoxHorizontal\\((\\d+)\\)', str(element))\n"," if match:\n"," id = int(match.group(1))\n"," ids.append(id)\n"," break\n","\n"," if not texts: return [], 'File consists of scans'\n"," if len(texts) < 3: return [], 'Main page consists of scan'\n"," if len(texts) > 25: return [], 'Seems like incorrect reading'\n","\n"," test_df = pd.DataFrame({\n"," 'text': texts,\n"," 'font': fonts,\n"," 'file': file,\n"," 'squares': squares,\n"," 'ids': ids,\n"," 'coords': coords,\n"," 'relative_coords': relative_coords\n"," })\n"," return test_df, True"],"metadata":{"id":"hHp9xUJeSFWs","executionInfo":{"status":"ok","timestamp":1687657770658,"user_tz":180,"elapsed":6,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":96,"outputs":[]},{"cell_type":"code","source":["for page_layout in extract_pages(files[1]):\n"," # for element in page_layout:\n"," break"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"18jcj7ndELdl","executionInfo":{"status":"ok","timestamp":1687638070091,"user_tz":180,"elapsed":6,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"cebf6e06-efd0-4e6b-babb-459ae54385c6"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["(0, 0, 595, 842)\n"]}]},{"cell_type":"code","source":["columns_to_use = ['font', 'rank', 'rank_squares', 'bold_percentage', 'id_percentage']\n","checkpoint_name = 'checkpoints/models.pkl'"],"metadata":{"id":"O6BmYBH7zVr6","executionInfo":{"status":"ok","timestamp":1687651638402,"user_tz":180,"elapsed":527,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["import pickle\n","\n","def inference_models(checkpoint_name, test_df):\n"," with open(checkpoint_name, 'rb') as f:\n"," models = pickle.load(f)\n","\n"," test_pool = Pool(\n"," data=test_df[columns_to_use]\n"," )\n"," preds = []\n"," for model in models:\n"," preds.append(model.predict_proba(test_pool)[:, 1])\n"," test_df['pred'] = np.mean(preds, axis=0)\n"," return test_df, test_df.loc[test_df['pred'].idxmax(), 'text'].strip()"],"metadata":{"id":"7Ch86Gv6x8xE","executionInfo":{"status":"ok","timestamp":1687651638818,"user_tz":180,"elapsed":3,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":8,"outputs":[]},{"cell_type":"code","source":["import time"],"metadata":{"id":"xBHEKBnQZPNO","executionInfo":{"status":"ok","timestamp":1687651641936,"user_tz":180,"elapsed":4,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["import Levenshtein as lev\n","\n","def calculate_distances(target, list_of_strings):\n"," target_length = len(target.split())\n"," distances = {}\n","\n"," for string in list_of_strings:\n"," string_words = string.split()\n","\n"," # If the string has at least as many words as the target\n"," if len(string_words) >= target_length:\n"," for i in range(len(string_words) - target_length + 1):\n"," window = \" \".join(string_words[i:i+target_length])\n"," distance = lev.distance(target, window)\n","\n"," # Save the distance for this window\n"," distances[window] = (distance / len(target)) * 100\n"," else:\n"," # If the string has fewer words than the target\n"," distance = lev.distance(target, string)\n"," distances[string] = (distance / len(target)) * 100\n","\n"," return distances"],"metadata":{"id":"Qo0NzCJ0Ts5O","executionInfo":{"status":"ok","timestamp":1687654223901,"user_tz":180,"elapsed":4,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":79,"outputs":[]},{"cell_type":"code","source":["import Levenshtein as lev\n","import math\n","\n","def calculate_distances(target, list_of_strings, stride_fraction=1/4, threshold=0.3):\n"," target_length = len(target.split())\n"," min_distances = []\n","\n"," stride_length = math.ceil(target_length * stride_fraction)\n","\n"," for string in list_of_strings:\n"," all_distances = []\n"," string_words = string.split()\n","\n"," if len(string_words) > target_length:\n"," i = 0\n"," while i < len(string_words) - target_length + 1:\n"," window = \" \".join(string_words[i:i+target_length])\n","\n"," distance = lev.distance(target, window) / len(target)\n"," if distance < threshold:\n"," for j in range(max(i-target_length, 0), min(i + target_length, len(string_words) - target_length + 1)):\n"," detailed_window = \" \".join(string_words[j:j+target_length])\n"," detailed_distance = lev.distance(target, detailed_window) / len(target)\n","\n"," all_distances.append((detailed_window, detailed_distance * 100))\n"," i += stride_length\n"," else:\n"," i += stride_length\n"," else:\n"," dist = lev.distance(target, string) / len(target)\n"," all_distances.append((string, dist * 100))\n","\n"," if all_distances:\n"," min_window = min(all_distances, key=lambda x: x[1])\n"," min_distances.append([min_window[0], min_window[1]])\n","\n"," return min_distances\n"],"metadata":{"id":"0XxZ1fakDx8_"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import Levenshtein as lev\n","import math\n","\n","def calculate_distances(target, list_of_strings, stride_fraction=1/4, threshold=0.3):\n"," target_length = len(target.split())\n"," min_distances = []\n","\n"," # Calculate stride length based on fraction of target length\n"," stride_length = math.ceil(target_length * stride_fraction)\n","\n"," for string in list_of_strings:\n"," all_distances = []\n"," string_words = string.split()\n","\n"," # If the string has at least as many words as the target\n"," # print(len(string_words), target_length)\n"," if len(string_words) > target_length:\n"," i = 0\n"," while i < len(string_words) - target_length + 1:\n"," window = \" \".join(string_words[i:i+target_length])\n","\n"," distance = lev.distance(target, window) / len(target)\n"," # print(window, distance)\n"," if distance < threshold:\n"," for j in range(max(i-target_length, 0), min(i + target_length, len(string_words) - target_length + 1)):\n"," detailed_window = \" \".join(string_words[j:j+target_length])\n"," detailed_distance = lev.distance(target, detailed_window) / len(target)\n","\n"," all_distances.append((detailed_window, detailed_distance * 100))\n"," # print(all_distances)\n"," i += stride_length # move with larger stride\n"," else:\n"," i += stride_length # move with larger stride\n"," else:\n"," dist = lev.distance(target, string) / len(target)\n"," # print(dist)\n"," all_distances.append((string, dist * 100))\n","\n"," # # Find minimum distances among every three consecutive windows\n"," # for i in range(0, maxlen(all_distances)-target_length+1, target_length):\n"," # windows = all_distances[i:i+target_length]\n"," # print(all_distances)\n"," if all_distances: # ensure windows is not empty\n"," min_window = min(all_distances, key=lambda x: x[1])\n"," min_distances.append([min_window[0], min_window[1]]) # update the dictionary with the minimum distance window\n","\n"," return min_distances\n"],"metadata":{"id":"cXn6WTV5Hg_5","executionInfo":{"status":"ok","timestamp":1687658569830,"user_tz":180,"elapsed":1275,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":97,"outputs":[]},{"cell_type":"code","source":["!pip install spacy fuzzywuzzy python-Levenshtein\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7n9RAiMLSkdi","executionInfo":{"status":"ok","timestamp":1687658878683,"user_tz":180,"elapsed":7351,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"a8738505-0c47-4764-e7dd-0fd14af458fa"},"execution_count":99,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","Requirement already satisfied: spacy in /usr/local/lib/python3.10/dist-packages (3.5.3)\n","Collecting fuzzywuzzy\n"," Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n","Collecting python-Levenshtein\n"," Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)\n","Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy) (3.0.12)\n","Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (1.0.4)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (1.0.9)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy) (2.0.7)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy) (3.0.8)\n","Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /usr/local/lib/python3.10/dist-packages (from spacy) (8.1.10)\n","Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy) (1.1.2)\n","Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy) (2.4.6)\n","Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy) (2.0.8)\n","Requirement already satisfied: typer<0.8.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (0.7.0)\n","Requirement already satisfied: pathy>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (0.10.2)\n","Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from spacy) (6.3.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (4.65.0)\n","Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (1.22.4)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (2.27.1)\n","Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from spacy) (1.10.9)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from spacy) (3.1.2)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy) (67.7.2)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (23.1)\n","Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy) (3.3.0)\n","Requirement already satisfied: Levenshtein==0.21.1 in /usr/local/lib/python3.10/dist-packages (from python-Levenshtein) (0.21.1)\n","Requirement already satisfied: rapidfuzz<4.0.0,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from Levenshtein==0.21.1->python-Levenshtein) (3.1.1)\n","Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy) (4.6.3)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.16)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2023.5.7)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n","Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.2.0,>=8.1.8->spacy) (0.7.9)\n","Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.2.0,>=8.1.8->spacy) (0.0.4)\n","Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer<0.8.0,>=0.3.0->spacy) (8.1.3)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->spacy) (2.1.3)\n","Installing collected packages: fuzzywuzzy, python-Levenshtein\n","Successfully installed fuzzywuzzy-0.18.0 python-Levenshtein-0.21.1\n"]}]},{"cell_type":"code","source":["import spacy\n","import re\n","from fuzzywuzzy import fuzz\n","import Levenshtein\n","\n","nlp = spacy.load('ru_core_news_sm')\n","\n","import re\n","\n","def remove_special_characters(string):\n"," return re.sub(r'\\W', '', string)\n","\n","\n","def difference_type(word1, word2):\n"," if word1 == word2:\n"," return None # слова совпадают, пропускаем их\n","\n"," if remove_special_characters(word1) == remove_special_characters(word2):\n"," return \"Missing special character\"\n","\n"," if word1.lower() == word2.lower():\n"," return \"Just different capital\"\n","\n"," if word1.isdigit() and word2.isdigit():\n"," if abs(int(word1) - int(word2)) < 10:\n"," return 'Numerical difference'\n"," else:\n"," return 'Large numerical difference'\n"," token1 = nlp(word1)[0]\n"," token2 = nlp(word2)[0]\n"," if token1.lemma_ == token2.lemma_:\n"," if token1.pos_ != token2.pos_:\n"," return 'Different word forms'\n"," else:\n"," return 'Same lemma, different inflection'\n","\n"," if Levenshtein.distance(word1, word2) <= 2:\n"," return 'Possible spelling mistake'\n","\n"," # Using Spacy for lemmatization and part-of-speech tagging\n","\n","\n"," return 'Different words'\n","\n","def compare_strings(str1, str2):\n"," words1 = str1.split()\n"," words2 = str2.split()\n","\n"," # Find words that are only in one of the strings\n"," words1_only = set(words1) - set(words2)\n"," words2_only = set(words2) - set(words1)\n","\n"," differences = []\n"," mn_len = min(len(words1), len(words2))\n"," for i in range(mn_len):\n"," difference = difference_type(words1[i], words2[i])\n"," differences.append((words1[i], words2[i], difference))\n","\n"," for word in words1_only:\n"," differences.append((word, None, 'Word only in first string'))\n","\n"," for word in words2_only:\n"," differences.append((None, word, 'Word only in second string'))\n","\n"," return differences\n","\n","str1 = \"я играю 123\"\n","str2 = \"ты играешь 124 и что-то еще\"\n","print(compare_strings(str1, str2))\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yz25qP6dTtqU","executionInfo":{"status":"ok","timestamp":1687661552567,"user_tz":180,"elapsed":2895,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"74047468-d9af-4996-e296-62bc3b13a835"},"execution_count":126,"outputs":[{"output_type":"stream","name":"stdout","text":["[('я', 'ты', 'Possible spelling mistake'), ('играю', 'играешь', 'Same lemma, different inflection'), ('123', '124', 'Numerical difference'), ('123', None, 'Word only in first string'), ('я', None, 'Word only in first string'), ('играю', None, 'Word only in first string'), (None, 'ты', 'Word only in second string'), (None, 'и', 'Word only in second string'), (None, 'еще', 'Word only in second string'), (None, 'играешь', 'Word only in second string'), (None, '124', 'Word only in second string'), (None, 'что-то', 'Word only in second string')]\n"]}]},{"cell_type":"code","source":["s1 = '«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»'\n","s2 = '«Реконструкции платины Горельского гидроузла на р Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»'\n","compare_strings(s1, s2)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Yvq86-wMVU3-","executionInfo":{"status":"ok","timestamp":1687661553626,"user_tz":180,"elapsed":5,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"e8b867b1-bdd1-4001-dc88-67f4d63a37a8"},"execution_count":127,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[('«Реконструкция', '«Реконструкции', 'Same lemma, different inflection'),\n"," ('плотины', 'платины', 'Possible spelling mistake'),\n"," ('Горельского', 'Горельского', None),\n"," ('гидроузла', 'гидроузла', None),\n"," ('на', 'на', None),\n"," ('р.', 'р', 'Missing special character'),\n"," ('Цна,', 'Цна,', None),\n"," ('с.', 'с.', None),\n"," ('Горелое,', 'Горелое,', None),\n"," ('Тамбовский', 'Тамбовский', None),\n"," ('район,', 'район,', None),\n"," ('Тамбовская', 'Тамбовская', None),\n"," ('область.', 'область.', None),\n"," ('(Корректировка', '(Корректировка', None),\n"," ('в', 'в', None),\n"," ('том', 'том', None),\n"," ('числе', 'числе', None),\n"," ('проектные', 'проектные', None),\n"," ('и', 'и', None),\n"," ('изыскательские', 'изыскательские', None),\n"," ('работы)»', 'работы)»', None),\n"," ('«Реконструкция', None, 'Word only in first string'),\n"," ('р.', None, 'Word only in first string'),\n"," ('плотины', None, 'Word only in first string'),\n"," (None, 'платины', 'Word only in second string'),\n"," (None, '«Реконструкции', 'Word only in second string'),\n"," (None, 'р', 'Word only in second string')]"]},"metadata":{},"execution_count":127}]},{"cell_type":"code","source":["!python -m spacy download ru_core_news_sm\n"],"metadata":{"id":"N-fVD1elUHNv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["s1 = 'Комплекс работ в ходе инженерно-экологических изысканий для объекта «Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)» для ГФУ «Цнинская шлюзованная система» выполнен в июне 2019 г. Основанием для производства работ является договор № 19/119-И, техническое задание на выполнение инженерно-экологических изысканий. '"],"metadata":{"id":"MMt2Eaxnf8Xb","executionInfo":{"status":"ok","timestamp":1687647996060,"user_tz":180,"elapsed":4,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":172,"outputs":[]},{"cell_type":"code","source":["s2 = '«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)» разработаны в соответствии с требованиями СП 47.13330.2016 «Инженерные изыскания для строительства. Основные положения». Актуализированная редакция [16]; СП 11-102-97 «Инженерно-экологические изыскания для строительства» [17], Постановлением Правительства РФ от 26.12.2014 г. № 1521 [7] и на основании других действующих законодательных актов и нормативных документов. '"],"metadata":{"id":"Tunqxdo-kCii","executionInfo":{"status":"ok","timestamp":1687647994639,"user_tz":180,"elapsed":427,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":171,"outputs":[]},{"cell_type":"code","source":["distances = calculate_distances(target, [s1, s2])\n","distances"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-CNSIcF9fnKQ","executionInfo":{"status":"ok","timestamp":1687648739154,"user_tz":180,"elapsed":1094,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"46c48a38-e26b-4433-e267-d126687ca2fd"},"execution_count":186,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[['«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»',\n"," 9.392265193370166],\n"," ['«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»',\n"," 9.392265193370166]]"]},"metadata":{},"execution_count":186}]},{"cell_type":"code","source":["for window, distance in distances:\n"," if distance / len(target) < 0.2:\n"," print(i)\n"," print(window)\n"," print(distance)\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"NrLgF3TOgreO","executionInfo":{"status":"ok","timestamp":1687648493217,"user_tz":180,"elapsed":588,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"9828fcc1-67e0-4b95-f7e6-94a06c500167"},"execution_count":183,"outputs":[{"output_type":"stream","name":"stdout","text":["8\n","«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»\n","9.392265193370166\n","8\n","«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»\n","9.392265193370166\n"]}]},{"cell_type":"code","source":["target"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":53},"id":"BCrS4zOl-Jkb","executionInfo":{"status":"ok","timestamp":1687653332584,"user_tz":180,"elapsed":19,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"722de45c-045a-4b55-b87c-7442b3413eea"},"execution_count":44,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'«Строительство скоростной автомобильной дороги Москва – Санкт-Петербург на участке км 58 – км 684 (с последующей эксплуатацией на платной основе), 3 этап км 149 – км 208»'"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":44}]},{"cell_type":"code","source":["target"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":53},"id":"DaABrdaJmWDJ","executionInfo":{"status":"ok","timestamp":1687646989959,"user_tz":180,"elapsed":543,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"347860bd-579f-4515-c553-04c32e9bc277"},"execution_count":155,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные Корректировкаи изыскательские работы)'"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":155}]},{"cell_type":"code","source":["import re\n","\n","def replace_multiple_spaces(text):\n"," return re.sub(' +', ' ', text)\n","\n","target = replace_multiple_spaces(target)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":201},"id":"BNyXIbo3l_my","executionInfo":{"status":"error","timestamp":1687651653699,"user_tz":180,"elapsed":457,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"9321a589-05be-49a5-c185-279aa58d9ba4"},"execution_count":11,"outputs":[{"output_type":"error","ename":"NameError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' +'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m' '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mtarget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreplace_multiple_spaces\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mNameError\u001b[0m: name 'target' is not defined"]}]},{"cell_type":"code","source":["for file in files[2:3]:\n"," changed = []\n"," s = time.time()\n"," test_df, result = extract_test_features(file)\n","\n"," if isinstance(test_df, pd.DataFrame):\n"," test_df = create_test_features(test_df)\n"," else:\n"," print(result)\n","\n"," result_df, target = inference_models(checkpoint_name, test_df)\n"," target = replace_multiple_spaces(target)\n"," # target = '«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»'\n"," from pdfminer.high_level import extract_pages\n"," result = []\n"," for i, page_layout in enumerate(tqdm(extract_pages(file))):\n"," # if i != 1: continue\n"," _x1, _y1, _x2, _y2 = page_layout.bbox\n"," texts = []\n"," relative_coords = []\n"," for element in page_layout:\n"," if isinstance(element, LTTextContainer):\n"," # print(element.get_text())\n"," x1, y1, x2, y2 = element.bbox\n"," text = replace_multiple_spaces(element.get_text().replace('\\n', ' ').strip())\n"," if len(text) > 3:\n"," relative_coords.append(([x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]))\n"," texts.append(text)\n","\n"," distances = calculate_distances(target, texts)\n","\n"," for window, distance in distances.items():\n"," if distance / len(target) < 0.2:\n","\n"," for j in range(len(texts)):\n"," if window in texts[j]:\n"," # changed.append((texts[j], texts[j].replace(window, target)))\n"," rel_coord = relative_coords[j]\n"," break\n","\n"," result.append((i, window, rel_coord, distance / len(target) ))\n"," # break\n"," print(file)\n"," print('It took ', int(time.time() - s), 's')\n"," print()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VEWS1T1yuztM","executionInfo":{"status":"ok","timestamp":1687654518591,"user_tz":180,"elapsed":16244,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"f92cc9ce-3f30-4420-a872-b3bce4403e56"},"execution_count":87,"outputs":[{"output_type":"stream","name":"stdout","text":["data/general/3864 ИИ 4 К. Изм. 2.02608-19_ГГЭ-07898.pdf\n"]},{"output_type":"stream","name":"stderr","text":["134it [00:15, 8.61it/s]"]},{"output_type":"stream","name":"stdout","text":["data/general/3864 ИИ 4 К. Изм. 2.02608-19_ГГЭ-07898.pdf\n","It took 16 s\n","\n"]},{"output_type":"stream","name":"stderr","text":["\n"]}]},{"cell_type":"code","source":["changed[0][0]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":166},"id":"bYeyWCTu0UMB","executionInfo":{"status":"error","timestamp":1687654646043,"user_tz":180,"elapsed":416,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"58366224-e068-438e-be0f-6d28ee6b67ef"},"execution_count":89,"outputs":[{"output_type":"error","ename":"IndexError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mchanged\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mIndexError\u001b[0m: list index out of range"]}]},{"cell_type":"code","source":["changed[0][1]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":166},"id":"6mC0H8epAZ09","executionInfo":{"status":"error","timestamp":1687654647574,"user_tz":180,"elapsed":11,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"8f0545ac-46b1-48bd-8a49-d91fbe2b92c3"},"execution_count":90,"outputs":[{"output_type":"error","ename":"IndexError","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mchanged\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mIndexError\u001b[0m: list index out of range"]}]},{"cell_type":"code","source":["target"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":70},"id":"LbjYE8lrAu-j","executionInfo":{"status":"ok","timestamp":1687654648290,"user_tz":180,"elapsed":14,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"1f3b3c4d-5ba7-4c08-9773-e33ae8584af8"},"execution_count":91,"outputs":[{"output_type":"execute_result","data":{"text/plain":["'«Строительство и реконструкция автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска Строительство транспортной развязки на км 974 автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска, Самарская область. Корректировка».'"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":91}]},{"cell_type":"code","source":["result"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Yk6_xrXB9Mcc","executionInfo":{"status":"ok","timestamp":1687654518591,"user_tz":180,"elapsed":25,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"d38ef021-6431-438c-c8bd-6bc7d1fb419c"},"execution_count":88,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[(0,\n"," '«Строительство и реконструкция автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска Строительство транспортной развязки на км 974 автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска, Самарская область. Корректировка».',\n"," [0.1580729691594436,\n"," 0.5093864500190042,\n"," 0.764091177853927,\n"," 0.12813568985176751],\n"," 0.0),\n"," (1,\n"," '«Строительство и реконструкция автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска Строительство транспортной развязки на км 974 автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска, Самарская область. Корректировка».',\n"," [0.1580729691594436,\n"," 0.5273454009882174,\n"," 0.7639272996035744,\n"," 0.12827822120866605],\n"," 0.0),\n"," (131,\n"," '«Строительство и реконструкция автомобильной дороги М-5 «Урал» -от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска Строительство транспортной развязки на км 974 автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска, Самарская область. Корректировка»',\n"," [0.6767086481947943,\n"," 0.16495075762470307,\n"," 0.2475508699244331,\n"," 0.04184093529691212],\n"," 0.002313850710352168),\n"," (132,\n"," '«Строительство и реконструкция автомобильной дороги М-5 «Урал» -от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска Строительство транспортной развязки на км 974 автомобильной дороги М-5 «Урал» - от Москвы через Рязань, Пензу, Самару, Уфу до Челябинска, Самарская область. Корректировка»',\n"," [0.6755499580184718,\n"," 0.16673223030878856,\n"," 0.2475508699244332,\n"," 0.0417696763895487],\n"," 0.002313850710352168)]"]},"metadata":{},"execution_count":88}]},{"cell_type":"code","source":["len(result)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"NWnTD4yLTtGo","executionInfo":{"status":"ok","timestamp":1687654482804,"user_tz":180,"elapsed":369,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"ef5c51de-9b12-4062-a6ac-afca781907a5"},"execution_count":84,"outputs":[{"output_type":"execute_result","data":{"text/plain":["1"]},"metadata":{},"execution_count":84}]},{"cell_type":"code","source":["target = '«Реконструкция плотины Горельского гидроузла на р. Цна, с. Горелое, Тамбовский район, Тамбовская область. (Корректировка в том числе проектные и изыскательские работы)»'"],"metadata":{"id":"0KbDBxAct006","executionInfo":{"status":"ok","timestamp":1687648998922,"user_tz":180,"elapsed":541,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}}},"execution_count":191,"outputs":[]},{"cell_type":"code","source":["result_df"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":819},"id":"xkA5ly6lga5M","executionInfo":{"status":"ok","timestamp":1687641488609,"user_tz":180,"elapsed":7,"user":{"displayName":"Nikita Dilman","userId":"06419702465665096398"}},"outputId":"ae6f701b-9e2a-4cd1-ebe0-e344847a8ae1"},"execution_count":59,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" text font \\\n","0 Реконструкция плотины Горельского гидроузла н... 1 \n","1 Технический отчет по результатам инженерно-эко... 1 \n","2 19/119-И-ИЭИ 1 \n","3 Том 3 1 \n","4 35-20 0 \n","5 2921.04.2020 0 \n","6 2019 1 \n","\n"," file squares ids \\\n","0 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 56172 0 \n","1 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 17550 1 \n","2 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 2250 2 \n","3 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 918 3 \n","4 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 286 5 \n","5 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 600 6 \n","6 data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2... 738 7 \n","\n"," coords \\\n","0 [95.2800342756015, 532.9594999589843, 548.0043... \n","1 [98.3995963384576, 446.1314000011057, 548.9640... \n","2 [260.76, 367.8500000007152, 385.98260631509044... \n","3 [295.438, 283.1300000007152, 346.7355896315769... \n","4 [414.9242006103037, 128.87693584416243, 440.58... \n","5 [507.7139916413332, 125.8110002762967, 557.213... \n","6 [318.6, 24.410600000715224, 359.0983811252344,... \n","\n"," relative_coords pages len_of_text \\\n","0 [0.16013451138756554, 0.6329685272672023, 0.76... 0 182 \n","1 [0.16537747283774387, 0.5298472684098643, 0.75... 1 68 \n","2 [0.4382521008403361, 0.43687648456141953, 0.21... 2 14 \n","3 [0.49653445378151256, 0.33625890736426983, 0.0... 3 6 \n","4 [0.697351597664376, 0.15306049387667747, 0.043... 5 5 \n","5 [0.853300826287955, 0.14941924023313147, 0.083... 6 12 \n","6 [0.5354621848739496, 0.028991211402274612, 0.0... 7 5 \n","\n"," rank rank_squares bold bold_percentage id_percentage pred \n","0 1 1 1 71 0 0.529246 \n","1 2 2 1 71 14 0.457483 \n","2 3 3 1 71 28 0.457483 \n","3 5 4 1 71 42 0.463904 \n","4 6 7 0 71 71 0.447048 \n","5 4 6 0 71 85 0.460314 \n","6 6 5 1 71 100 0.447048 "],"text/html":["\n","
\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
textfontfilesquaresidscoordsrelative_coordspageslen_of_textrankrank_squaresboldbold_percentageid_percentagepred
0Реконструкция плотины Горельского гидроузла н...1data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...561720[95.2800342756015, 532.9594999589843, 548.0043...[0.16013451138756554, 0.6329685272672023, 0.76...01821117100.529246
1Технический отчет по результатам инженерно-эко...1data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...175501[98.3995963384576, 446.1314000011057, 548.9640...[0.16537747283774387, 0.5298472684098643, 0.75...16822171140.457483
219/119-И-ИЭИ1data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...22502[260.76, 367.8500000007152, 385.98260631509044...[0.4382521008403361, 0.43687648456141953, 0.21...21433171280.457483
3Том 31data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...9183[295.438, 283.1300000007152, 346.7355896315769...[0.49653445378151256, 0.33625890736426983, 0.0...3654171420.463904
435-200data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...2865[414.9242006103037, 128.87693584416243, 440.58...[0.697351597664376, 0.15306049387667747, 0.043...5567071710.447048
52921.04.20200data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...6006[507.7139916413332, 125.8110002762967, 557.213...[0.853300826287955, 0.14941924023313147, 0.083...61246071850.460314
620191data/general/Отчет ИЭИ 19-119-И-3 изм1.00020-2...7387[318.6, 24.410600000715224, 359.0983811252344,...[0.5354621848739496, 0.028991211402274612, 0.0...75651711000.447048
\n","
\n"," \n"," \n"," \n","\n"," \n","
\n","
\n"," "]},"metadata":{},"execution_count":59}]},{"cell_type":"code","source":[],"metadata":{"id":"43Y5XlLbRSSu"},"execution_count":null,"outputs":[]}]} \ No newline at end of file