mirror of https://github.com/mistakes-23/backend.git
synced 2025-10-31 07:27:27 +03:00

updated file handling

This commit is contained in:
parent e35215f22a
commit f588fead51
@@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):
 
     class Meta:
         model = File
-        fields = ["name", "ideal_title", "file", "images", "text_locations"]
+        fields = [
+            "name",
+            "ideal_title",
+            "file",
+            "processed_file",
+            "images",
+            "text_locations",
+        ]
 
 
 class UpdateFileTitleSerializer(serializers.Serializer):
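With the new field exposed, FullFileSerializer responses carry a processed_file entry alongside the original upload. A hypothetical payload sketch (every value below is illustrative, not taken from the repository):

# Hypothetical FullFileSerializer response after this change;
# all values are illustrative.
example_payload = {
    "name": "report.pdf",
    "ideal_title": "Annual Report 2023",
    "file": "/media/uploads/report.pdf",
    "processed_file": "/media/processed/report.pdf",  # None until processing writes it
    "images": [],
    "text_locations": [],
}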
@@ -0,0 +1,21 @@
+# Generated by Django 4.2.2 on 2023-06-25 02:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        (
+            "processor",
+            "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more",
+        ),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="file",
+            name="processed_file",
+            field=models.FileField(blank=True, null=True, upload_to="processed/"),
+        ),
+    ]
@@ -14,6 +14,7 @@ class File(models.Model):
         upload_to="uploads/",
         validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
     )
+    processed_file = models.FileField(upload_to="processed/", null=True, blank=True)
 
     class Meta:
         ordering = ("-uploaded",)
@@ -1,3 +1,7 @@
+from typing import Tuple
+from io import BytesIO
+import re
+import fitz
 from django.core.cache import cache
 from rest_framework.exceptions import NotFound
 
@@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
         "error": error,
         "error_description": error_description,
     }
+
+
+def extract_info(input_file: str):
+    """
+    Extracts file info
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    output = {
+        "File": input_file,
+        "Encrypted": ("True" if pdfDoc.is_encrypted else "False"),
+    }
+    # If the PDF is encrypted, its metadata cannot be extracted
+    if not pdfDoc.is_encrypted:
+        for key, value in pdfDoc.metadata.items():
+            output[key] = value
+    # Display the file info
+    print("## File Information ##################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
+    print("######################################################################")
+    return True, output
+
+
+def search_for_text(lines, search_str):
+    """
+    Search for the search string within the document lines
+    """
+    # Return a list so that callers iterate over whole matches,
+    # not over the characters of a single string
+    if search_str in lines:
+        return [search_str]
+    return []
+
+
+def redact_matching_data(page, matched_values):
+    """
+    Redacts matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        # Redact each area where the value occurs
+        for area in matching_val_area:
+            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
+    # Apply the redactions
+    page.apply_redactions()
+    return matches_found
+
+
+def frame_matching_data(page, matched_values):
+    """
+    Frames matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        for area in matching_val_area:
+            if isinstance(area, fitz.Rect):
+                # Draw a rectangle around matched values
+                annot = page.add_rect_annot(area)
+                annot.set_colors(stroke=fitz.utils.getColor("red"))
+                annot.update()
+    return matches_found
+
+
+def highlight_matching_data(page, matched_values, type):
+    """
+    Highlight matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        highlight = None
+        if type == "Highlight":
+            highlight = page.add_highlight_annot(matching_val_area)
+        elif type == "Squiggly":
+            highlight = page.add_squiggly_annot(matching_val_area)
+        elif type == "Underline":
+            highlight = page.add_underline_annot(matching_val_area)
+        elif type == "Strikeout":
+            highlight = page.add_strikeout_annot(matching_val_area)
+        else:
+            highlight = page.add_highlight_annot(matching_val_area)
+        # To change the highlight color:
+        # highlight.set_colors(stroke=fitz.utils.getColor("white"), fill=fitz.utils.getColor("red"))
+        highlight.update()
+    return matches_found
+
+
+def process_data(
+    input_file: str,
+    search_str: str,
+    pages: Tuple = None,
+    action: str = "Highlight",
+):
+    """
+    Process the pages of the PDF file
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # The annotated PDF is accumulated in an in-memory buffer
+    output_buffer = BytesIO()
+    total_matches = 0
+    # Iterate through the pages
+    for pg in range(len(pdfDoc)):
+        # If restricted to specific pages, skip the others
+        if pages:
+            if str(pg) not in pages:
+                continue
+        # Select the page
+        page = pdfDoc[pg]
+        # Get matching data: extract the page text and search it
+        page_lines = page.get_text("text")
+        matched_values = search_for_text(page_lines, search_str)
+        if matched_values:
+            if action == "Redact":
+                matches_found = redact_matching_data(page, matched_values)
+            elif action == "Frame":
+                matches_found = frame_matching_data(page, matched_values)
+            elif action in ("Highlight", "Squiggly", "Underline", "Strikeout"):
+                matches_found = highlight_matching_data(page, matched_values, action)
+            else:
+                matches_found = highlight_matching_data(
+                    page, matched_values, "Highlight"
+                )
+            total_matches += matches_found
+    print(
+        f"{total_matches} match(es) found for search string {search_str} in input file {input_file}"
+    )
+    # Save the annotated document to the buffer
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    # Overwrite the input file with the annotated version
+    with open(input_file, mode="wb") as f:
+        f.write(output_buffer.getbuffer())
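For context, process_data is a self-contained entry point over the helpers above. A hypothetical invocation (the path, search string, and page selection are illustrative, not part of the commit):

# Hypothetical usage sketch; the file path and search string are invented.
process_data(
    input_file="uploads/report.pdf",
    search_str="Annual Report 2023",
    pages=("0", "1"),   # note: page indices are compared as strings
    action="Redact",    # or "Frame", "Highlight", "Squiggly", "Underline", "Strikeout"
)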
@@ -1,8 +1,10 @@
 import os
 
 import shutil
+from io import BytesIO
 from time import sleep
 
+import fitz
 from celery import shared_task
 from django.core.files import File
 from pdf2image import convert_from_path
@@ -26,8 +28,6 @@ def process_pdf(pk: str):
     cache.set(f"{pk}-features_loaded", False)
     cache.set(f"{pk}-processed", 1)
     extract_pdf_features.apply_async(kwargs={"pk": pk})
-    split_pdf_into_images.apply_async(kwargs={"pk": pk})
-    load_pdf.apply_async(kwargs={"pk": pk})
     return pk
 
 
@@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
         text_locations = get_matches(file.file.path, target)
         file.ideal_title = target
         file.text_locations = text_locations
+
+        pdfDoc = fitz.open(file.file.path)
+        for loc in text_locations:
+            page = pdfDoc[loc["page"] - 1]
+            matching_val_area = page.search_for(loc["raw_text"])
+            for rect in matching_val_area:
+                page.add_highlight_annot(rect)
+        output_buffer = BytesIO()
+        # Serialize the annotated document before closing it; without this
+        # save the buffer stays empty and the write below truncates the PDF
+        pdfDoc.save(output_buffer)
+        pdfDoc.close()
+        with open(file.file.path, mode="wb") as f:
+            f.write(output_buffer.getbuffer())
+
         file.save()
     cache.set(f"{pk}-features_loaded", True)
+    split_pdf_into_images.apply_async(kwargs={"pk": pk})
+    load_pdf.apply_async(kwargs={"pk": pk})
+    # create_processed_pdf.apply_async(kwargs={"pk": pk})
     return pk
 
 
@@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
     return pk
 
 
+# @shared_task
+# def create_processed_pdf(pk: str):
+#     file = FileModel.objects.get(pk=pk)
+#     f_path = "processed_" + file.file.path.split("/")[-1]
+#     shutil.copy(file.file.path, f_path)
+#
+#     for loc in file.text_locations:
+#         highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
+#
+#     os.remove(f_path)
+
+
 @shared_task
 def split_pdf_into_images(pk: str):
     file = FileModel.objects.get(pk=pk)
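Two things stand out in this file: split_pdf_into_images and load_pdf now run only after feature extraction completes (they were removed from process_pdf above), and the commented-out create_processed_pdf hints at where the new processed_file field is headed, since the active task still overwrites the original upload in place. One way the annotated buffer could be stored in processed_file instead, assuming the model change above (the helper name is hypothetical):

from io import BytesIO

from django.core.files.base import ContentFile


def save_processed_copy(file, output_buffer: BytesIO):
    # Hypothetical helper: keep the original upload intact and store
    # the annotated PDF in the new processed_file field instead.
    file.processed_file.save(
        "processed_" + file.file.name.split("/")[-1],
        ContentFile(output_buffer.getvalue()),
        save=True,
    )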
ml/main.py (131 changed lines)
@@ -1,6 +1,9 @@
 import re
+import math
+import spacy
 import pickle
 import warnings
+import Levenshtein
 import numpy as np
 import pandas as pd
 import Levenshtein as lev
@@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
     return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()
 
 
-def calculate_distances(target, list_of_strings):
+def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
     target_length = len(target.split())
-    distances = {}
+    min_distances = []
+
+    stride_length = math.ceil(target_length * stride_fraction)
 
     for string in list_of_strings:
+        all_distances = []
         string_words = string.split()
 
-        # If the string has at least as many words as the target
-        if len(string_words) >= target_length:
-            for i in range(len(string_words) - target_length + 1):
+        if len(string_words) > target_length:
+            i = 0
+            while i < len(string_words) - target_length + 1:
                 window = " ".join(string_words[i : i + target_length])
-                distance = lev.distance(target, window)
 
-                # Save the distance for this window
-                distances[window] = (distance / len(target)) * 100
+                distance = lev.distance(target, window) / len(target)
+                if distance < threshold:
+                    # Refine a coarse hit with a word-by-word scan of the
+                    # neighbouring windows
+                    for j in range(
+                        max(i - target_length, 0),
+                        min(i + target_length, len(string_words) - target_length + 1),
+                    ):
+                        detailed_window = " ".join(string_words[j : j + target_length])
+                        detailed_distance = lev.distance(target, detailed_window) / len(
+                            target
+                        )
+
+                        all_distances.append((detailed_window, detailed_distance * 100))
+                i += stride_length
         else:
-            # If the string has fewer words than the target
-            distance = lev.distance(target, string)
-            distances[string] = (distance / len(target)) * 100
+            # The string has fewer words than the target: compare it whole
+            dist = lev.distance(target, string) / len(target)
+            all_distances.append((string, dist * 100))
 
-    return distances
+        if all_distances:
+            min_window = min(all_distances, key=lambda x: x[1])
+            min_distances.append([min_window[0], min_window[1]])
+
+    return min_distances
 
 
 def replace_multiple_spaces(text):
     return re.sub(" +", " ", text)
 
 
+nlp = spacy.load("ru_core_news_sm")
+
+
+def remove_special_characters(string):
+    return re.sub(r"\W", "", string)
+
+
+def difference_type(word1, word2):
+    if word1 == word2:
+        return None  # the words match, skip them
+
+    if remove_special_characters(word1) == remove_special_characters(word2):
+        return "Пропущен специальный символ"  # a special character is missing
+
+    if word1.lower() == word2.lower():
+        return "Разная капитализация слов"  # different capitalization
+
+    if word1.isdigit() and word2.isdigit():
+        if abs(int(word1) - int(word2)) < 10:
+            return "Небольшое числовое различие"  # small numeric difference
+        else:
+            return "Разные числа"  # different numbers
+
+    token1 = nlp(word1)[0]
+    token2 = nlp(word2)[0]
+    if token1.lemma_ == token2.lemma_:
+        if token1.pos_ != token2.pos_:
+            return "Разные формы слова"  # different word forms
+        else:
+            return "Одинаковый корень, но разные формы"  # same root, different forms
+
+    if Levenshtein.distance(word1, word2) <= 2:
+        return "Возможная орфографическая ошибка или опечатка"  # likely a spelling error or typo
+    return "Разные слова"  # different words
+
+
+def compare_strings(str1, str2):
+    words1 = str1.split()
+    words2 = str2.split()
+
+    words1_only = set(words1) - set(words2)
+    words2_only = set(words2) - set(words1)
+
+    differences = []
+    mn_len = min(len(words1), len(words2))
+    for i in range(mn_len):
+        difference = difference_type(words1[i], words2[i])
+        differences.append((words1[i], words2[i], difference))
+
+    for word in words1_only:
+        differences.append((word, None, "Word only in first string"))
+
+    for word in words2_only:
+        differences.append((None, word, "Word only in second string"))
+
+    diff_types = set()
+    for diff in differences:
+        if diff[2]:
+            diff_types.add(diff[2])
+
+    return differences, diff_types
+
+
 def get_matches(file, target):
+    target = replace_multiple_spaces(target)
+
     result = []
     for i, page_layout in enumerate(tqdm(extract_pages(file))):
         _x1, _y1, _x2, _y2 = page_layout.bbox
         texts = []
         relative_coords = []
+        d = {}
         for element in page_layout:
             if isinstance(element, LTTextContainer):
                 # print(element.get_text())
                 x1, y1, x2, y2 = element.bbox
-                relative_coords.append(
-                    [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
-                )
+                raw = element.get_text()
+                text = replace_multiple_spaces(raw.replace("\n", " ").strip())
+                # Skip very short fragments; remember the raw text for later lookup
+                if len(text) > 3:
+                    relative_coords.append(
+                        [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
+                    )
+                    texts.append(text)
+                    d[text] = raw
 
-                texts.append(
-                    replace_multiple_spaces(element.get_text().replace("\n", ""))
-                )
         distances = calculate_distances(target, texts)
 
-        for window, distance in distances.items():
+        for window, distance in distances:
             if distance / len(target) < 0.2:
                 # print(i)
                 # print(window)
                 for j in range(len(texts)):
                     if window in texts[j]:
+                        raw_text = d[texts[j]]
                         rel_coord = relative_coords[j]
                         break
+                difference, diff_types = compare_strings(window, target)
                 result.append(
                     {
                         "page": i + 1,
                         "window": window,
                         "coordinates": rel_coord,
                         "distance": distance / len(target),
+                        "diff_type": list(diff_types),
+                        "raw_text": raw_text,
                     }
                 )
     return result
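To illustrate the reworked matching: calculate_distances now slides a target-length word window with a coarse stride and only rescans the neighbouring windows word by word when a coarse window falls under the threshold, returning one best [window, percent_distance] pair per candidate string. A small self-contained run (data invented):

# Invented data; calculate_distances is the function defined above.
target = "annual financial report"
candidates = [
    "the annual financial reprot of the company",  # contains one typo
    "unrelated text",
]
print(calculate_distances(target, candidates))
# The near-match window scores a small normalized distance;
# "unrelated text" is compared whole and scores far higher.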
poetry.lock (generated, 1059 changed lines): file diff suppressed because it is too large
@@ -54,7 +54,9 @@ levenshtein = "^0.21.1"
 pdfminer-six = "^20221105"
 pandas = "^2.0.2"
 tqdm = "^4.65.0"
-easyocr = "^1.7.0"
+pymupdf = "^1.22.5"
+spacy = "^3.5.3"
+python-levenshtein = "^0.21.1"
 
 
 [build-system]
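The dependency changes track the code: easyocr is dropped, while PyMuPDF (imported as fitz), spaCy, and python-Levenshtein back the new highlighting and word-comparison paths. Note that spacy.load("ru_core_news_sm") in ml/main.py also requires the model package itself, which is not pinned here; it is normally installed with:

python -m spacy download ru_core_news_sm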