Mirror of https://github.com/mistakes-23/backend.git
Synced 2025-10-30 23:17:27 +03:00

	updated file handling

commit f588fead51
parent e35215f22a
@@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):

     class Meta:
         model = File
-        fields = ["name", "ideal_title", "file", "images", "text_locations"]
+        fields = [
+            "name",
+            "ideal_title",
+            "file",
+            "processed_file",
+            "images",
+            "text_locations",
+        ]


 class UpdateFileTitleSerializer(serializers.Serializer):

@@ -0,0 +1,21 @@
+# Generated by Django 4.2.2 on 2023-06-25 02:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        (
+            "processor",
+            "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more",
+        ),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="file",
+            name="processed_file",
+            field=models.FileField(blank=True, null=True, upload_to="processed/"),
+        ),
+    ]

@@ -14,6 +14,7 @@ class File(models.Model):
         upload_to="uploads/",
         validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
     )
+    processed_file = models.FileField(upload_to="processed/", null=True, blank=True)

     class Meta:
         ordering = ("-uploaded",)

@@ -1,3 +1,7 @@
+from typing import Tuple
+from io import BytesIO
+import re
+import fitz
 from django.core.cache import cache
 from rest_framework.exceptions import NotFound

@@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
         "error": error,
         "error_description": error_description,
     }


+def extract_info(input_file: str):
+    """
+    Extracts file info.
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    output = {
+        "File": input_file,
+        # snake_case API: the old camelCase aliases (isEncrypted, searchFor, ...)
+        # are not available in recent PyMuPDF releases
+        "Encrypted": ("True" if pdfDoc.is_encrypted else "False"),
+    }
+    # Metadata cannot be extracted from an encrypted PDF
+    if not pdfDoc.is_encrypted:
+        for key, value in pdfDoc.metadata.items():
+            output[key] = value
+    # Display file info
+    print("## File Information ##################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
+    print("######################################################################")
+    return True, output
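+# Document.metadata exposes the standard PDF keys (title, author, subject,
+# keywords, creator, producer, creationDate, modDate, ...).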


+def search_for_text(lines, search_str):
+    """
+    Search for the search string within the document lines.
+    """
+    # Return a list so callers iterate over whole matches rather than over
+    # the characters of a single string.
+    if search_str in lines:
+        return [search_str]
+    return []


+def redact_matching_data(page, matched_values):
+    """
+    Redacts matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        # Redact each occurrence
+        for area in matching_val_area:
+            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
+    # Apply the redactions
+    page.apply_redactions()
+    return matches_found
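+# apply_redactions() is destructive: the matched text is removed from the
+# page content, not merely covered by a black box.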


+def frame_matching_data(page, matched_values):
+    """
+    Frames matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        for area in matching_val_area:
+            if isinstance(area, fitz.Rect):
+                # Draw a rectangle around the matched value
+                annot = page.add_rect_annot(area)
+                annot.set_colors(stroke=fitz.utils.getColor("red"))
+                # To remove the matched text instead:
+                # page.add_freetext_annot(area, " ")
+                annot.update()
+    return matches_found
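+# Appearance changes (colors, borders) only take effect after annot.update().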


+def highlight_matching_data(page, matched_values, type):
+    """
+    Highlight matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        if type == "Squiggly":
+            highlight = page.add_squiggly_annot(matching_val_area)
+        elif type == "Underline":
+            highlight = page.add_underline_annot(matching_val_area)
+        elif type == "Strikeout":
+            highlight = page.add_strikeout_annot(matching_val_area)
+        else:
+            # "Highlight" and any unrecognized type fall back to a highlight
+            highlight = page.add_highlight_annot(matching_val_area)
+        # To change the highlight color:
+        # highlight.set_colors(stroke=(0, 0, 1), fill=(0.75, 0.8, 0.95))
+        highlight.update()
+    return matches_found


+def process_data(
+    input_file: str,
+    search_str: str,
+    pages: Tuple = None,
+    action: str = "Highlight",
+):
+    """
+    Process the pages of the PDF file.
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # Stage the generated PDF in a memory buffer
+    output_buffer = BytesIO()
+    total_matches = 0
+    # Iterate through the pages
+    for pg in range(len(pdfDoc)):
+        # Restrict to specific pages if requested (page numbers as strings)
+        if pages and str(pg) not in pages:
+            continue
+        # Select the page
+        page = pdfDoc[pg]
+        # Get the page text and look for the search string
+        page_lines = page.get_text("text")
+        matched_values = search_for_text(page_lines, search_str)
+        if matched_values:
+            if action == "Redact":
+                matches_found = redact_matching_data(page, matched_values)
+            elif action == "Frame":
+                matches_found = frame_matching_data(page, matched_values)
+            elif action in ("Highlight", "Squiggly", "Underline", "Strikeout"):
+                matches_found = highlight_matching_data(page, matched_values, action)
+            else:
+                matches_found = highlight_matching_data(
+                    page, matched_values, "Highlight"
+                )
+            total_matches += matches_found
+    print(
+        f"{total_matches} match(es) of search string {search_str!r} in {input_file}"
+    )
+    # Save to the buffer, then overwrite the input file with the result
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    with open(input_file, mode="wb") as f:
+        f.write(output_buffer.getbuffer())

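+# Minimal usage sketch (hypothetical file name):
+#   process_data("uploads/contract.pdf", "Acme LLC", action="Redact")
+#   process_data("uploads/contract.pdf", "Acme LLC", pages=("0", "2"), action="Frame")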

@@ -1,8 +1,10 @@
 import os

 import shutil
+from io import BytesIO
 from time import sleep

+import fitz
 from celery import shared_task
 from django.core.files import File
 from pdf2image import convert_from_path
@@ -26,8 +28,6 @@ def process_pdf(pk: str):
     cache.set(f"{pk}-features_loaded", False)
     cache.set(f"{pk}-processed", 1)
     extract_pdf_features.apply_async(kwargs={"pk": pk})
-    split_pdf_into_images.apply_async(kwargs={"pk": pk})
-    load_pdf.apply_async(kwargs={"pk": pk})
     return pk


@@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
         text_locations = get_matches(file.file.path, target)
         file.ideal_title = target
         file.text_locations = text_locations
+
+        pdfDoc = fitz.open(file.file.path)
+        for loc in text_locations:
+            page = pdfDoc[loc["page"] - 1]
+            matching_val_area = page.search_for(loc["raw_text"])
+            for rect in matching_val_area:
+                page.add_highlight_annot(rect)
+        output_buffer = BytesIO()
+        # Save into the buffer before closing; without this the buffer stays
+        # empty and the write below would truncate the original file.
+        pdfDoc.save(output_buffer)
+        pdfDoc.close()
+        with open(file.file.path, mode="wb") as f:
+            f.write(output_buffer.getbuffer())
+
         file.save()
     cache.set(f"{pk}-features_loaded", True)
+    split_pdf_into_images.apply_async(kwargs={"pk": pk})
+    load_pdf.apply_async(kwargs={"pk": pk})
+    # create_processed_pdf.apply_async(kwargs={"pk": pk})
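+    # Chained here rather than launched from process_pdf, so image splitting
+    # and loading only start once features are extracted and cached.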
     return pk


@@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
     return pk


+# @shared_task
+# def create_processed_pdf(pk: str):
+#     file = FileModel.objects.get(pk=pk)
+#     f_path = "processed_" + file.file.path.split("/")[-1]
+#     shutil.copy(file.file.path, f_path)
+#
+#     for loc in file.text_locations:
+#         highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
+#
+#     os.remove(f_path)
+
+
 @shared_task
 def split_pdf_into_images(pk: str):
     file = FileModel.objects.get(pk=pk)

ml/main.py (131 changed lines)
@@ -1,6 +1,9 @@
 import re
+import math
+import spacy
 import pickle
 import warnings
+import Levenshtein
 import numpy as np
 import pandas as pd
 import Levenshtein as lev
@@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
     return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()


-def calculate_distances(target, list_of_strings):
+def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
     target_length = len(target.split())
-    distances = {}
+    min_distances = []
+
+    stride_length = math.ceil(target_length * stride_fraction)

     for string in list_of_strings:
+        all_distances = []
         string_words = string.split()

-        # If the string has at least as many words as the target
-        if len(string_words) >= target_length:
-            for i in range(len(string_words) - target_length + 1):
+        if len(string_words) > target_length:
+            i = 0
+            while i < len(string_words) - target_length + 1:
                 window = " ".join(string_words[i : i + target_length])
-                distance = lev.distance(target, window)
-
-                # Save the distance for this window
-                distances[window] = (distance / len(target)) * 100
+                distance = lev.distance(target, window) / len(target)
+                if distance < threshold:
+                    for j in range(
+                        max(i - target_length, 0),
+                        min(i + target_length, len(string_words) - target_length + 1),
+                    ):
+                        detailed_window = " ".join(string_words[j : j + target_length])
+                        detailed_distance = lev.distance(target, detailed_window) / len(
+                            target
+                        )
+
+                        all_distances.append((detailed_window, detailed_distance * 100))
+                    i += stride_length
+                else:
+                    i += stride_length
         else:
-            # If the string has fewer words than the target
-            distance = lev.distance(target, string)
-            distances[string] = (distance / len(target)) * 100
+            dist = lev.distance(target, string) / len(target)
+            all_distances.append((string, dist * 100))

-    return distances
+        if all_distances:
+            min_window = min(all_distances, key=lambda x: x[1])
+            min_distances.append([min_window[0], min_window[1]])
+
+    return min_distances
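+# Coarse-to-fine scan: the window advances stride_length words at a time, and
+# once a window's normalized Levenshtein distance drops below `threshold` its
+# neighborhood is rescanned word by word; only the best window per string is kept.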


 def replace_multiple_spaces(text):
     return re.sub(" +", " ", text)


+nlp = spacy.load("ru_core_news_sm")
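+# Assumes the Russian model is installed: python -m spacy download ru_core_news_sm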


+def remove_special_characters(string):
+    return re.sub(r"\W", "", string)
+
+
+def difference_type(word1, word2):
+    if word1 == word2:
+        return None  # the words match, skip them
+
+    if remove_special_characters(word1) == remove_special_characters(word2):
+        return "Пропущен специальный символ"  # "missing special character"
+
+    if word1.lower() == word2.lower():
+        return "Разная капитализация слов"  # "different capitalization"
+
+    if word1.isdigit() and word2.isdigit():
+        if abs(int(word1) - int(word2)) < 10:
+            return "Небольшое числовое различие"  # "small numeric difference"
+        else:
+            return "Разные числа"  # "different numbers"
+
+    token1 = nlp(word1)[0]
+    token2 = nlp(word2)[0]
+    if token1.lemma_ == token2.lemma_:
+        if token1.pos_ != token2.pos_:
+            return "Разные формы слова"  # "different word forms"
+        else:
+            return "Одинаковый корень, но разные формы"  # "same root, different forms"
+
+    if Levenshtein.distance(word1, word2) <= 2:
+        return "Возможная орфографическая ошибка или опечатка"  # "likely typo"
+    return "Разные слова"  # "different words"


+def compare_strings(str1, str2):
+    words1 = str1.split()
+    words2 = str2.split()
+
+    words1_only = set(words1) - set(words2)
+    words2_only = set(words2) - set(words1)
+
+    differences = []
+    mn_len = min(len(words1), len(words2))
+    for i in range(mn_len):
+        difference = difference_type(words1[i], words2[i])
+        differences.append((words1[i], words2[i], difference))
+
+    for word in words1_only:
+        differences.append((word, None, "Word only in first string"))
+
+    for word in words2_only:
+        differences.append((None, word, "Word only in second string"))
+
+    diff_types = set()
+    for diff in differences:
+        if diff[2]:
+            diff_types.add(diff[2])
+
+    return differences, diff_types
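+# compare_strings pairs words positionally, labels each pair via difference_type,
+# and also reports words that occur in only one of the strings; diff_types is
+# the set of distinct labels found.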


 def get_matches(file, target):
     target = replace_multiple_spaces(target)

     result = []
     for i, page_layout in enumerate(tqdm(extract_pages(file))):
         _x1, _y1, _x2, _y2 = page_layout.bbox
         texts = []
         relative_coords = []
+        d = {}
         for element in page_layout:
             if isinstance(element, LTTextContainer):
                 x1, y1, x2, y2 = element.bbox
-                relative_coords.append(
-                    [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
-                )
-                texts.append(
-                    replace_multiple_spaces(element.get_text().replace("\n", ""))
-                )
+                raw = element.get_text()
+                text = replace_multiple_spaces(raw.replace("\n", " ").strip())
+                # Keep only fragments longer than 3 chars; remember the raw text
+                if len(text) > 3:
+                    relative_coords.append(
+                        [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
+                    )
+                    texts.append(text)
+                    d[text] = raw
         distances = calculate_distances(target, texts)

-        for window, distance in distances.items():
+        for window, distance in distances:
             if distance / len(target) < 0.2:
+                for j in range(len(texts)):
+                    if window in texts[j]:
+                        raw_text = d[texts[j]]
+                        rel_coord = relative_coords[j]
+                        break
+                difference, diff_types = compare_strings(window, target)
                 result.append(
                     {
                         "page": i + 1,
                         "window": window,
+                        "coordinates": rel_coord,
                         "distance": distance / len(target),
+                        "diff_type": list(diff_types),
+                        "raw_text": raw_text,
                     }
                 )
     return result
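+# Note: raw_text and rel_coord come from the first text block containing the
+# window; if no block matches, the previous iteration's values (or a NameError
+# on the first match) would surface here.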

poetry.lock (1059 lines, generated)
File diff suppressed because it is too large
@@ -54,7 +54,9 @@ levenshtein = "^0.21.1"
 pdfminer-six = "^20221105"
 pandas = "^2.0.2"
 tqdm = "^4.65.0"
 easyocr = "^1.7.0"
+pymupdf = "^1.22.5"
+spacy = "^3.5.3"
 python-levenshtein = "^0.21.1"


 [build-system]