Mirror of https://github.com/mistakes-23/backend.git
Synced 2025-10-30 23:17:27 +03:00

	updated file handling

commit f588fead51
parent e35215f22a
@@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):

     class Meta:
         model = File
-        fields = ["name", "ideal_title", "file", "images", "text_locations"]
+        fields = [
+            "name",
+            "ideal_title",
+            "file",
+            "processed_file",
+            "images",
+            "text_locations",
+        ]


 class UpdateFileTitleSerializer(serializers.Serializer):

@@ -0,0 +1,21 @@
+# Generated by Django 4.2.2 on 2023-06-25 02:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        (
+            "processor",
+            "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more",
+        ),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="file",
+            name="processed_file",
+            field=models.FileField(blank=True, null=True, upload_to="processed/"),
+        ),
+    ]

@@ -14,6 +14,7 @@ class File(models.Model):
         upload_to="uploads/",
         validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
     )
+    processed_file = models.FileField(upload_to="processed/", null=True, blank=True)

     class Meta:
         ordering = ("-uploaded",)

@@ -1,3 +1,7 @@
+from typing import Tuple
+from io import BytesIO
+import re
+import fitz
 from django.core.cache import cache
 from rest_framework.exceptions import NotFound

@@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
         "error": error,
         "error_description": error_description,
     }


+def extract_info(input_file: str):
+    """
+    Extracts file info.
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    output = {
+        "File": input_file,
+        # snake_case API: the old camelCase aliases (isEncrypted, searchFor, ...)
+        # are not available in recent PyMuPDF releases
+        "Encrypted": ("True" if pdfDoc.is_encrypted else "False"),
+    }
+    # Metadata cannot be extracted from an encrypted PDF
+    if not pdfDoc.is_encrypted:
+        for key, value in pdfDoc.metadata.items():
+            output[key] = value
+    # Display file info
+    print("## File Information ##################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
+    print("######################################################################")
+    return True, output
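+# Document.metadata exposes the standard PDF keys (title, author, subject,
+# keywords, creator, producer, creationDate, modDate, ...).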


+def search_for_text(lines, search_str):
+    """
+    Search for the search string within the document lines.
+    """
+    # Return a list so callers iterate over whole matches rather than over
+    # the characters of a single string.
+    if search_str in lines:
+        return [search_str]
+    return []


+def redact_matching_data(page, matched_values):
+    """
+    Redacts matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        # Redact each occurrence
+        for area in matching_val_area:
+            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
+    # Apply the redactions
+    page.apply_redactions()
+    return matches_found
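+# apply_redactions() is destructive: the matched text is removed from the
+# page content, not merely covered by a black box.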


+def frame_matching_data(page, matched_values):
+    """
+    Frames matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        for area in matching_val_area:
+            if isinstance(area, fitz.Rect):
+                # Draw a rectangle around the matched value
+                annot = page.add_rect_annot(area)
+                annot.set_colors(stroke=fitz.utils.getColor("red"))
+                # To remove the matched text instead:
+                # page.add_freetext_annot(area, " ")
+                annot.update()
+    return matches_found
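+# Appearance changes (colors, borders) only take effect after annot.update().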


+def highlight_matching_data(page, matched_values, type):
+    """
+    Highlight matching values.
+    """
+    matches_found = 0
+    # Loop over the matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        if type == "Squiggly":
+            highlight = page.add_squiggly_annot(matching_val_area)
+        elif type == "Underline":
+            highlight = page.add_underline_annot(matching_val_area)
+        elif type == "Strikeout":
+            highlight = page.add_strikeout_annot(matching_val_area)
+        else:
+            # "Highlight" and any unrecognized type fall back to a highlight
+            highlight = page.add_highlight_annot(matching_val_area)
+        # To change the highlight color:
+        # highlight.set_colors(stroke=(0, 0, 1), fill=(0.75, 0.8, 0.95))
+        highlight.update()
+    return matches_found


+def process_data(
+    input_file: str,
+    search_str: str,
+    pages: Tuple = None,
+    action: str = "Highlight",
+):
+    """
+    Process the pages of the PDF file.
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # Stage the generated PDF in a memory buffer
+    output_buffer = BytesIO()
+    total_matches = 0
+    # Iterate through the pages
+    for pg in range(len(pdfDoc)):
+        # Restrict to specific pages if requested (page numbers as strings)
+        if pages and str(pg) not in pages:
+            continue
+        # Select the page
+        page = pdfDoc[pg]
+        # Get the page text and look for the search string
+        page_lines = page.get_text("text")
+        matched_values = search_for_text(page_lines, search_str)
+        if matched_values:
+            if action == "Redact":
+                matches_found = redact_matching_data(page, matched_values)
+            elif action == "Frame":
+                matches_found = frame_matching_data(page, matched_values)
+            elif action in ("Highlight", "Squiggly", "Underline", "Strikeout"):
+                matches_found = highlight_matching_data(page, matched_values, action)
+            else:
+                matches_found = highlight_matching_data(
+                    page, matched_values, "Highlight"
+                )
+            total_matches += matches_found
+    print(
+        f"{total_matches} match(es) of search string {search_str!r} in {input_file}"
+    )
+    # Save to the buffer, then overwrite the input file with the result
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    with open(input_file, mode="wb") as f:
+        f.write(output_buffer.getbuffer())

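+# Minimal usage sketch (hypothetical file name):
+#   process_data("uploads/contract.pdf", "Acme LLC", action="Redact")
+#   process_data("uploads/contract.pdf", "Acme LLC", pages=("0", "2"), action="Frame")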

@@ -1,8 +1,10 @@
 import os

 import shutil
+from io import BytesIO
 from time import sleep

+import fitz
 from celery import shared_task
 from django.core.files import File
 from pdf2image import convert_from_path
@@ -26,8 +28,6 @@ def process_pdf(pk: str):
     cache.set(f"{pk}-features_loaded", False)
     cache.set(f"{pk}-processed", 1)
     extract_pdf_features.apply_async(kwargs={"pk": pk})
-    split_pdf_into_images.apply_async(kwargs={"pk": pk})
-    load_pdf.apply_async(kwargs={"pk": pk})
     return pk


@@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
         text_locations = get_matches(file.file.path, target)
         file.ideal_title = target
         file.text_locations = text_locations
+
+        pdfDoc = fitz.open(file.file.path)
+        for loc in text_locations:
+            page = pdfDoc[loc["page"] - 1]
+            matching_val_area = page.search_for(loc["raw_text"])
+            for rect in matching_val_area:
+                page.add_highlight_annot(rect)
+        output_buffer = BytesIO()
+        # Save into the buffer before closing; without this the buffer stays
+        # empty and the write below would truncate the original file.
+        pdfDoc.save(output_buffer)
+        pdfDoc.close()
+        with open(file.file.path, mode="wb") as f:
+            f.write(output_buffer.getbuffer())
+
         file.save()
     cache.set(f"{pk}-features_loaded", True)
+    split_pdf_into_images.apply_async(kwargs={"pk": pk})
+    load_pdf.apply_async(kwargs={"pk": pk})
+    # create_processed_pdf.apply_async(kwargs={"pk": pk})
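+    # Chained here rather than launched from process_pdf, so image splitting
+    # and loading only start once features are extracted and cached.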
     return pk


@@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
     return pk


+# @shared_task
+# def create_processed_pdf(pk: str):
+#     file = FileModel.objects.get(pk=pk)
+#     f_path = "processed_" + file.file.path.split("/")[-1]
+#     shutil.copy(file.file.path, f_path)
+#
+#     for loc in file.text_locations:
+#         highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
+#
+#     os.remove(f_path)
+
+
 @shared_task
 def split_pdf_into_images(pk: str):
     file = FileModel.objects.get(pk=pk)

ml/main.py (131 changed lines)
@@ -1,6 +1,9 @@
 import re
+import math
+import spacy
 import pickle
 import warnings
+import Levenshtein
 import numpy as np
 import pandas as pd
 import Levenshtein as lev
@@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
     return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()


-def calculate_distances(target, list_of_strings):
+def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
     target_length = len(target.split())
-    distances = {}
+    min_distances = []
+
+    stride_length = math.ceil(target_length * stride_fraction)

     for string in list_of_strings:
+        all_distances = []
         string_words = string.split()

-        # If the string has at least as many words as the target
-        if len(string_words) >= target_length:
-            for i in range(len(string_words) - target_length + 1):
+        if len(string_words) > target_length:
+            i = 0
+            while i < len(string_words) - target_length + 1:
                 window = " ".join(string_words[i : i + target_length])
-                distance = lev.distance(target, window)
-
-                # Save the distance for this window
-                distances[window] = (distance / len(target)) * 100
+                distance = lev.distance(target, window) / len(target)
+                if distance < threshold:
+                    for j in range(
+                        max(i - target_length, 0),
+                        min(i + target_length, len(string_words) - target_length + 1),
+                    ):
+                        detailed_window = " ".join(string_words[j : j + target_length])
+                        detailed_distance = lev.distance(target, detailed_window) / len(
+                            target
+                        )
+
+                        all_distances.append((detailed_window, detailed_distance * 100))
+                    i += stride_length
+                else:
+                    i += stride_length
         else:
-            # If the string has fewer words than the target
-            distance = lev.distance(target, string)
-            distances[string] = (distance / len(target)) * 100
+            dist = lev.distance(target, string) / len(target)
+            all_distances.append((string, dist * 100))

-    return distances
+        if all_distances:
+            min_window = min(all_distances, key=lambda x: x[1])
+            min_distances.append([min_window[0], min_window[1]])
+
+    return min_distances
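+# Coarse-to-fine scan: the window advances stride_length words at a time, and
+# once a window's normalized Levenshtein distance drops below `threshold` its
+# neighborhood is rescanned word by word; only the best window per string is kept.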


 def replace_multiple_spaces(text):
     return re.sub(" +", " ", text)


+nlp = spacy.load("ru_core_news_sm")
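+# Assumes the Russian model is installed: python -m spacy download ru_core_news_sm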


+def remove_special_characters(string):
+    return re.sub(r"\W", "", string)
+
+
+def difference_type(word1, word2):
+    if word1 == word2:
+        return None  # the words match, skip them
+
+    if remove_special_characters(word1) == remove_special_characters(word2):
+        return "Пропущен специальный символ"  # "missing special character"
+
+    if word1.lower() == word2.lower():
+        return "Разная капитализация слов"  # "different capitalization"
+
+    if word1.isdigit() and word2.isdigit():
+        if abs(int(word1) - int(word2)) < 10:
+            return "Небольшое числовое различие"  # "small numeric difference"
+        else:
+            return "Разные числа"  # "different numbers"
+
+    token1 = nlp(word1)[0]
+    token2 = nlp(word2)[0]
+    if token1.lemma_ == token2.lemma_:
+        if token1.pos_ != token2.pos_:
+            return "Разные формы слова"  # "different word forms"
+        else:
+            return "Одинаковый корень, но разные формы"  # "same root, different forms"
+
+    if Levenshtein.distance(word1, word2) <= 2:
+        return "Возможная орфографическая ошибка или опечатка"  # "likely typo"
+    return "Разные слова"  # "different words"


+def compare_strings(str1, str2):
+    words1 = str1.split()
+    words2 = str2.split()
+
+    words1_only = set(words1) - set(words2)
+    words2_only = set(words2) - set(words1)
+
+    differences = []
+    mn_len = min(len(words1), len(words2))
+    for i in range(mn_len):
+        difference = difference_type(words1[i], words2[i])
+        differences.append((words1[i], words2[i], difference))
+
+    for word in words1_only:
+        differences.append((word, None, "Word only in first string"))
+
+    for word in words2_only:
+        differences.append((None, word, "Word only in second string"))
+
+    diff_types = set()
+    for diff in differences:
+        if diff[2]:
+            diff_types.add(diff[2])
+
+    return differences, diff_types
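+# compare_strings pairs words positionally, labels each pair via difference_type,
+# and also reports words that occur in only one of the strings; diff_types is
+# the set of distinct labels found.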


 def get_matches(file, target):
     target = replace_multiple_spaces(target)

     result = []
     for i, page_layout in enumerate(tqdm(extract_pages(file))):
         _x1, _y1, _x2, _y2 = page_layout.bbox
         texts = []
         relative_coords = []
+        d = {}
         for element in page_layout:
             if isinstance(element, LTTextContainer):
                 x1, y1, x2, y2 = element.bbox
-                relative_coords.append(
-                    [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
-                )
-                texts.append(
-                    replace_multiple_spaces(element.get_text().replace("\n", ""))
-                )
+                raw = element.get_text()
+                text = replace_multiple_spaces(raw.replace("\n", " ").strip())
+                # Keep only fragments longer than 3 chars; remember the raw text
+                if len(text) > 3:
+                    relative_coords.append(
+                        [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
+                    )
+                    texts.append(text)
+                    d[text] = raw
         distances = calculate_distances(target, texts)

-        for window, distance in distances.items():
+        for window, distance in distances:
             if distance / len(target) < 0.2:
+                for j in range(len(texts)):
+                    if window in texts[j]:
+                        raw_text = d[texts[j]]
+                        rel_coord = relative_coords[j]
+                        break
+                difference, diff_types = compare_strings(window, target)
                 result.append(
                     {
                         "page": i + 1,
                         "window": window,
+                        "coordinates": rel_coord,
                         "distance": distance / len(target),
+                        "diff_type": list(diff_types),
+                        "raw_text": raw_text,
                     }
                 )
     return result
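+# Note: raw_text and rel_coord come from the first text block containing the
+# window; if no block matches, the previous iteration's values (or a NameError
+# on the first match) would surface here.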

poetry.lock (1059 lines, generated)
File diff suppressed because it is too large
@@ -54,7 +54,9 @@ levenshtein = "^0.21.1"
 pdfminer-six = "^20221105"
 pandas = "^2.0.2"
 tqdm = "^4.65.0"
 easyocr = "^1.7.0"
+pymupdf = "^1.22.5"
+spacy = "^3.5.3"
 python-levenshtein = "^0.21.1"


 [build-system]