updated file handling

This commit is contained in:
Alexander Karpov 2023-06-25 07:24:29 +03:00
parent e35215f22a
commit f588fead51
8 changed files with 922 additions and 486 deletions

View File

@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):
class Meta:
model = File
fields = ["name", "ideal_title", "file", "images", "text_locations"]
fields = [
"name",
"ideal_title",
"file",
"processed_file",
"images",
"text_locations",
]
class UpdateFileTitleSerializer(serializers.Serializer):

View File

@ -0,0 +1,21 @@
# Generated by Django 4.2.2 on 2023-06-25 02:54
from django.db import migrations, models
class Migration(migrations.Migration):
    # Schema change: adds the optional ``processed_file`` FileField to the
    # ``file`` model of the ``processor`` app.

    # Must run after migration 0007 of the same app.
    dependencies = [
        ("processor", "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more"),
    ]

    operations = [
        migrations.AddField(
            model_name="file",
            name="processed_file",
            # Nullable and blank-able; files are stored under "processed/".
            field=models.FileField(
                upload_to="processed/",
                null=True,
                blank=True,
            ),
        ),
    ]

View File

@ -14,6 +14,7 @@ class File(models.Model):
upload_to="uploads/",
validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
)
processed_file = models.FileField(upload_to="processed/", null=True, blank=True)
class Meta:
ordering = ("-uploaded",)

View File

@ -1,3 +1,7 @@
from typing import Tuple
from io import BytesIO
import re
import fitz
from django.core.cache import cache
from rest_framework.exceptions import NotFound
@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
"error": error,
"error_description": error_description,
}
def extract_info(input_file: str):
    """
    Extracts file info

    Opens the PDF at ``input_file``, collects its path, its encryption flag
    and (for unencrypted documents) every metadata entry, prints the result
    to stdout and returns it.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        A ``(True, output)`` tuple where ``output`` maps "File", "Encrypted"
        and each metadata key to its value.
    """
    # The context manager closes the document even if reading metadata fails.
    with fitz.open(input_file) as pdf_doc:
        # Snake_case property, matching the other PyMuPDF calls in this module.
        encrypted = pdf_doc.is_encrypted
        output = {
            "File": input_file,
            "Encrypted": ("True" if encrypted else "False"),
        }
        # If PDF is encrypted the file metadata cannot be extracted
        if not encrypted:
            output.update(pdf_doc.metadata or {})
    # To Display File Info
    print("## File Information ##################################################")
    print("\n".join(f"{key}:{value}" for key, value in output.items()))
    print("######################################################################")
    return True, output
def search_for_text(lines, search_str):
    """
    Search for the search string within the document lines

    Args:
        lines: Text (or any container supporting ``in``) to look into.
        search_str: Exact value to look for.

    Returns:
        ``search_str`` itself when it is contained in ``lines``, otherwise
        ``None``. Note that the result is a single string, not a list of
        matches, so callers must not iterate over it.
    """
    # Explicit None keeps both return paths visible.
    return search_str if search_str in lines else None
def redact_matching_data(page, matched_values):
    """
    Redacts matching values

    Args:
        page: PDF page object exposing ``search_for``, ``add_redact_annot``
            and ``apply_redactions``.
        matched_values: Iterable of strings to redact. A single string is
            treated as one value rather than being iterated per character.

    Returns:
        Number of values processed (not the number of rectangles redacted).
    """
    # A bare string would otherwise be walked character by character.
    if isinstance(matched_values, str):
        matched_values = [matched_values]
    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # Redact every occurrence of the value with a black box
        for area in page.search_for(val):
            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
    # Apply the redaction once, after all annotations have been added
    page.apply_redactions()
    return matches_found
def frame_matching_data(page, matched_values):
    """
    frames matching values

    Draws a red rectangle annotation around every occurrence of each value.

    Args:
        page: PDF page object exposing ``search_for`` and ``add_rect_annot``.
        matched_values: Iterable of strings to frame. A single string is
            treated as one value rather than being iterated per character.

    Returns:
        Number of values processed (not the number of rectangles drawn).
    """
    # A bare string would otherwise be walked character by character.
    if isinstance(matched_values, str):
        matched_values = [matched_values]
    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        for area in page.search_for(val):
            # Only rectangle hits can be framed
            if not isinstance(area, fitz.Rect):
                continue
            # Draw a rectangle around matched values
            annot = page.add_rect_annot(area)
            annot.set_colors(stroke=fitz.utils.getColor("red"))
            annot.update()
    return matches_found
def highlight_matching_data(page, matched_values, type):
    """
    Highlight matching values

    Args:
        page: PDF page object exposing ``search_for`` and the
            ``add_*_annot`` text-marker methods.
        matched_values: Iterable of strings to mark. A single string is
            treated as one value rather than being iterated per character.
        type: Marker style: "Highlight", "Squiggly", "Underline" or
            "Strikeout". Any other value falls back to "Highlight".

    Returns:
        Number of values processed (not the number of annotations created).
    """
    # A bare string would otherwise be walked character by character.
    if isinstance(matched_values, str):
        matched_values = [matched_values]
    marker_methods = {
        "Highlight": "add_highlight_annot",
        "Squiggly": "add_squiggly_annot",
        "Underline": "add_underline_annot",
        "Strikeout": "add_strikeout_annot",
    }
    add_marker = getattr(page, marker_methods.get(type, "add_highlight_annot"))
    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        matching_val_area = page.search_for(val)
        # Nothing located on the page: there is nothing to annotate.
        if not matching_val_area:
            continue
        highlight = add_marker(matching_val_area)
        # The marker call may yield no annotation; only update a real one.
        if highlight is not None:
            highlight.update()
    return matches_found
def process_data(
    input_file: str,
    search_str: str,
    pages: Tuple = None,
    action: str = "Highlight",
    output_file: str = None,
):
    """
    Process the pages of the PDF File

    Looks for ``search_str`` on each selected page, applies ``action`` to
    every occurrence and writes the resulting PDF back to disk.

    Args:
        input_file: Path of the PDF to process.
        search_str: Exact text to look for.
        pages: Optional collection of zero-based page indexes to restrict
            processing to; entries may be ints or strings. ``None`` or an
            empty collection means all pages.
        action: "Redact", "Frame", "Highlight", "Squiggly", "Underline" or
            "Strikeout". Unknown values are treated as "Highlight".
        output_file: Path to write the result to. Defaults to
            ``input_file``, i.e. the source PDF is overwritten.
    """
    # Normalise to strings so both (0, 2) and ("0", "2") select pages.
    wanted_pages = {str(p) for p in pages} if pages else None
    # Save the generated PDF to memory buffer
    output_buffer = BytesIO()
    total_matches = 0
    # Open the PDF; the context manager closes it even if processing fails
    with fitz.open(input_file) as pdf_doc:
        # Iterate through pages
        for pg in range(len(pdf_doc)):
            # If required for specific pages
            if wanted_pages is not None and str(pg) not in wanted_pages:
                continue
            # Select the page
            page = pdf_doc[pg]
            # Get Matching Data
            page_text = page.get_text("text")
            match = search_for_text(page_text, search_str)
            if not match:
                continue
            # search_for_text yields one string; the helpers iterate over
            # values, so wrap it instead of letting them loop per character.
            matched_values = [match] if isinstance(match, str) else match
            if action == "Redact":
                matches_found = redact_matching_data(page, matched_values)
            elif action == "Frame":
                matches_found = frame_matching_data(page, matched_values)
            else:
                # highlight_matching_data itself falls back to "Highlight"
                # for an unknown action.
                matches_found = highlight_matching_data(page, matched_values, action)
            total_matches += matches_found
        print(
            f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}"
        )
        # Save to output
        pdf_doc.save(output_buffer)
    # Write only after the document is closed, so the source file can be
    # replaced safely.
    with open(output_file or input_file, mode="wb") as f:
        f.write(output_buffer.getbuffer())

View File

@ -1,8 +1,10 @@
import os
import shutil
from io import BytesIO
from time import sleep
import fitz
from celery import shared_task
from django.core.files import File
from pdf2image import convert_from_path
@ -26,8 +28,6 @@ def process_pdf(pk: str):
cache.set(f"{pk}-features_loaded", False)
cache.set(f"{pk}-processed", 1)
extract_pdf_features.apply_async(kwargs={"pk": pk})
split_pdf_into_images.apply_async(kwargs={"pk": pk})
load_pdf.apply_async(kwargs={"pk": pk})
return pk
@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
text_locations = get_matches(file.file.path, target)
file.ideal_title = target
file.text_locations = text_locations
# Highlight every located occurrence directly in the uploaded PDF.
pdfDoc = fitz.open(file.file.path)
try:
    for loc in text_locations:
        # "page" is stored 1-based in text_locations; documents index from 0.
        page = pdfDoc[loc["page"] - 1]
        for rect in page.search_for(loc["raw_text"]):
            page.add_highlight_annot(rect)
    output_buffer = BytesIO()
    # Serialise the annotated document BEFORE closing it; without this the
    # buffer stays empty and the write below truncates the PDF to zero bytes.
    pdfDoc.save(output_buffer)
finally:
    pdfDoc.close()
with open(file.file.path, mode="wb") as f:
    f.write(output_buffer.getbuffer())
file.save()
cache.set(f"{pk}-features_loaded", True)
split_pdf_into_images.apply_async(kwargs={"pk": pk})
load_pdf.apply_async(kwargs={"pk": pk})
# create_processed_pdf.apply_async(kwargs={"pk": pk})
return pk
@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
return pk
# @shared_task
# def create_processed_pdf(pk: str):
# file = FileModel.objects.get(pk=pk)
# f_path = "processed_" + file.file.path.split("/")[-1]
# shutil.copy(file.file.path, f_path)
#
# for loc in file.text_locations:
# highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
#
# os.remove(f_path)
@shared_task
def split_pdf_into_images(pk: str):
file = FileModel.objects.get(pk=pk)

View File

@ -1,6 +1,9 @@
import re
import math
import spacy
import pickle
import warnings
import Levenshtein
import numpy as np
import pandas as pd
import Levenshtein as lev
@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()
def calculate_distances(target, list_of_strings):
def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
target_length = len(target.split())
distances = {}
min_distances = []
stride_length = math.ceil(target_length * stride_fraction)
for string in list_of_strings:
all_distances = []
string_words = string.split()
# If the string has at least as many words as the target
if len(string_words) >= target_length:
for i in range(len(string_words) - target_length + 1):
if len(string_words) > target_length:
i = 0
while i < len(string_words) - target_length + 1:
window = " ".join(string_words[i : i + target_length])
distance = lev.distance(target, window)
# Save the distance for this window
distances[window] = (distance / len(target)) * 100
distance = lev.distance(target, window) / len(target)
if distance < threshold:
for j in range(
max(i - target_length, 0),
min(i + target_length, len(string_words) - target_length + 1),
):
detailed_window = " ".join(string_words[j : j + target_length])
detailed_distance = lev.distance(target, detailed_window) / len(
target
)
all_distances.append((detailed_window, detailed_distance * 100))
i += stride_length
else:
# If the string has fewer words than the target
distance = lev.distance(target, string)
distances[string] = (distance / len(target)) * 100
i += stride_length
else:
dist = lev.distance(target, string) / len(target)
all_distances.append((string, dist * 100))
return distances
if all_distances:
min_window = min(all_distances, key=lambda x: x[1])
min_distances.append([min_window[0], min_window[1]])
return min_distances
def replace_multiple_spaces(text):
    """Collapse each run of space characters in ``text`` into one space.

    Only the space character is affected; tabs and newlines are left as-is.
    """
    space_run = re.compile(" +")
    return space_run.sub(" ", text)
# The ``ru_core_news_sm`` spaCy pipeline, loaded once when the module is
# imported; difference_type() uses it to read the lemma and part of speech
# of single words.
nlp = spacy.load("ru_core_news_sm")
def remove_special_characters(string):
    """Return ``string`` with every non-word character (regex ``\\W``) removed."""
    non_word = re.compile(r"\W")
    stripped = non_word.sub("", string)
    return stripped
def difference_type(word1, word2):
    """Classify how two words differ.

    The checks run from the most specific to the most general and the first
    one that applies wins: identical, equal after removing non-word
    characters, equal ignoring case, both numeric, same lemma, small edit
    distance, otherwise different words.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``None`` when the words are identical, otherwise one of the Russian
        label strings returned below.
    """
    # NOTE(review): the labels "специцальный" and "капитуляция" look like
    # typos, but they are runtime values that consumers may match on, so
    # they are kept unchanged here. Confirm before correcting them.
    if word1 == word2:
        return None  # words are identical, skip them

    if remove_special_characters(word1) == remove_special_characters(word2):
        return "Пропущен специцальный символ"

    if word1.lower() == word2.lower():
        return "Разная капитуляция слов"

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as "²" that int() rejects with ValueError.
    if word1.isdecimal() and word2.isdecimal():
        if abs(int(word1) - int(word2)) < 10:
            return "Небольшое числовое различие"
        return "Разные числа"

    doc1, doc2 = nlp(word1), nlp(word2)
    # An empty input produces an empty Doc; skip the lemma check instead of
    # failing on doc[0].
    if len(doc1) and len(doc2):
        token1, token2 = doc1[0], doc2[0]
        if token1.lemma_ == token2.lemma_:
            if token1.pos_ != token2.pos_:
                return "Разные формы слова"
            return "Одинаковый корень, но разные формы"

    if Levenshtein.distance(word1, word2) <= 2:
        return "Возможная орфографическая ошибка или опечатка"

    return "Разные слова"
def compare_strings(str1, str2):
    """Compare two strings word by word.

    Words at the same position (up to the shorter string's length) are
    classified with ``difference_type``. Words that occur in only one of the
    strings are then appended in order of first appearance, so the result is
    the same on every run.

    Args:
        str1: First string; split on whitespace.
        str2: Second string; split on whitespace.

    Returns:
        A ``(differences, diff_types)`` tuple. ``differences`` is a list of
        ``(word1, word2, label)`` tuples, where ``label`` is ``None`` for
        identical words. ``diff_types`` is the set of distinct non-empty
        labels.
    """
    words1 = str1.split()
    words2 = str2.split()

    vocab1 = set(words1)
    vocab2 = set(words2)
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating
    # a set of strings would give a different order from run to run.
    words1_only = [w for w in dict.fromkeys(words1) if w not in vocab2]
    words2_only = [w for w in dict.fromkeys(words2) if w not in vocab1]

    # Positional comparison; zip stops at the shorter word list.
    differences = [(w1, w2, difference_type(w1, w2)) for w1, w2 in zip(words1, words2)]
    differences.extend((word, None, "Word only in first string") for word in words1_only)
    differences.extend((None, word, "Word only in second string") for word in words2_only)

    diff_types = {label for _, _, label in differences if label}

    return differences, diff_types
def get_matches(file, target):
target = replace_multiple_spaces(target)
result = []
for i, page_layout in enumerate(tqdm(extract_pages(file))):
_x1, _y1, _x2, _y2 = page_layout.bbox
texts = []
relative_coords = []
d = {}
for element in page_layout:
if isinstance(element, LTTextContainer):
# print(element.get_text())
x1, y1, x2, y2 = element.bbox
raw = element.get_text()
text = replace_multiple_spaces(raw.replace("\n", " ").strip())
if len(text) > 3:
relative_coords.append(
[x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
([x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2])
)
texts.append(text)
d[text] = raw
texts.append(
replace_multiple_spaces(element.get_text().replace("\n", ""))
)
distances = calculate_distances(target, texts)
for window, distance in distances.items():
for window, distance in distances:
if distance / len(target) < 0.2:
# print(i)
# print(window)
for j in range(len(texts)):
if window in texts[j]:
raw_text = d[texts[j]]
rel_coord = relative_coords[j]
break
difference, diff_types = compare_strings(window, target)
result.append(
{
"page": i + 1,
"window": window,
"coordinates": rel_coord,
"distance": distance / len(target),
"diff_type": list(diff_types),
"raw_text": raw_text,
}
)
return result

1059
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -54,7 +54,9 @@ levenshtein = "^0.21.1"
pdfminer-six = "^20221105"
pandas = "^2.0.2"
tqdm = "^4.65.0"
easyocr = "^1.7.0"
pymupdf = "^1.22.5"
spacy = "^3.5.3"
python-levenshtein = "^0.21.1"
[build-system]