updated file handling

Alexander Karpov 2023-06-25 07:24:29 +03:00
parent e35215f22a
commit f588fead51
8 changed files with 922 additions and 486 deletions

View File

@@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):
     class Meta:
         model = File
-        fields = ["name", "ideal_title", "file", "images", "text_locations"]
+        fields = [
+            "name",
+            "ideal_title",
+            "file",
+            "processed_file",
+            "images",
+            "text_locations",
+        ]


 class UpdateFileTitleSerializer(serializers.Serializer):
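
With processed_file added to the serializer fields, the file detail endpoint also exposes a URL for the annotated copy once it exists. A sketch of the resulting payload, with all values invented for illustration:

payload = {
    "name": "report.pdf",
    "ideal_title": "annual report 2023",
    "file": "/media/uploads/report.pdf",
    "processed_file": "/media/processed/report.pdf",  # null until processing finishes
    "images": [],
    "text_locations": [],
}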

View File

@@ -0,0 +1,21 @@
+# Generated by Django 4.2.2 on 2023-06-25 02:54
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        (
+            "processor",
+            "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more",
+        ),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="file",
+            name="processed_file",
+            field=models.FileField(blank=True, null=True, upload_to="processed/"),
+        ),
+    ]

View File

@@ -14,6 +14,7 @@ class File(models.Model):
         upload_to="uploads/",
         validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
     )
+    processed_file = models.FileField(upload_to="processed/", null=True, blank=True)

     class Meta:
         ordering = ("-uploaded",)
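
A minimal sketch, not part of this commit, of how an in-memory buffer can be stored into the new field. The helper name and filename are hypothetical; FieldFile.save is standard Django API and writes the content under upload_to="processed/":

from io import BytesIO

from django.core.files.base import ContentFile


def attach_processed_pdf(file_obj, output_buffer: BytesIO) -> None:
    # file_obj is a processor File instance; the filename is illustrative
    file_obj.processed_file.save(
        f"{file_obj.pk}_processed.pdf",
        ContentFile(output_buffer.getvalue()),
        save=True,  # also persists the model instance
    )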

View File

@@ -1,3 +1,7 @@
+from typing import Tuple
+from io import BytesIO
+import re
+import fitz
 from django.core.cache import cache
 from rest_framework.exceptions import NotFound
@@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
         "error": error,
         "error_description": error_description,
     }
+
+
+def extract_info(input_file: str):
+    """
+    Extracts file info
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    output = {
+        "File": input_file,
+        "Encrypted": ("True" if pdfDoc.is_encrypted else "False"),
+    }
+    # If the PDF is encrypted, its metadata cannot be extracted
+    if not pdfDoc.is_encrypted:
+        for key, value in pdfDoc.metadata.items():
+            output[key] = value
+    # Display file info
+    print("## File Information ##################################################")
+    print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
+    print("######################################################################")
+    return True, output
+
+
+def search_for_text(lines, search_str):
+    """
+    Search for the search string within the document lines
+    """
+    # Return matches as a list so callers iterate over whole values,
+    # not over the characters of a single string
+    if search_str in lines:
+        return [search_str]
+    return []
+
+
+def redact_matching_data(page, matched_values):
+    """
+    Redacts matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        # Redact matching values
+        for area in matching_val_area:
+            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
+        # Apply the redactions
+        page.apply_redactions()
+    return matches_found
+
+
+def frame_matching_data(page, matched_values):
+    """
+    Frames matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        for area in matching_val_area:
+            if isinstance(area, fitz.Rect):
+                # Draw a rectangle around matched values
+                annot = page.add_rect_annot(area)
+                # , fill = fitz.utils.getColor('black')
+                annot.set_colors(stroke=fitz.utils.getColor("red"))
+                # If you want to remove matched data
+                # page.add_freetext_annot(area, ' ')
+                annot.update()
+    return matches_found
+
+
+def highlight_matching_data(page, matched_values, action):
+    """
+    Highlight matching values
+    """
+    matches_found = 0
+    # Loop through matching values
+    for val in matched_values:
+        matches_found += 1
+        matching_val_area = page.search_for(val)
+        # print("matching_val_area", matching_val_area)
+        if action == "Squiggly":
+            highlight = page.add_squiggly_annot(matching_val_area)
+        elif action == "Underline":
+            highlight = page.add_underline_annot(matching_val_area)
+        elif action == "Strikeout":
+            highlight = page.add_strikeout_annot(matching_val_area)
+        else:
+            highlight = page.add_highlight_annot(matching_val_area)
+        # To change the highlight color:
+        # highlight.set_colors({"stroke": (0, 0, 1), "fill": (0.75, 0.8, 0.95)})
+        # highlight.set_colors(stroke=fitz.utils.getColor('white'), fill=fitz.utils.getColor('red'))
+        # highlight.set_colors(colors=fitz.utils.getColor('red'))
+        highlight.update()
+    return matches_found
+
+
+def process_data(
+    input_file: str,
+    search_str: str,
+    pages: Tuple = None,
+    action: str = "Highlight",
+):
+    """
+    Process the pages of the PDF file
+    """
+    # Open the PDF
+    pdfDoc = fitz.open(input_file)
+    # The annotated PDF is collected in a memory buffer
+    output_buffer = BytesIO()
+    total_matches = 0
+    # Iterate through the pages
+    for pg in range(len(pdfDoc)):
+        # If restricted to specific pages (page numbers given as strings)
+        if pages:
+            if str(pg) not in pages:
+                continue
+        # Select the page
+        page = pdfDoc[pg]
+        # Get matching data: the page text as one string
+        page_lines = page.get_text("text")
+        matched_values = search_for_text(page_lines, search_str)
+        if matched_values:
+            if action == "Redact":
+                matches_found = redact_matching_data(page, matched_values)
+            elif action == "Frame":
+                matches_found = frame_matching_data(page, matched_values)
+            elif action in ("Highlight", "Squiggly", "Underline", "Strikeout"):
+                matches_found = highlight_matching_data(page, matched_values, action)
+            else:
+                matches_found = highlight_matching_data(
+                    page, matched_values, "Highlight"
+                )
+            total_matches += matches_found
+    print(
+        f"{total_matches} match(es) found of search string {search_str} in input file: {input_file}"
+    )
+    # Save the annotated document to the buffer
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    # Overwrite the input file with the annotated version
+    with open(input_file, mode="wb") as f:
+        f.write(output_buffer.getbuffer())
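
A hypothetical call to process_data as defined above (the path and search string are invented). Note that pages is compared against str(pg), so page numbers must be passed as strings, and the input file is overwritten in place:

process_data(
    "input.pdf",
    search_str="annual report 2023",
    pages=("0", "1"),  # zero-based page numbers, as strings
    action="Highlight",
)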

View File

@@ -1,8 +1,10 @@
 import os
 import shutil
+from io import BytesIO
 from time import sleep

+import fitz
 from celery import shared_task
 from django.core.files import File
 from pdf2image import convert_from_path
@@ -26,8 +28,6 @@ def process_pdf(pk: str):
     cache.set(f"{pk}-features_loaded", False)
     cache.set(f"{pk}-processed", 1)
     extract_pdf_features.apply_async(kwargs={"pk": pk})
-    split_pdf_into_images.apply_async(kwargs={"pk": pk})
-    load_pdf.apply_async(kwargs={"pk": pk})

     return pk
@@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
     text_locations = get_matches(file.file.path, target)
     file.ideal_title = target
     file.text_locations = text_locations
+
+    # Highlight every matched location in the stored PDF
+    pdfDoc = fitz.open(file.file.path)
+    for loc in text_locations:
+        page = pdfDoc[loc["page"] - 1]
+        matching_val_area = page.search_for(loc["raw_text"])
+        for rect in matching_val_area:
+            page.add_highlight_annot(rect)
+
+    # Write the annotated document back over the original file
+    output_buffer = BytesIO()
+    pdfDoc.save(output_buffer)
+    pdfDoc.close()
+    with open(file.file.path, mode="wb") as f:
+        f.write(output_buffer.getbuffer())
+
     file.save()
     cache.set(f"{pk}-features_loaded", True)
+    split_pdf_into_images.apply_async(kwargs={"pk": pk})
+    load_pdf.apply_async(kwargs={"pk": pk})
+    # create_processed_pdf.apply_async(kwargs={"pk": pk})

     return pk
@@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
     return pk


+# @shared_task
+# def create_processed_pdf(pk: str):
+#     file = FileModel.objects.get(pk=pk)
+#     f_path = "processed_" + file.file.path.split("/")[-1]
+#     shutil.copy(file.file.path, f_path)
+#
+#     for loc in file.text_locations:
+#         highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
+#
+#     os.remove(f_path)
+
+
 @shared_task
 def split_pdf_into_images(pk: str):
     file = FileModel.objects.get(pk=pk)
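
This commit moves the fan-out of split_pdf_into_images and load_pdf from process_pdf into extract_pdf_features, so both now start only after the title, text locations, and highlights are ready. An equivalent explicit pipeline, shown only to make the new ordering visible (the project wires these calls inside the tasks themselves rather than with a chain, and "some-pk" is a placeholder):

from celery import chain, group

# Sketch: extract_pdf_features must complete before the two follow-up tasks
chain(
    extract_pdf_features.si(pk="some-pk"),
    group(
        split_pdf_into_images.si(pk="some-pk"),
        load_pdf.si(pk="some-pk"),
    ),
).apply_async()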

View File

@@ -1,6 +1,9 @@
 import re
+import math
+import spacy
 import pickle
 import warnings
+import Levenshtein
 import numpy as np
 import pandas as pd
 import Levenshtein as lev
@@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
     return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()


-def calculate_distances(target, list_of_strings):
-    target_length = len(target.split())
-    distances = {}
-    for string in list_of_strings:
-        string_words = string.split()
-        # If the string has at least as many words as the target
-        if len(string_words) >= target_length:
-            for i in range(len(string_words) - target_length + 1):
-                window = " ".join(string_words[i : i + target_length])
-                distance = lev.distance(target, window)
-                # Save the distance for this window
-                distances[window] = (distance / len(target)) * 100
-        else:
-            # If the string has fewer words than the target
-            distance = lev.distance(target, string)
-            distances[string] = (distance / len(target)) * 100
-    return distances
+def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
+    target_length = len(target.split())
+    min_distances = []
+    stride_length = math.ceil(target_length * stride_fraction)
+    for string in list_of_strings:
+        all_distances = []
+        string_words = string.split()
+        if len(string_words) > target_length:
+            # Coarse pass: slide a target-sized window with a stride
+            i = 0
+            while i < len(string_words) - target_length + 1:
+                window = " ".join(string_words[i : i + target_length])
+                distance = lev.distance(target, window) / len(target)
+                if distance < threshold:
+                    # Fine pass: rescan the neighbourhood word by word
+                    for j in range(
+                        max(i - target_length, 0),
+                        min(i + target_length, len(string_words) - target_length + 1),
+                    ):
+                        detailed_window = " ".join(string_words[j : j + target_length])
+                        detailed_distance = lev.distance(target, detailed_window) / len(
+                            target
+                        )
+                        all_distances.append((detailed_window, detailed_distance * 100))
+                    i += stride_length
+                else:
+                    i += stride_length
+        else:
+            # The string is shorter than the target: compare it whole
+            dist = lev.distance(target, string) / len(target)
+            all_distances.append((string, dist * 100))
+        if all_distances:
+            min_window = min(all_distances, key=lambda x: x[1])
+            min_distances.append([min_window[0], min_window[1]])
+    return min_distances

 def replace_multiple_spaces(text):
     return re.sub(" +", " ", text)
+
+
+nlp = spacy.load("ru_core_news_sm")
+
+
+def remove_special_characters(string):
+    return re.sub(r"\W", "", string)
+
+
+def difference_type(word1, word2):
+    if word1 == word2:
+        return None  # the words match, skip them
+    if remove_special_characters(word1) == remove_special_characters(word2):
+        return "Missing special character"
+    if word1.lower() == word2.lower():
+        return "Different word capitalization"
+    if word1.isdigit() and word2.isdigit():
+        if abs(int(word1) - int(word2)) < 10:
+            return "Small numeric difference"
+        else:
+            return "Different numbers"
+    token1 = nlp(word1)[0]
+    token2 = nlp(word2)[0]
+    if token1.lemma_ == token2.lemma_:
+        if token1.pos_ != token2.pos_:
+            return "Different word forms"
+        else:
+            return "Same root but different forms"
+    if Levenshtein.distance(word1, word2) <= 2:
+        return "Possible spelling error or typo"
+    return "Different words"
+
+
+def compare_strings(str1, str2):
+    words1 = str1.split()
+    words2 = str2.split()
+    words1_only = set(words1) - set(words2)
+    words2_only = set(words2) - set(words1)
+    differences = []
+    mn_len = min(len(words1), len(words2))
+    for i in range(mn_len):
+        difference = difference_type(words1[i], words2[i])
+        differences.append((words1[i], words2[i], difference))
+    for word in words1_only:
+        differences.append((word, None, "Word only in first string"))
+    for word in words2_only:
+        differences.append((None, word, "Word only in second string"))
+    diff_types = set()
+    for diff in differences:
+        if diff[2]:
+            diff_types.add(diff[2])
+    return differences, diff_types

 def get_matches(file, target):
+    target = replace_multiple_spaces(target)
     result = []
     for i, page_layout in enumerate(tqdm(extract_pages(file))):
         _x1, _y1, _x2, _y2 = page_layout.bbox
         texts = []
         relative_coords = []
+        d = {}
         for element in page_layout:
             if isinstance(element, LTTextContainer):
                 # print(element.get_text())
                 x1, y1, x2, y2 = element.bbox
-                relative_coords.append(
-                    [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
-                )
-                texts.append(
-                    replace_multiple_spaces(element.get_text().replace("\n", ""))
-                )
+                raw = element.get_text()
+                text = replace_multiple_spaces(raw.replace("\n", " ").strip())
+                if len(text) > 3:
+                    relative_coords.append(
+                        [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
+                    )
+                    texts.append(text)
+                    d[text] = raw
         distances = calculate_distances(target, texts)
-        for window, distance in distances.items():
+        for window, distance in distances:
             if distance / len(target) < 0.2:
                 # print(i)
                 # print(window)
                 for j in range(len(texts)):
                     if window in texts[j]:
+                        raw_text = d[texts[j]]
                         rel_coord = relative_coords[j]
                         break
+                difference, diff_types = compare_strings(window, target)
                 result.append(
                     {
                         "page": i + 1,
                         "window": window,
                         "coordinates": rel_coord,
                         "distance": distance / len(target),
+                        "diff_type": list(diff_types),
+                        "raw_text": raw_text,
                     }
                 )
     return result
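
To make the new matching pipeline concrete, a small worked example with invented strings. Loading the module requires the ru_core_news_sm spaCy model, although these particular inputs are classified before the spaCy checks run:

# Coarse-then-fine window search: one [best_window, distance_percent]
# entry per input string. The first entry is an exact window match (0.0);
# the second is far above any useful threshold.
calculate_distances(
    "annual report 2023",
    ["the company annual report 2023 was published", "unrelated line"],
)

# Word-level difference classification between a window and the target.
# diff_types here includes "Small numeric difference" plus the two
# "Word only in ..." labels contributed by the set comparison.
differences, diff_types = compare_strings("annual report 2023", "annual report 2024")

# Shape of one get_matches entry after this commit (values illustrative):
example_entry = {
    "page": 3,
    "window": "annual report 2023",
    "coordinates": [0.12, 0.81, 0.33, 0.04],  # x, y, width, height as page fractions
    "distance": 0.004,
    "diff_type": ["Small numeric difference"],
    "raw_text": "annual  report 2023\n",  # raw pdfminer text, later fed to page.search_for
}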

poetry.lock (generated, 1059 lines changed)

Diff suppressed because it is too large.

View File

@@ -54,7 +54,9 @@
 levenshtein = "^0.21.1"
 pdfminer-six = "^20221105"
 pandas = "^2.0.2"
 tqdm = "^4.65.0"
-easyocr = "^1.7.0"
+pymupdf = "^1.22.5"
+spacy = "^3.5.3"
+python-levenshtein = "^0.21.1"

 [build-system]