mirror of
https://github.com/mistakes-23/backend.git
synced 2024-11-25 01:13:43 +03:00
updated file handling
This commit is contained in:
parent
e35215f22a
commit
f588fead51
|
@ -61,7 +61,14 @@ class FullFileSerializer(FileSerializer):
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
model = File
|
model = File
|
||||||
fields = ["name", "ideal_title", "file", "images", "text_locations"]
|
fields = [
|
||||||
|
"name",
|
||||||
|
"ideal_title",
|
||||||
|
"file",
|
||||||
|
"processed_file",
|
||||||
|
"images",
|
||||||
|
"text_locations",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class UpdateFileTitleSerializer(serializers.Serializer):
|
class UpdateFileTitleSerializer(serializers.Serializer):
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
# Generated by Django 4.2.2 on 2023-06-25 02:54
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add the optional ``processed_file`` upload field to the ``file`` model."""

    dependencies = [
        (
            "processor",
            "0007_delete_task_remove_fileimage_text_file_ideal_title_and_more",
        ),
    ]

    operations = [
        migrations.AddField(
            model_name="file",
            name="processed_file",
            field=models.FileField(blank=True, null=True, upload_to="processed/"),
        ),
    ]
|
|
@ -14,6 +14,7 @@ class File(models.Model):
|
||||||
upload_to="uploads/",
|
upload_to="uploads/",
|
||||||
validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
|
validators=[FileExtensionValidator(allowed_extensions=["pdf"])],
|
||||||
)
|
)
|
||||||
|
processed_file = models.FileField(upload_to="processed/", null=True, blank=True)
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
ordering = ("-uploaded",)
|
ordering = ("-uploaded",)
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
from typing import Tuple
|
||||||
|
from io import BytesIO
|
||||||
|
import re
|
||||||
|
import fitz
|
||||||
from django.core.cache import cache
|
from django.core.cache import cache
|
||||||
from rest_framework.exceptions import NotFound
|
from rest_framework.exceptions import NotFound
|
||||||
|
|
||||||
|
@ -17,3 +21,151 @@ def get_task_status(pk: str) -> dict:
|
||||||
"error": error,
|
"error": error,
|
||||||
"error_description": error_description,
|
"error_description": error_description,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_info(input_file: str):
    """
    Collect basic information about a PDF file and print it to stdout.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        A ``(True, info)`` tuple. ``info`` always contains ``"File"`` (the
        given path) and ``"Encrypted"`` (the *string* ``"True"`` or
        ``"False"``). For documents that are not encrypted it additionally
        contains every entry of the document metadata.
    """
    # The context manager guarantees the document handle is released even if
    # reading the metadata raises.
    with fitz.open(input_file) as pdf_doc:
        # ``is_encrypted`` is the current PyMuPDF name of this property; the
        # camelCase ``isEncrypted`` spelling is a deprecated alias.
        encrypted = pdf_doc.is_encrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        # If PDF is encrypted the file metadata cannot be extracted
        if not encrypted:
            output.update(pdf_doc.metadata or {})
    # To Display File Info
    print("## File Information ##################################################")
    print("\n".join("{}:{}".format(key, value) for key, value in output.items()))
    print("######################################################################")
    return True, output
|
||||||
|
|
||||||
|
|
||||||
|
def search_for_text(lines, search_str):
    """
    Search for the search string within the document lines.

    Args:
        lines: Text (or any container supporting ``in``) to look into.
        search_str: The value to look for.

    Returns:
        ``search_str`` itself when it is contained in ``lines``, otherwise
        ``None``. Note the result is a single string, not a list of matches.
    """
    if search_str in lines:
        return search_str
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def redact_matching_data(page, matched_values):
    """
    Redact every occurrence of the matching values on a page.

    Args:
        page: Page object exposing ``search_for``, ``add_redact_annot`` and
            ``apply_redactions`` (a PyMuPDF page).
        matched_values: Iterable of strings to redact. A single string is
            accepted as well and treated as one value.

    Returns:
        The number of values processed (one per entry of ``matched_values``,
        not one per occurrence on the page).
    """
    # A bare string would otherwise be iterated character by character and
    # every single character occurrence on the page would be blacked out.
    if isinstance(matched_values, str):
        matched_values = [matched_values]

    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        # Redact matching values
        for area in page.search_for(val):
            # snake_case name: ``addRedactAnnot`` is a deprecated alias.
            page.add_redact_annot(area, text=" ", fill=(0, 0, 0))
    # Apply the redaction once, after all annotations have been placed
    page.apply_redactions()
    return matches_found
|
||||||
|
|
||||||
|
|
||||||
|
def frame_matching_data(page, matched_values):
    """
    Draw a red rectangle around every occurrence of the matching values.

    Args:
        page: Page object exposing ``search_for`` and ``add_rect_annot``
            (a PyMuPDF page).
        matched_values: Iterable of strings to frame. A single string is
            accepted as well and treated as one value.

    Returns:
        The number of values processed (one per entry of ``matched_values``,
        not one per occurrence on the page).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(matched_values, str):
        matched_values = [matched_values]

    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        for area in page.search_for(val):
            # ``fitz.Rect`` is the public name of the class; ``fitz.fitz`` is
            # an implementation module.
            if isinstance(area, fitz.Rect):
                # Draw a rectangle around matched values
                annot = page.add_rect_annot(area)
                # (1, 0, 0) is pure red as an RGB float triple.
                annot.set_colors(stroke=(1, 0, 0))
                annot.update()
    return matches_found
|
||||||
|
|
||||||
|
|
||||||
|
def highlight_matching_data(page, matched_values, type):
    """
    Mark every occurrence of the matching values with a text-marker annotation.

    Args:
        page: Page object exposing ``search_for`` and the ``add_*_annot``
            text-marker methods (a PyMuPDF page).
        matched_values: Iterable of strings to mark. A single string is
            accepted as well and treated as one value.
        type: One of ``"Highlight"``, ``"Squiggly"``, ``"Underline"`` or
            ``"Strikeout"``; any other value falls back to highlighting.

    Returns:
        The number of values processed (one per entry of ``matched_values``,
        not one per occurrence on the page).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(matched_values, str):
        matched_values = [matched_values]

    method_names = {
        "Highlight": "add_highlight_annot",
        "Squiggly": "add_squiggly_annot",
        "Underline": "add_underline_annot",
        "Strikeout": "add_strikeout_annot",
    }
    add_annot = getattr(page, method_names.get(type, "add_highlight_annot"))

    matches_found = 0
    # Loop throughout matching values
    for val in matched_values:
        matches_found += 1
        matching_val_area = page.search_for(val)
        # Nothing located on this page: there is nothing to annotate, and
        # building a marker from an empty hit list is not meaningful.
        if not matching_val_area:
            continue
        highlight = add_annot(matching_val_area)
        highlight.update()
    return matches_found
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(
    input_file: str,
    search_str: str,
    pages: Tuple = None,
    action: str = "Highlight",
):
    """
    Process the pages of the PDF File

    Looks for ``search_str`` on every selected page, applies ``action`` to
    the pages that contain it, and overwrites ``input_file`` with the result.

    Args:
        input_file: Path of the PDF; it is modified in place.
        search_str: Text to look for.
        pages: Optional collection of zero-based page numbers to restrict
            processing to. Entries may be strings (as before) or integers.
            ``None`` or an empty collection means all pages.
        action: ``"Redact"``, ``"Frame"``, ``"Highlight"``, ``"Squiggly"``,
            ``"Underline"`` or ``"Strikeout"``. Unknown values highlight.
    """
    highlight_actions = ("Highlight", "Squiggly", "Underline", "Strikeout")
    # Save the generated PDF to memory buffer
    output_buffer = BytesIO()
    total_matches = 0
    # Open the PDF
    pdf_doc = fitz.open(input_file)
    try:
        # Iterate through pages
        for pg in range(len(pdf_doc)):
            # If required for specific pages
            if pages:
                listed = str(pg) in pages or (
                    not isinstance(pages, str) and pg in pages
                )
                if not listed:
                    continue
            # Select the page
            page = pdf_doc[pg]
            page_text = page.get_text("text")
            match = search_for_text(page_text, search_str)
            if not match:
                continue
            # search_for_text returns the bare string; the helpers iterate
            # over their argument, so wrap it to avoid a per-character loop.
            matched_values = [match] if isinstance(match, str) else match
            if action == "Redact":
                matches_found = redact_matching_data(page, matched_values)
            elif action == "Frame":
                matches_found = frame_matching_data(page, matched_values)
            else:
                marker = action if action in highlight_actions else "Highlight"
                matches_found = highlight_matching_data(page, matched_values, marker)
            total_matches += matches_found
        print(
            f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}"
        )
        # Save to output
        pdf_doc.save(output_buffer)
    finally:
        # Release the document even when searching or saving fails.
        pdf_doc.close()
    # Save the output buffer to the output file (only reached on success, so
    # a failure above never truncates the original file)
    with open(input_file, mode="wb") as f:
        f.write(output_buffer.getbuffer())
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import shutil
|
import shutil
|
||||||
|
from io import BytesIO
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
|
import fitz
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from django.core.files import File
|
from django.core.files import File
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
|
@ -26,8 +28,6 @@ def process_pdf(pk: str):
|
||||||
cache.set(f"{pk}-features_loaded", False)
|
cache.set(f"{pk}-features_loaded", False)
|
||||||
cache.set(f"{pk}-processed", 1)
|
cache.set(f"{pk}-processed", 1)
|
||||||
extract_pdf_features.apply_async(kwargs={"pk": pk})
|
extract_pdf_features.apply_async(kwargs={"pk": pk})
|
||||||
split_pdf_into_images.apply_async(kwargs={"pk": pk})
|
|
||||||
load_pdf.apply_async(kwargs={"pk": pk})
|
|
||||||
return pk
|
return pk
|
||||||
|
|
||||||
|
|
||||||
|
@ -46,8 +46,23 @@ def extract_pdf_features(pk: str):
|
||||||
text_locations = get_matches(file.file.path, target)
|
text_locations = get_matches(file.file.path, target)
|
||||||
file.ideal_title = target
|
file.ideal_title = target
|
||||||
file.text_locations = text_locations
|
file.text_locations = text_locations
|
||||||
|
|
||||||
|
pdfDoc = fitz.open(file.file.path)
|
||||||
|
for loc in text_locations:
|
||||||
|
page = pdfDoc[loc["page"] - 1]
|
||||||
|
matching_val_area = page.search_for(loc["raw_text"])
|
||||||
|
for rect in matching_val_area:
|
||||||
|
page.add_highlight_annot(rect)
|
||||||
|
output_buffer = BytesIO()
|
||||||
|
pdfDoc.close()
|
||||||
|
with open(file.file.path, mode="wb") as f:
|
||||||
|
f.write(output_buffer.getbuffer())
|
||||||
|
|
||||||
file.save()
|
file.save()
|
||||||
cache.set(f"{pk}-features_loaded", True)
|
cache.set(f"{pk}-features_loaded", True)
|
||||||
|
split_pdf_into_images.apply_async(kwargs={"pk": pk})
|
||||||
|
load_pdf.apply_async(kwargs={"pk": pk})
|
||||||
|
# create_processed_pdf.apply_async(kwargs={"pk": pk})
|
||||||
return pk
|
return pk
|
||||||
|
|
||||||
|
|
||||||
|
@ -70,6 +85,18 @@ def update_pdf_features(pk: str, target: str):
|
||||||
return pk
|
return pk
|
||||||
|
|
||||||
|
|
||||||
|
# @shared_task
|
||||||
|
# def create_processed_pdf(pk: str):
|
||||||
|
# file = FileModel.objects.get(pk=pk)
|
||||||
|
# f_path = "processed_" + file.file.path.split("/")[-1]
|
||||||
|
# shutil.copy(file.file.path, f_path)
|
||||||
|
#
|
||||||
|
# for loc in file.text_locations:
|
||||||
|
# highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
|
||||||
|
#
|
||||||
|
# os.remove(f_path)
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
def split_pdf_into_images(pk: str):
|
def split_pdf_into_images(pk: str):
|
||||||
file = FileModel.objects.get(pk=pk)
|
file = FileModel.objects.get(pk=pk)
|
||||||
|
|
127
ml/main.py
127
ml/main.py
|
@ -1,6 +1,9 @@
|
||||||
import re
|
import re
|
||||||
|
import math
|
||||||
|
import spacy
|
||||||
import pickle
|
import pickle
|
||||||
import warnings
|
import warnings
|
||||||
|
import Levenshtein
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import Levenshtein as lev
|
import Levenshtein as lev
|
||||||
|
@ -128,66 +131,156 @@ def inference_models(checkpoint_name, test_df):
|
||||||
return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()
|
return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()
|
||||||
|
|
||||||
|
|
||||||
def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
    """
    Find, for each string, the word window that is closest to ``target``.

    Strings with more words than ``target`` are scanned with a sliding window
    of ``target``'s word count, advancing by a coarse stride. Whenever a
    coarse window is closer than ``threshold``, all windows in its
    neighbourhood are scored in detail. Shorter (or equally long) strings are
    compared as a whole.

    Args:
        target: The reference text.
        list_of_strings: Candidate texts to scan.
        stride_fraction: Fraction of the target's word count used as the
            coarse step; the step is never smaller than one word.
        threshold: Normalised Levenshtein distance (distance divided by
            ``len(target)``) below which a detailed scan is triggered.

    Returns:
        A list of ``[window, distance_percent]`` pairs, at most one per input
        string: the best-scoring window and its normalised distance times 100.
        Long strings in which no coarse window passes ``threshold`` yield no
        entry. A blank ``target`` yields an empty list.
    """
    target_length = len(target.split())
    # A blank target has no words to align (and would divide by zero or spin
    # forever below), so there is nothing to report.
    if target_length == 0:
        return []

    target_chars = len(target)
    # Guarantee forward progress even for a zero or negative stride_fraction.
    stride_length = max(1, math.ceil(target_length * stride_fraction))
    min_distances = []

    for string in list_of_strings:
        all_distances = []
        string_words = string.split()

        if len(string_words) > target_length:
            last_start = len(string_words) - target_length
            i = 0
            while i <= last_start:
                window = " ".join(string_words[i : i + target_length])
                distance = lev.distance(target, window) / target_chars
                if distance < threshold:
                    # Promising region: score every window around it.
                    for j in range(
                        max(i - target_length, 0),
                        min(i + target_length, last_start + 1),
                    ):
                        detailed_window = " ".join(string_words[j : j + target_length])
                        detailed_distance = (
                            lev.distance(target, detailed_window) / target_chars
                        )
                        all_distances.append((detailed_window, detailed_distance * 100))
                i += stride_length
        else:
            dist = lev.distance(target, string) / target_chars
            all_distances.append((string, dist * 100))

        if all_distances:
            min_window = min(all_distances, key=lambda x: x[1])
            min_distances.append([min_window[0], min_window[1]])

    return min_distances
|
||||||
|
|
||||||
|
|
||||||
def replace_multiple_spaces(text):
|
def replace_multiple_spaces(text):
|
||||||
return re.sub(" +", " ", text)
|
return re.sub(" +", " ", text)
|
||||||
|
|
||||||
|
|
||||||
|
nlp = spacy.load("ru_core_news_sm")
|
||||||
|
|
||||||
|
|
||||||
|
def remove_special_characters(string):
    """
    Return ``string`` with every non-word character removed.

    "Word characters" are those matched by the regex class ``\\w`` (letters,
    digits and the underscore), so punctuation and whitespace are dropped.
    """
    return re.sub(r"\W", "", string)
|
||||||
|
|
||||||
|
|
||||||
|
def difference_type(word1, word2):
    """
    Classify how two words differ.

    The checks run from the most specific to the most general; the first one
    that applies determines the result.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``None`` when the words are identical, otherwise one of the
        (Russian) category labels returned below.
    """
    # NOTE(review): the labels are returned to callers verbatim, so their
    # wording is left untouched here even where it looks misspelled.
    if word1 == word2:
        return None  # identical words, nothing to report

    if remove_special_characters(word1) == remove_special_characters(word2):
        return "Пропущен специцальный символ"

    if word1.lower() == word2.lower():
        return "Разная капитуляция слов"

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscripts that int() cannot parse.
    if word1.isdecimal() and word2.isdecimal():
        if abs(int(word1) - int(word2)) < 10:
            return "Небольшое числовое различие"
        return "Разные числа"

    doc1 = nlp(word1)
    doc2 = nlp(word2)
    # Only the first token of each word is compared; skip the check when the
    # pipeline produced no tokens at all.
    if len(doc1) and len(doc2):
        token1, token2 = doc1[0], doc2[0]
        if token1.lemma_ == token2.lemma_:
            if token1.pos_ != token2.pos_:
                return "Разные формы слова"
            return "Одинаковый корень, но разные формы"

    if lev.distance(word1, word2) <= 2:
        return "Возможная орфографическая ошибка или опечатка"
    return "Разные слова"
|
||||||
|
|
||||||
|
|
||||||
|
def compare_strings(str1, str2):
    """
    Compare two strings word by word.

    Words are first compared position by position (up to the length of the
    shorter string) via ``difference_type``. Afterwards every word that
    occurs in only one of the strings is reported once.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A ``(differences, diff_types)`` tuple. ``differences`` is a list of
        ``(word_from_str1, word_from_str2, label)`` triples, where a missing
        side is ``None`` and ``label`` is ``None`` for identical words.
        ``diff_types`` is the set of all non-empty labels.
    """
    words1 = str1.split()
    words2 = str2.split()
    vocabulary1 = set(words1)
    vocabulary2 = set(words2)

    # zip stops at the shorter list, i.e. positional comparison only.
    differences = [
        (first, second, difference_type(first, second))
        for first, second in zip(words1, words2)
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (iterating a set of strings is not).
    differences.extend(
        (word, None, "Word only in first string")
        for word in dict.fromkeys(words1)
        if word not in vocabulary2
    )
    differences.extend(
        (None, word, "Word only in second string")
        for word in dict.fromkeys(words2)
        if word not in vocabulary1
    )

    diff_types = {label for _, _, label in differences if label}
    return differences, diff_types
|
||||||
|
|
||||||
|
|
||||||
def get_matches(file, target):
    """
    Locate text blocks in a PDF that closely match ``target``.

    Every page is scanned for text containers; their whitespace-normalised
    text is scored against ``target`` with ``calculate_distances`` and
    sufficiently close windows are reported.

    Args:
        file: PDF path or file object, passed on to ``extract_pages``.
        target: The text to look for.

    Returns:
        A list of dicts, one per match, with the keys ``"page"`` (1-based),
        ``"window"`` (matched words), ``"coordinates"``
        (``[x, y, width, height]`` relative to the page's bounding box),
        ``"distance"``, ``"diff_type"`` (list of labels from
        ``compare_strings``) and ``"raw_text"`` (the unmodified text of the
        containing block).
    """
    target = replace_multiple_spaces(target)
    result = []
    # A blank target cannot be matched and would divide by zero below.
    if not target.strip():
        return result

    for i, page_layout in enumerate(tqdm(extract_pages(file))):
        _x1, _y1, _x2, _y2 = page_layout.bbox
        texts = []
        relative_coords = []
        raw_by_text = {}
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue
            x1, y1, x2, y2 = element.bbox
            raw = element.get_text()
            text = replace_multiple_spaces(raw.replace("\n", " ").strip())
            # Very short fragments are ignored.
            if len(text) > 3:
                relative_coords.append(
                    [x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
                )
                texts.append(text)
                raw_by_text[text] = raw

        distances = calculate_distances(target, texts)

        for window, distance in distances:
            # NOTE(review): ``distance`` is already normalised by
            # calculate_distances; the extra division is kept as-is because
            # the 0.2 cut-off was presumably tuned against it — confirm.
            if distance / len(target) >= 0.2:
                continue
            for text, rel_coord in zip(texts, relative_coords):
                if window in text:
                    raw_text = raw_by_text[text]
                    break
            else:
                # No block contains the window verbatim (e.g. unusual
                # whitespace); skip instead of reusing a previous block's
                # coordinates or failing on an unbound name.
                continue
            _, diff_types = compare_strings(window, target)
            result.append(
                {
                    "page": i + 1,
                    "window": window,
                    "coordinates": rel_coord,
                    "distance": distance / len(target),
                    "diff_type": list(diff_types),
                    "raw_text": raw_text,
                }
            )
    return result
|
||||||
|
|
1059
poetry.lock
generated
1059
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
|
@ -54,7 +54,9 @@ levenshtein = "^0.21.1"
|
||||||
pdfminer-six = "^20221105"
|
pdfminer-six = "^20221105"
|
||||||
pandas = "^2.0.2"
|
pandas = "^2.0.2"
|
||||||
tqdm = "^4.65.0"
|
tqdm = "^4.65.0"
|
||||||
easyocr = "^1.7.0"
|
pymupdf = "^1.22.5"
|
||||||
|
spacy = "^3.5.3"
|
||||||
|
python-levenshtein = "^0.21.1"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user