mirror of
https://github.com/mistakes-23/backend.git
synced 2024-11-10 18:26:35 +03:00
320 lines
10 KiB
Python
320 lines
10 KiB
Python
import re
|
|
import math
|
|
import spacy
|
|
import pickle
|
|
import warnings
|
|
import Levenshtein
|
|
import numpy as np
|
|
import pandas as pd
|
|
import Levenshtein as lev
|
|
|
|
from catboost import Pool
|
|
from pdfminer.high_level import extract_pages
|
|
from tqdm import tqdm
|
|
from pdfminer.layout import LTTextContainer, LTChar
|
|
|
|
|
|
warnings.filterwarnings("ignore")
|
|
|
|
|
|
def extract_test_features(file):
|
|
texts = []
|
|
fonts = []
|
|
squares = []
|
|
ids = []
|
|
coords = []
|
|
relative_coords = []
|
|
for page_layout in extract_pages(file):
|
|
_x1, _y1, _x2, _y2 = page_layout.bbox
|
|
for i, element in enumerate(page_layout):
|
|
if isinstance(element, LTTextContainer):
|
|
text = element.get_text().replace("\n", "")
|
|
|
|
if "(cid:" in text:
|
|
return "Неправильная кодировка файла", False
|
|
|
|
if text.split() != [] and len(text) > 4:
|
|
texts.append(text)
|
|
|
|
end = False
|
|
for text_line in element:
|
|
if end:
|
|
break
|
|
for character in text_line:
|
|
if isinstance(character, LTChar):
|
|
if "bold" in character.fontname.lower():
|
|
fonts.append(1)
|
|
elif "italic" in character.fontname.lower():
|
|
fonts.append(2)
|
|
else:
|
|
fonts.append(0)
|
|
end = True
|
|
break
|
|
|
|
x1, y1, x2, y2 = element.bbox
|
|
coords.append([x1, y1, x2, y2])
|
|
relative_coords.append(
|
|
[x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2]
|
|
)
|
|
|
|
squares.append((int(x2) - int(x1)) * (int(y2) - int(y1)))
|
|
|
|
match = re.search(r"LTTextBoxHorizontal\((\d+)\)", str(element))
|
|
if match:
|
|
id = int(match.group(1))
|
|
ids.append(id)
|
|
break
|
|
|
|
if not texts:
|
|
return "Файл состоит из сканов", False
|
|
if len(texts) < 3:
|
|
return "Главная страница состоит из сканов", False
|
|
if len(texts) > 25:
|
|
return "Произошла ошибка", False
|
|
|
|
test_df = pd.DataFrame(
|
|
{
|
|
"text": texts,
|
|
"font": fonts,
|
|
"file": file,
|
|
"squares": squares,
|
|
"ids": ids,
|
|
"coords": coords,
|
|
"relative_coords": relative_coords,
|
|
}
|
|
)
|
|
return test_df, True
|
|
|
|
|
|
def create_test_features(df):
|
|
df["len_of_text"] = df["text"].apply(len)
|
|
# df['len_of_text'] = df['text'].apply(lambda x: len(x.split()))
|
|
|
|
df["rank"] = (
|
|
df.groupby("file")["len_of_text"]
|
|
.rank(ascending=False, method="min")
|
|
.astype(int)
|
|
)
|
|
df["rank_squares"] = (
|
|
df.groupby("file")["squares"].rank(ascending=False, method="min").astype(int)
|
|
)
|
|
df["font"] = df["font"].astype(
|
|
object
|
|
) # Convert boolean to int for computation, True will be 1 and False will be 0
|
|
df["bold"] = (df["font"] == 1).astype(int)
|
|
df["bold_percentage"] = (
|
|
df.groupby("file")["font"].transform(lambda x: x.mean() * 100).astype(int)
|
|
)
|
|
df["id_percentage"] = (
|
|
df.groupby("file")["ids"].transform(lambda x: (x / x.max()) * 100).astype(int)
|
|
)
|
|
|
|
return df
|
|
|
|
|
|
def inference_models(checkpoint_name, test_df):
|
|
columns_to_use = [
|
|
"font",
|
|
"rank",
|
|
"rank_squares",
|
|
"bold_percentage",
|
|
"id_percentage",
|
|
]
|
|
with open(checkpoint_name, "rb") as f:
|
|
models = pickle.load(f)
|
|
|
|
test_pool = Pool(data=test_df[columns_to_use])
|
|
preds = []
|
|
for model in models:
|
|
preds.append(model.predict_proba(test_pool)[:, 1])
|
|
test_df["pred"] = np.mean(preds, axis=0)
|
|
return test_df, test_df.loc[test_df["pred"].idxmax(), "text"].strip()
|
|
|
|
|
|
def calculate_distances(target, list_of_strings, stride_fraction=1 / 4, threshold=0.3):
|
|
target_length = len(target.split())
|
|
min_distances = []
|
|
|
|
stride_length = math.ceil(target_length * stride_fraction)
|
|
|
|
for string in list_of_strings:
|
|
all_distances = []
|
|
string_words = string.split()
|
|
|
|
if len(string_words) > target_length:
|
|
i = 0
|
|
while i < len(string_words) - target_length + 1:
|
|
window = " ".join(string_words[i : i + target_length])
|
|
|
|
distance = lev.distance(target, window) / len(target)
|
|
if distance < threshold:
|
|
for j in range(
|
|
max(i - target_length, 0),
|
|
min(i + target_length, len(string_words) - target_length + 1),
|
|
):
|
|
detailed_window = " ".join(string_words[j : j + target_length])
|
|
detailed_distance = lev.distance(target, detailed_window) / len(
|
|
target
|
|
)
|
|
|
|
all_distances.append((detailed_window, detailed_distance * 100))
|
|
i += stride_length
|
|
else:
|
|
i += stride_length
|
|
else:
|
|
dist = lev.distance(target, string) / len(target)
|
|
all_distances.append((string, dist * 100))
|
|
|
|
if all_distances:
|
|
min_window = min(all_distances, key=lambda x: x[1])
|
|
min_distances.append([min_window[0], min_window[1]])
|
|
|
|
return min_distances
|
|
|
|
|
|
def replace_multiple_spaces(text):
|
|
return re.sub(" +", " ", text)
|
|
|
|
|
|
nlp = spacy.load("ru_core_news_sm")
|
|
|
|
|
|
def remove_special_characters(string):
|
|
return re.sub(r"\W", "", string)
|
|
|
|
|
|
def difference_type(word1, word2):
|
|
if word1 == word2:
|
|
return None # слова совпадают, пропускаем их
|
|
|
|
if remove_special_characters(word1) == remove_special_characters(word2):
|
|
return "Пропущен специцальный символ"
|
|
|
|
if word1.lower() == word2.lower():
|
|
return "Разная капитуляция слов"
|
|
|
|
if word1.isdigit() and word2.isdigit():
|
|
if abs(int(word1) - int(word2)) < 10:
|
|
return "Небольшое числовое различие"
|
|
else:
|
|
return "Разные числа"
|
|
|
|
token1 = nlp(word1)[0]
|
|
token2 = nlp(word2)[0]
|
|
if token1.lemma_ == token2.lemma_:
|
|
if token1.pos_ != token2.pos_:
|
|
return "Разные формы слова"
|
|
else:
|
|
return "Одинаковый корень, но разные формы"
|
|
|
|
if Levenshtein.distance(word1, word2) <= 2:
|
|
return "Возможная орфографическая ошибка или опечатка"
|
|
return "Разные слова"
|
|
|
|
|
|
def compare_strings(str1, str2):
|
|
words1 = str1.split()
|
|
words2 = str2.split()
|
|
|
|
words1_only = set(words1) - set(words2)
|
|
words2_only = set(words2) - set(words1)
|
|
|
|
differences = []
|
|
mn_len = min(len(words1), len(words2))
|
|
for i in range(mn_len):
|
|
difference = difference_type(words1[i], words2[i])
|
|
differences.append((words1[i], words2[i], difference))
|
|
|
|
for word in words1_only:
|
|
differences.append((word, None, "Word only in first string"))
|
|
|
|
for word in words2_only:
|
|
differences.append((None, word, "Word only in second string"))
|
|
|
|
diff_types = set()
|
|
for diff in differences:
|
|
if diff[2]:
|
|
diff_types.add(diff[2])
|
|
|
|
return differences, diff_types
|
|
|
|
|
|
def get_matches(file, target):
|
|
target = replace_multiple_spaces(target)
|
|
|
|
result = []
|
|
for i, page_layout in enumerate(tqdm(extract_pages(file))):
|
|
_x1, _y1, _x2, _y2 = page_layout.bbox
|
|
texts = []
|
|
relative_coords = []
|
|
d = {}
|
|
for element in page_layout:
|
|
if isinstance(element, LTTextContainer):
|
|
# print(element.get_text())
|
|
x1, y1, x2, y2 = element.bbox
|
|
raw = element.get_text()
|
|
text = replace_multiple_spaces(raw.replace("\n", " ").strip())
|
|
if len(text) > 3:
|
|
relative_coords.append(
|
|
([x1 / _x2, y1 / _y2, (x2 - x1) / _x2, (y2 - y1) / _y2])
|
|
)
|
|
texts.append(text)
|
|
d[text] = raw
|
|
|
|
distances = calculate_distances(target, texts)
|
|
|
|
for window, distance in distances:
|
|
if distance / len(target) < 0.2:
|
|
# print(i)
|
|
# print(window)
|
|
for j in range(len(texts)):
|
|
if window in texts[j]:
|
|
raw_text = d[texts[j]]
|
|
rel_coord = relative_coords[j]
|
|
break
|
|
difference, diff_types = compare_strings(window, target)
|
|
result.append(
|
|
{
|
|
"page": i + 1,
|
|
"window": window,
|
|
"coordinates": rel_coord,
|
|
"distance": distance / len(target),
|
|
"diff_type": list(diff_types),
|
|
"raw_text": raw_text,
|
|
}
|
|
)
|
|
return result
|
|
|
|
|
|
# if __name__ == "__main__":
|
|
# file = "some.pdf"
|
|
# columns_to_use = [
|
|
# "font",
|
|
# "rank",
|
|
# "rank_squares",
|
|
# "bold_percentage",
|
|
# "id_percentage",
|
|
# ]
|
|
# checkpoint_name = "checkpoints/models.pkl"
|
|
#
|
|
# test_df, result = extract_test_features(file)
|
|
#
|
|
# if isinstance(test_df, pd.DataFrame):
|
|
# test_df = create_test_features(test_df)
|
|
# else:
|
|
# print(result)
|
|
#
|
|
# _, target = inference_models(checkpoint_name, test_df, columns_to_use)
|
|
#
|
|
# result = []
|
|
# for page_layout in tqdm(extract_pages(file)):
|
|
# texts = []
|
|
# for element in page_layout:
|
|
# if isinstance(element, LTTextContainer):
|
|
# texts.append(element.get_text().replace("\n", ""))
|
|
# distances = calculate_distances(target, texts)
|
|
#
|
|
# for window, distance in distances.items():
|
|
# if distance < 20:
|
|
# result.append(window)
|