optimised docx highlighter

This commit is contained in:
Alexander Karpov 2022-08-27 15:46:13 +03:00
parent 1e6c23477e
commit 522a733c35
2 changed files with 28 additions and 24 deletions

View File

@ -25,7 +25,6 @@ def process_paragraphs(text):
def process_word_paragraphs(text): def process_word_paragraphs(text):
text = text.split("\\r") text = text.split("\\r")
print(text)
return _base_process(text) return _base_process(text)

View File

@ -20,7 +20,7 @@ def process_file(pk: int):
cut = 100 cut = 100
counter = 0 counter = 0
len_c = len(paragraphs) len_c = len(paragraphs) + 1
paragraphs = list(paragraphs.values()) paragraphs = list(paragraphs.values())
for i in range(0, len(paragraphs) // cut + 1): for i in range(0, len(paragraphs) // cut + 1):
vals = paragraphs[i * cut : (i + 1) * cut + 1] vals = paragraphs[i * cut : (i + 1) * cut + 1]
@ -49,15 +49,14 @@ def process_word(pk: int):
file = WordDocx.objects.get(pk=pk) file = WordDocx.objects.get(pk=pk)
uuid = file.uuid uuid = file.uuid
paragraphs = process_word_paragraphs(file.text.tobytes().decode()) paragraphs = process_word_paragraphs(file.text.tobytes().decode())
print(paragraphs)
file.paragraphs_loaded = len(paragraphs) file.paragraphs_loaded = len(paragraphs)
file.save(update_fields=["paragraphs_loaded"]) file.save(update_fields=["paragraphs_loaded"])
cut = 100 cut = 150
counter = 0 len_c = len(paragraphs) + 1
len_c = len(paragraphs)
paragraphs = list(paragraphs.values()) paragraphs = list(paragraphs.values())
counter = 0
for i in range(0, len(paragraphs) // cut + 1): for i in range(0, len(paragraphs) // cut + 1):
vals = paragraphs[i * cut : (i + 1) * cut + 1] vals = paragraphs[i * cut : (i + 1) * cut + 1]
dct = {x: vals[x] for x in range(len(vals))} dct = {x: vals[x] for x in range(len(vals))}
@ -83,25 +82,31 @@ def process_word(pk: int):
@shared_task @shared_task
def highlight_file(pk: int): def highlight_file(pk: int):
c = 0 c = 0
title = True lim = 0
file = Docx.objects.get(pk=pk) file = Docx.objects.get(pk=pk)
document = Document(file.file.path) document = Document(file.file.path)
for paragraph in document.paragraphs: paragraphs = document.paragraphs
if title: cut = 100
if (
paragraph.text for paragraph in paragraphs:
and len(paragraph.text) > 2 if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
and paragraph.text[:2] == "1." break
): lim += 1
title = False for i in range(0, len(paragraphs) // cut + 1):
else: paragraphs_sliced = paragraphs[i * cut + lim : (i + 1) * cut + lim + 1]
if paragraph.text: dct = {x: paragraphs_sliced[x].text for x in range(len(paragraphs_sliced))}
x = requests.post( n_dct = {}
"http://109.248.175.223:5000/api", json={1: paragraph.text} for el, dat in dct.items():
) if dat:
n_dct[el] = dat
x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
jsn = x.json()
if x.status_code == 200: if x.status_code == 200:
el_id, dat = x.json()["1"] for j in range(len(paragraphs_sliced)):
if j in n_dct:
paragraph = paragraphs_sliced[j]
el_id, dat = jsn[str(j)]
if dat < 50: if dat < 50:
text = paragraph.text text = paragraph.text
paragraph.clear() paragraph.clear()
@ -110,6 +115,6 @@ def highlight_file(pk: int):
run.add_text(text) run.add_text(text)
c += 1 c += 1
else: else:
print("AI ERROR") print("AI server error")
document.save(file.file.path) document.save(file.file.path)
return f"highlighted {c}, {pk}" return f"highlighted {c}, {pk}"