optimised docx highlighter

This commit is contained in:
Alexander Karpov 2022-08-27 15:46:13 +03:00
parent 1e6c23477e
commit 522a733c35
2 changed files with 28 additions and 24 deletions

View File

@@ -25,7 +25,6 @@ def process_paragraphs(text):
def process_word_paragraphs(text):
text = text.split("\\r")
print(text)
return _base_process(text)

View File

@@ -20,7 +20,7 @@ def process_file(pk: int):
cut = 100
counter = 0
len_c = len(paragraphs)
len_c = len(paragraphs) + 1
paragraphs = list(paragraphs.values())
for i in range(0, len(paragraphs) // cut + 1):
vals = paragraphs[i * cut : (i + 1) * cut + 1]
@@ -49,15 +49,14 @@ def process_word(pk: int):
file = WordDocx.objects.get(pk=pk)
uuid = file.uuid
paragraphs = process_word_paragraphs(file.text.tobytes().decode())
print(paragraphs)
file.paragraphs_loaded = len(paragraphs)
file.save(update_fields=["paragraphs_loaded"])
cut = 100
counter = 0
len_c = len(paragraphs)
cut = 150
len_c = len(paragraphs) + 1
paragraphs = list(paragraphs.values())
counter = 0
for i in range(0, len(paragraphs) // cut + 1):
vals = paragraphs[i * cut : (i + 1) * cut + 1]
dct = {x: vals[x] for x in range(len(vals))}
@@ -83,25 +82,31 @@ def process_word(pk: int):
@shared_task
def highlight_file(pk: int):
c = 0
title = True
lim = 0
file = Docx.objects.get(pk=pk)
document = Document(file.file.path)
for paragraph in document.paragraphs:
if title:
if (
paragraph.text
and len(paragraph.text) > 2
and paragraph.text[:2] == "1."
):
title = False
else:
if paragraph.text:
x = requests.post(
"http://109.248.175.223:5000/api", json={1: paragraph.text}
)
paragraphs = document.paragraphs
cut = 100
for paragraph in paragraphs:
if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
break
lim += 1
for i in range(0, len(paragraphs) // cut + 1):
paragraphs_sliced = paragraphs[i * cut + lim : (i + 1) * cut + lim + 1]
dct = {x: paragraphs_sliced[x].text for x in range(len(paragraphs_sliced))}
n_dct = {}
for el, dat in dct.items():
if dat:
n_dct[el] = dat
x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
jsn = x.json()
if x.status_code == 200:
el_id, dat = x.json()["1"]
for j in range(len(paragraphs_sliced)):
if j in n_dct:
paragraph = paragraphs_sliced[j]
el_id, dat = jsn[str(j)]
if dat < 50:
text = paragraph.text
paragraph.clear()
@@ -110,6 +115,6 @@ def highlight_file(pk: int):
run.add_text(text)
c += 1
else:
print("AI ERROR")
print("AI server error")
document.save(file.file.path)
return f"highlighted {c}, {pk}"