optimised docx highlighter

2025-07-17 19:42:26 +03:00 · 2022-08-27 15:46:13 +03:00 · 2022-08-27 15:46:13 +03:00 · 522a733c35
commit 522a733c35
parent 1e6c23477e
2 changed files with 28 additions and 24 deletions
--- a/checker/services/file.py
+++ b/checker/services/file.py
@@ -25,7 +25,6 @@ def process_paragraphs(text):
 def process_word_paragraphs(text):
    text = text.split("\\r")
    print(text)
    return _base_process(text)
--- a/checker/tasks.py
+++ b/checker/tasks.py
@@ -20,7 +20,7 @@ def process_file(pk: int):
    cut = 100
    counter = 0
-    len_c = len(paragraphs)
+    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
    for i in range(0, len(paragraphs) // cut + 1):
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
@@ -49,15 +49,14 @@ def process_word(pk: int):
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
    print(paragraphs)
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])
-    cut = 100
+    cut = 150
-    counter = 0
+    len_c = len(paragraphs) + 1
    len_c = len(paragraphs)
    paragraphs = list(paragraphs.values())
    counter = 0
    for i in range(0, len(paragraphs) // cut + 1):
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
        dct = {x: vals[x] for x in range(len(vals))}
@@ -83,25 +82,31 @@ def process_word(pk: int):
@shared_task
 def highlight_file(pk: int):
    c = 0
-    title = True
+    lim = 0
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)
-    for paragraph in document.paragraphs:
+    paragraphs = document.paragraphs
-        if title:
+    cut = 100
-            if (
+
-                paragraph.text
+    for paragraph in paragraphs:
-                and len(paragraph.text) > 2
+        if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
-                and paragraph.text[:2] == "1."
+            break
-            ):
+        lim += 1
-                title = False
+    for i in range(0, len(paragraphs) // cut + 1):
-        else:
+        paragraphs_sliced = paragraphs[i * cut + lim : (i + 1) * cut + lim + 1]
-            if paragraph.text:
+        dct = {x: paragraphs_sliced[x].text for x in range(len(paragraphs_sliced))}
-                x = requests.post(
+        n_dct = {}
-                    "http://109.248.175.223:5000/api", json={1: paragraph.text}
+        for el, dat in dct.items():
-                )
+            if dat:
                n_dct[el] = dat
        x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
        jsn = x.json()
        if x.status_code == 200:
-                    el_id, dat = x.json()["1"]
+            for j in range(len(paragraphs_sliced)):
                if j in n_dct:
                    paragraph = paragraphs_sliced[j]
                    el_id, dat = jsn[str(j)]
                    if dat < 50:
                        text = paragraph.text
                        paragraph.clear()
@@ -110,6 +115,6 @@ def highlight_file(pk: int):
                        run.add_text(text)
                        c += 1
        else:
-                    print("AI ERROR")
+            print("AI server error")
    document.save(file.file.path)
    return f"highlighted {c}, {pk}"