optimised docx highlighter

2025-10-25 21:11:08 +03:00 · 2022-08-27 15:46:13 +03:00 · 2022-08-27 15:46:13 +03:00 · 522a733c35
commit 522a733c35
parent 1e6c23477e
2 changed files with 28 additions and 24 deletions
--- a/checker/services/file.py
+++ b/checker/services/file.py
@@ -25,7 +25,6 @@ def process_paragraphs(text):

 def process_word_paragraphs(text):
    text = text.split("\\r")
-    print(text)
    return _base_process(text)


--- a/checker/tasks.py
+++ b/checker/tasks.py
@@ -20,7 +20,7 @@ def process_file(pk: int):

    cut = 100
    counter = 0
-    len_c = len(paragraphs)
+    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
    for i in range(0, len(paragraphs) // cut + 1):
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
@@ -49,15 +49,14 @@ def process_word(pk: int):
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
-    print(paragraphs)

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

-    cut = 100
-    counter = 0
-    len_c = len(paragraphs)
+    cut = 150
+    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
+    counter = 0
    for i in range(0, len(paragraphs) // cut + 1):
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
        dct = {x: vals[x] for x in range(len(vals))}
@@ -83,25 +82,31 @@ def process_word(pk: int):
@shared_task
 def highlight_file(pk: int):
    c = 0
-    title = True
+    lim = 0
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)

-    for paragraph in document.paragraphs:
-        if title:
-            if (
-                paragraph.text
-                and len(paragraph.text) > 2
-                and paragraph.text[:2] == "1."
-            ):
-                title = False
-        else:
-            if paragraph.text:
-                x = requests.post(
-                    "http://109.248.175.223:5000/api", json={1: paragraph.text}
-                )
+    paragraphs = document.paragraphs
+    cut = 100
+
+    for paragraph in paragraphs:
+        if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
+            break
+        lim += 1
+    for i in range(0, len(paragraphs) // cut + 1):
+        paragraphs_sliced = paragraphs[i * cut + lim : (i + 1) * cut + lim + 1]
+        dct = {x: paragraphs_sliced[x].text for x in range(len(paragraphs_sliced))}
+        n_dct = {}
+        for el, dat in dct.items():
+            if dat:
+                n_dct[el] = dat
+        x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
+        jsn = x.json()
        if x.status_code == 200:
-                    el_id, dat = x.json()["1"]
+            for j in range(len(paragraphs_sliced)):
+                if j in n_dct:
+                    paragraph = paragraphs_sliced[j]
+                    el_id, dat = jsn[str(j)]
                    if dat < 50:
                        text = paragraph.text
                        paragraph.clear()
@@ -110,6 +115,6 @@ def highlight_file(pk: int):
                        run.add_text(text)
                        c += 1
        else:
-                    print("AI ERROR")
+            print("AI server error")
    document.save(file.file.path)
    return f"highlighted {c}, {pk}"