Optimised file processing, added doc, odt -> docx converter, minor changes

2025-09-17 01:02:30 +03:00 · 2022-08-28 12:37:28 +03:00 · 2022-08-28 12:37:28 +03:00 · 6cffc965a8
commit 6cffc965a8
parent 2bf771e3a2
3 changed files with 121 additions and 56 deletions
--- a/app/checker/services/file.py
+++ b/app/checker/services/file.py
@ -1,4 +1,7 @@
 import os
+import re
+import convertapi
+

 from checker.services.generators import generate_charset

@ -28,5 +31,31 @@ def process_word_paragraphs(text):
    return _base_process(text)


+def doc_to_docx(file_path):
+    """Convert a ``.doc`` file to ``.docx`` through the ConvertAPI service.
+
+    The converted file is saved alongside the source file, under the same
+    name with its extension replaced by ``.docx``.
+
+    Args:
+        file_path: Path of the ``.doc`` file to convert.
+
+    Returns:
+        Path of the saved ``.docx`` file.
+    """
+    # NOTE(review): the fallback literal is a credential committed to the
+    # repository. Supply CONVERT_API_SECRET through the environment instead,
+    # then rotate this key and drop the fallback.
+    convertapi.api_secret = os.environ.get('CONVERT_API_SECRET', '0fp22XFRPwKmNJql')
+    # splitext strips only the final extension, so a dot elsewhere in the
+    # path (for example in a directory name) cannot truncate the result the
+    # way file_path.split(".")[0] did.
+    docx_path = os.path.splitext(file_path)[0] + ".docx"
+    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
+    result.file.save(docx_path)
+    return docx_path
+
+
+def doc_to_odt(file_path):
+    """Convert an ``.odt`` file to ``.docx`` through the ConvertAPI service.
+
+    Despite its name, this function converts *from* ODT *to* DOCX (the
+    request uses ``from_format='odt'`` and a ``'docx'`` target). The
+    converted file is saved alongside the source file, under the same name
+    with its extension replaced by ``.docx``.
+
+    Args:
+        file_path: Path of the ``.odt`` file to convert.
+
+    Returns:
+        Path of the saved ``.docx`` file.
+    """
+    # NOTE(review): the fallback literal is a credential committed to the
+    # repository. Supply CONVERT_API_SECRET through the environment instead,
+    # then rotate this key and drop the fallback.
+    convertapi.api_secret = os.environ.get('CONVERT_API_SECRET', '0fp22XFRPwKmNJql')
+    # splitext strips only the final extension, so a dot elsewhere in the
+    # path (for example in a directory name) cannot truncate the result the
+    # way file_path.split(".")[0] did.
+    docx_path = os.path.splitext(file_path)[0] + ".docx"
+    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
+    result.file.save(docx_path)
+    return docx_path
+
+
 def media_upload_path(instance, filename):
    """Return ``uploads/<generate_charset(7)>/<filename>``; ``instance`` is not used."""
    upload_dir = f"uploads/{generate_charset(7)}/"
    return os.path.join(upload_dir, filename)
+
+
+def split_text(text):
+    """Extract the fragments wrapped in matching ``{N}...{N}`` tags.
+
+    Newlines are removed first, then every ``{a}body{b}`` occurrence is
+    examined in order. A fragment is kept when both tags are equal and the
+    tag parses as an integer; any other match is printed and skipped.
+
+    Args:
+        text: Raw document text.
+
+    Returns:
+        A ``(texts, groups)`` tuple of parallel lists: the fragment bodies
+        and their integer tag numbers.
+    """
+    texts, groups = [], []
+    for match in re.findall(r"{(.*?)}(.*?){(.*?)}", text.replace('\n', '')):
+        opening, body, closing = match
+        if opening != closing:
+            print(match)
+            continue
+        try:
+            group = int(opening)
+        except ValueError:
+            # A matching but non-numeric tag (e.g. {a}...{a}) is reported and
+            # skipped like a mismatched pair instead of aborting the caller.
+            print(match)
+            continue
+        texts.append(body)
+        groups.append(group)
+    return texts, groups
--- a/app/checker/signals.py
+++ b/app/checker/signals.py
@ -1,15 +1,31 @@
+import magic
+
 from django.db.models.signals import post_save
+from django.core.files import File
 from django.dispatch import receiver
+from celery import chain

 from checker.models import Docx, WordDocx
+from checker.services.file import doc_to_docx, doc_to_odt
 from checker.tasks import process_file, process_word, highlight_file


@receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
+    """Convert a newly created upload to ``.docx`` if needed, then queue it.
+
+    Runs only when the ``Docx`` row was just created. The stored file's MIME
+    type is detected with ``magic``; ``.doc`` and ``.odt`` uploads are
+    converted and the converted file is re-attached to the instance. The
+    ``process_file`` and ``highlight_file`` tasks are then queued as a
+    Celery chain, so highlighting starts after processing and receives its
+    return value.
+    """
    if created:
-        process_file.apply_async(kwargs={"pk": instance.pk})
-        highlight_file.apply_async(kwargs={"pk": instance.pk})
+        # One converter per MIME type; any other type is used as uploaded.
+        converters = {
+            "application/msword": doc_to_docx,
+            "application/vnd.oasis.opendocument.text": doc_to_odt,
+        }
+        mime_type = magic.from_file(instance.file.path, mime=True)
+        converter = converters.get(mime_type)
+        if converter is not None:
+            pth = converter(instance.file.path)
+            with open(pth, 'rb') as f:
+                instance.file = File(f, name=pth.split("/")[-1])
+                instance.save(update_fields=["file"])
+
+        chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
        return


--- a/app/checker/tasks.py
+++ b/app/checker/tasks.py
@ -3,49 +3,63 @@ import requests
 from celery import shared_task
 from docx import Document
 from docx.enum.text import WD_COLOR_INDEX
+from requests.exceptions import InvalidJSONError

 from checker.models import Paragraph, Docx, WordDocx, WordParagraph
-from checker.services.file import process_paragraphs, process_word_paragraphs
+from checker.services.file import process_paragraphs, process_word_paragraphs, split_text


@shared_task()
-def process_file(pk: int):
+def process_file(pk: int, *args, **kwargs):
+    """Score the tagged fragments of an uploaded document and store them.
+
+    The document text is extracted with ``docx2txt``, split into tagged
+    fragments by ``split_text`` and posted to the scoring service in batches
+    of ``cut``. Each ``(type_id, score)`` pair in a response is stored as a
+    ``Paragraph`` whose text is the fragment wrapped in its group number.
+    A batch answered with a non-200 status or an undecodable body is
+    reported and skipped; progress is saved after every stored batch.
+
+    Args:
+        pk: Primary key of the ``Docx`` row to process.
+        *args: Accepted and ignored.
+        **kwargs: Accepted and ignored.
+
+    Returns:
+        ``pk``, so the next task in a Celery chain receives it.
+    """
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document)
+    paragraphs, groups = split_text(document)

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

-    cut = 100
-    counter = 0
-    len_c = len(paragraphs) + 1
-    paragraphs = list(paragraphs.values())
-    for i in range(0, len(paragraphs) // cut + 1):
-        vals = paragraphs[i * cut : (i + 1) * cut + 1]
-        dct = {x: vals[x] for x in range(len(vals))}
+    cut = 10
+    total = len(paragraphs)
+    processed = 0
+    # Stepping by `cut` up to `total` also covers the final partial batch;
+    # range(total // cut) skipped it, and skipped documents shorter than one
+    # batch entirely.
+    for start in range(0, total, cut):
+        dct = {idx: paragraphs[idx] for idx in range(start, min(start + cut, total))}

-        x = requests.post("http://109.248.175.223:5000/api", json=dct)
-        if x.status_code == 200:
-            for el_id, dat in x.json().items():
-                type_id, score = dat
-                Paragraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
-                )
+        x = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
+        if x.status_code != 200:
+            print(f"AI server error, {x.status_code}")
+            continue
+        try:
+            results = x.json()
+        except ValueError:
+            # JSON decode errors raised by Response.json() derive from
+            # ValueError.
+            print("json pars error")
+            continue
+        for el_id, dat in results.items():
+            type_id, score = dat
+            idx = int(el_id)
+            tag = str(groups[idx])
+            Paragraph.objects.create(
+                type_id=type_id, docx=file, text=tag + dct[idx] + tag, score=score
+            )

-            counter += len(vals)
-            print(f"processing {uuid}, {counter}/{len_c}")
-            file.paragraphs_processed = counter
-            file.save(update_fields=["paragraphs_processed"])
-        else:
-            print(f"AI server error, {x.status_code}")
+        processed += len(dct)
+        print(f"processing {uuid}, {processed}/{total}")
+        file.paragraphs_processed = processed
+        file.save(update_fields=["paragraphs_processed"])

-    return f"ok, {pk}"
+    return pk


@shared_task()
-def process_word(pk: int):
+def process_word(pk: int, *args, **kwargs):
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
@ -53,7 +67,7 @@ def process_word(pk: int):
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

-    cut = 150
+    cut = 10
    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
    counter = 0
@ -63,31 +77,34 @@ def process_word(pk: int):

        x = requests.post("http://109.248.175.223:5000/api", json=dct)
        if x.status_code == 200:
-            for el_id, dat in x.json().items():
-                type_id, score = dat
-                WordParagraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
-                )
+            try:
+                for el_id, dat in x.json().items():
+                    type_id, score = dat
+                    WordParagraph.objects.create(
+                        type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+                    )

-            counter += len(vals)
-            print(f"processing {uuid}, {counter}/{len_c}")
-            file.paragraphs_processed = counter
-            file.save(update_fields=["paragraphs_processed"])
+                counter += len(vals)
+                print(f"processing {uuid}, {counter}/{len_c}")
+                file.paragraphs_processed = counter
+                file.save(update_fields=["paragraphs_processed"])
+            except InvalidJSONError:
+                print("json pars error")
        else:
            print(f"AI server error, {x.status_code}")

-    return f"ok, {pk}"
+    return pk


@shared_task
-def highlight_file(pk: int):
+def highlight_file(pk: int, *args, **kwargs):
    c = 0
    lim = 0
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)

    paragraphs = document.paragraphs
-    cut = 100
+    cut = 10

    for paragraph in paragraphs:
        if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
@ -101,20 +118,23 @@ def highlight_file(pk: int):
            if dat:
                n_dct[el] = dat
        x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
-        jsn = x.json()
-        if x.status_code == 200:
-            for j in range(len(paragraphs_sliced)):
-                if j in n_dct:
-                    paragraph = paragraphs_sliced[j]
-                    el_id, dat = jsn[str(j)]
-                    if dat < 50:
-                        text = paragraph.text
-                        paragraph.clear()
-                        run = paragraph.add_run()
-                        run.font.highlight_color = WD_COLOR_INDEX.RED
-                        run.add_text(text)
-                        c += 1
-        else:
-            print("AI server error")
+        try:
+            jsn = x.json()
+            if x.status_code == 200:
+                for j in range(len(paragraphs_sliced)):
+                    if j in n_dct:
+                        paragraph = paragraphs_sliced[j]
+                        el_id, dat = jsn[str(j)]
+                        if dat < 50:
+                            text = paragraph.text
+                            paragraph.clear()
+                            run = paragraph.add_run()
+                            run.font.highlight_color = WD_COLOR_INDEX.RED
+                            run.add_text(text)
+                            c += 1
+            else:
+                print("AI server error")
+        except InvalidJSONError:
+            print("json pars error")
    document.save(file.file.path)
-    return f"highlighted {c}, {pk}"
+    return pk