Optimised file processing, added doc/odt -> docx converters, minor changes

2025-10-30 23:37:34 +03:00 · 2022-08-28 12:37:28 +03:00 · 2022-08-28 12:37:28 +03:00 · 6cffc965a8
commit 6cffc965a8
parent 2bf771e3a2
3 changed files with 121 additions and 56 deletions
--- a/app/checker/services/file.py
+++ b/app/checker/services/file.py
@ -1,4 +1,7 @@
 import os
+import re
+import convertapi
+

 from checker.services.generators import generate_charset

@ -28,5 +31,31 @@ def process_word_paragraphs(text):
    return _base_process(text)


+def doc_to_docx(file_path):
+    convertapi.api_secret = '0fp22XFRPwKmNJql'
+    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
+    result.file.save(file_path.split(".")[0] + ".docx")
+    return file_path.split(".")[0] + ".docx"
+
+
+def doc_to_odt(file_path):
+    convertapi.api_secret = '0fp22XFRPwKmNJql'
+    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
+    result.file.save(file_path.split(".")[0] + ".docx")
+    return file_path.split(".")[0] + ".docx"
+
+
 def media_upload_path(instance, filename):
    return os.path.join(f"uploads/{generate_charset(7)}/", filename)
+
+
+def split_text(text):
+    texts, groups = [], []
+    regt = re.findall(r"{(.*?)}(.*?){(.*?)}", text.replace('\n', ''))
+    for t in regt:
+        if t[0] == t[-1]:
+            texts.append(t[1])
+            groups.append(int(t[0]))
+        else:
+            print(t)
+    return texts, groups
--- a/app/checker/signals.py
+++ b/app/checker/signals.py
@ -1,15 +1,31 @@
+import magic
+
 from django.db.models.signals import post_save
+from django.core.files import File
 from django.dispatch import receiver
+from celery import chain

 from checker.models import Docx, WordDocx
+from checker.services.file import doc_to_docx, doc_to_odt
 from checker.tasks import process_file, process_word, highlight_file


@receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
    if created:
-        process_file.apply_async(kwargs={"pk": instance.pk})
-        highlight_file.apply_async(kwargs={"pk": instance.pk})
+        type = magic.from_file(instance.file.path, mime=True)
+        if type == "application/msword":
+            pth = doc_to_docx(instance.file.path)
+            with open(pth, 'rb') as f:
+                instance.file = File(f, name=pth.split("/")[-1])
+                instance.save(update_fields=["file"])
+        elif type == "application/vnd.oasis.opendocument.text":
+            pth = doc_to_odt(instance.file.path)
+            with open(pth, 'rb') as f:
+                instance.file = File(f, name=pth.split("/")[-1])
+                instance.save(update_fields=["file"])
+
+        chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
        return


--- a/app/checker/tasks.py
+++ b/app/checker/tasks.py
@ -3,49 +3,63 @@ import requests
 from celery import shared_task
 from docx import Document
 from docx.enum.text import WD_COLOR_INDEX
+from requests.exceptions import InvalidJSONError

 from checker.models import Paragraph, Docx, WordDocx, WordParagraph
-from checker.services.file import process_paragraphs, process_word_paragraphs
+from checker.services.file import process_paragraphs, process_word_paragraphs, split_text


@shared_task()
-def process_file(pk: int):
+def process_file(pk: int, *args, **kwargs):
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document)
+    # paragraphs = process_paragraphs(document)
+    paragraphs, groups = split_text(document)

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

-    cut = 100
-    counter = 0
-    len_c = len(paragraphs) + 1
-    paragraphs = list(paragraphs.values())
-    for i in range(0, len(paragraphs) // cut + 1):
-        vals = paragraphs[i * cut : (i + 1) * cut + 1]
-        dct = {x: vals[x] for x in range(len(vals))}
-
-        x = requests.post("http://109.248.175.223:5000/api", json=dct)
-        if x.status_code == 200:
+    cut = 10
+    for i in range(len(paragraphs) // cut):
+        vals = [x for x in range(i * cut, (i+ 1) * cut)]
+        dct = {x: paragraphs[x] for x in vals}
+        x = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
        for el_id, dat in x.json().items():
            type_id, score = dat
            Paragraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+                type_id=type_id, docx=file, text=str(groups[int(el_id)]) + dct[int(el_id)] + str(groups[int(el_id)]), score=score
            )

-            counter += len(vals)
-            print(f"processing {uuid}, {counter}/{len_c}")
-            file.paragraphs_processed = counter
-            file.save(update_fields=["paragraphs_processed"])
-        else:
-            print(f"AI server error, {x.status_code}")
+    #for i in range(0, len(paragraphs) // cut + 1):
+    #    vals = paragraphs[i * cut : (i + 1) * cut + 1]
+    #    dct = {x: vals[x] for x in range(len(vals))}
+    #
+    #    x = requests.post("http://109.248.175.223:5000/api", json=dct)
+    #    if x.status_code == 200:
+    #        try:
+    #            for el_id, dat in x.json().items():
+    #                type_id, score = dat
+    #                Paragraph.objects.create(
+    #                    type_id=type_id, docx=file, text=str(groups[g_c])+dct[int(el_id)]+str(groups[g_c]), score=score
+    #                )
+    #                g_c += 1
+    #
+    #            counter += len(vals)
+    #            print(f"processing {uuid}, {counter}/{len_c}")
+    #            file.paragraphs_processed = counter
+    #            file.save(update_fields=["paragraphs_processed"])
+    #        except InvalidJSONError:
+    #            print("json pars error")
+    #    else:
+    #        print(f"AI server error, {x.status_code}")

-    return f"ok, {pk}"
+
+    return pk


@shared_task()
-def process_word(pk: int):
+def process_word(pk: int, *args, **kwargs):
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
@ -53,7 +67,7 @@ def process_word(pk: int):
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

-    cut = 150
+    cut = 10
    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
    counter = 0
@ -63,6 +77,7 @@ def process_word(pk: int):

        x = requests.post("http://109.248.175.223:5000/api", json=dct)
        if x.status_code == 200:
+            try:
                for el_id, dat in x.json().items():
                    type_id, score = dat
                    WordParagraph.objects.create(
@ -73,21 +88,23 @@ def process_word(pk: int):
                print(f"processing {uuid}, {counter}/{len_c}")
                file.paragraphs_processed = counter
                file.save(update_fields=["paragraphs_processed"])
+            except InvalidJSONError:
+                print("json pars error")
        else:
            print(f"AI server error, {x.status_code}")

-    return f"ok, {pk}"
+    return pk


@shared_task
-def highlight_file(pk: int):
+def highlight_file(pk: int, *args, **kwargs):
    c = 0
    lim = 0
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)

    paragraphs = document.paragraphs
-    cut = 100
+    cut = 10

    for paragraph in paragraphs:
        if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
@ -101,6 +118,7 @@ def highlight_file(pk: int):
            if dat:
                n_dct[el] = dat
        x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
+        try:
            jsn = x.json()
            if x.status_code == 200:
                for j in range(len(paragraphs_sliced)):
@ -116,5 +134,7 @@ def highlight_file(pk: int):
                            c += 1
            else:
                print("AI server error")
+        except InvalidJSONError:
+            print("json pars error")
    document.save(file.file.path)
-    return f"highlighted {c}, {pk}"
+    return pk