Optimised file processing, added doc, odt -> docx converter, minor changes

2025-10-30 23:37:34 +03:00 · 2022-08-28 12:37:28 +03:00 · 2022-08-28 12:37:28 +03:00 · 6cffc965a8
commit 6cffc965a8
parent 2bf771e3a2
3 changed files with 121 additions and 56 deletions
--- a/app/checker/services/file.py
+++ b/app/checker/services/file.py
@ -1,4 +1,7 @@
 import os
 import re
 import convertapi
 from checker.services.generators import generate_charset
@ -28,5 +31,31 @@ def process_word_paragraphs(text):
    return _base_process(text)
 def _convert_to_docx(file_path, from_format):
     """Convert *file_path* to DOCX through ConvertAPI and return the new path.

     The converted file is saved next to the input, with only the final
     extension replaced by ``.docx``.

     Args:
         file_path: Path of the source document on disk.
         from_format: Source format name passed to ConvertAPI (e.g. ``'doc'``).

     Returns:
         The path of the saved ``.docx`` file.
     """
     # NOTE(review): the API secret is hard-coded in source control; it should
     # be moved to configuration and rotated.
     convertapi.api_secret = '0fp22XFRPwKmNJql'
     # splitext drops only the last extension, so dots in directory names or in
     # the file stem ("./uploads/x/report.v2.doc") no longer truncate the path.
     target_path = os.path.splitext(file_path)[0] + ".docx"
     result = convertapi.convert('docx', {'File': file_path}, from_format=from_format)
     result.file.save(target_path)
     return target_path


 def doc_to_docx(file_path):
     """Convert a legacy Word ``.doc`` file to ``.docx`` and return the new path."""
     return _convert_to_docx(file_path, 'doc')


 def doc_to_odt(file_path):
     """Convert an OpenDocument ``.odt`` file to ``.docx`` and return the new path.

     Despite its name, this function converts *from* ODT *to* DOCX; the name is
     kept because callers import it.
     """
     return _convert_to_docx(file_path, 'odt')
 def media_upload_path(instance, filename):
     """Build the relative storage path for an uploaded file.

     The file is placed under ``uploads/<charset>/``, where ``<charset>`` is
     the value returned by ``generate_charset(7)``. *instance* is accepted but
     not used.
     """
     upload_dir = f"uploads/{generate_charset(7)}/"
     return os.path.join(upload_dir, filename)
 def split_text(text):
     """Extract ``{tag}body{tag}`` segments from *text*.

     Newlines are removed before matching. For every segment whose opening
     and closing tags are equal, the body is collected and the tag is
     converted with ``int``. Segments with mismatched tags are printed and
     skipped.

     Returns:
         A ``(texts, groups)`` tuple of two parallel lists: the segment bodies
         and their integer tags.
     """
     flattened = text.replace('\n', '')
     texts = []
     groups = []
     for match in re.findall(r"{(.*?)}(.*?){(.*?)}", flattened):
         opening, body, closing = match
         if opening != closing:
             # Mismatched tag pair: report it and move on.
             print(match)
             continue
         texts.append(body)
         groups.append(int(opening))
     return texts, groups
--- a/app/checker/signals.py
+++ b/app/checker/signals.py
@ -1,15 +1,31 @@
 import magic
 from django.db.models.signals import post_save
 from django.core.files import File
 from django.dispatch import receiver
 from celery import chain
 from checker.models import Docx, WordDocx
 from checker.services.file import doc_to_docx, doc_to_odt
 from checker.tasks import process_file, process_word, highlight_file
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """Convert a newly created ``Docx`` upload if needed, then queue processing.

    Runs on ``post_save`` for ``Docx``. Nothing happens unless *created* is
    true. The MIME type of the stored file is detected with ``magic``; legacy
    Word and OpenDocument files are converted to ``.docx`` and the instance's
    ``file`` field is replaced with the converted file. Finally a celery chain
    ``process_file -> highlight_file`` is dispatched for the instance.
    """
    if not created:
        return

    # MIME type -> converter returning the path of the produced .docx file.
    converters = {
        "application/msword": doc_to_docx,
        "application/vnd.oasis.opendocument.text": doc_to_odt,
    }
    mime_type = magic.from_file(instance.file.path, mime=True)
    converter = converters.get(mime_type)
    if converter is not None:
        converted_path = converter(instance.file.path)
        with open(converted_path, 'rb') as converted:
            instance.file = File(converted, name=converted_path.split("/")[-1])
            # Saving with update_fields fires post_save again, but with
            # created=False, so this handler returns early on that pass.
            instance.save(update_fields=["file"])

    chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
--- a/app/checker/tasks.py
+++ b/app/checker/tasks.py
@ -3,49 +3,63 @@ import requests
 from celery import shared_task
 from docx import Document
 from docx.enum.text import WD_COLOR_INDEX
 from requests.exceptions import InvalidJSONError
 from checker.models import Paragraph, Docx, WordDocx, WordParagraph
-from checker.services.file import process_paragraphs, process_word_paragraphs
+from checker.services.file import process_paragraphs, process_word_paragraphs, split_text
@shared_task()
-def process_file(pk: int):
+def process_file(pk: int, *args, **kwargs):
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document)
+    # paragraphs = process_paragraphs(document)
    paragraphs, groups = split_text(document)
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])
-    cut = 100
+    cut = 10
-    counter = 0
+    for i in range(len(paragraphs) // cut):
-    len_c = len(paragraphs) + 1
+        vals = [x for x in range(i * cut, (i+ 1) * cut)]
-    paragraphs = list(paragraphs.values())
+        dct = {x: paragraphs[x] for x in vals}
-    for i in range(0, len(paragraphs) // cut + 1):
+        x = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
        dct = {x: vals[x] for x in range(len(vals))}
        x = requests.post("http://109.248.175.223:5000/api", json=dct)
        if x.status_code == 200:
        for el_id, dat in x.json().items():
            type_id, score = dat
            Paragraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+                type_id=type_id, docx=file, text=str(groups[int(el_id)]) + dct[int(el_id)] + str(groups[int(el_id)]), score=score
            )
-            counter += len(vals)
+    #for i in range(0, len(paragraphs) // cut + 1):
-            print(f"processing {uuid}, {counter}/{len_c}")
+    #    vals = paragraphs[i * cut : (i + 1) * cut + 1]
-            file.paragraphs_processed = counter
+    #    dct = {x: vals[x] for x in range(len(vals))}
-            file.save(update_fields=["paragraphs_processed"])
+    #
-        else:
+    #    x = requests.post("http://109.248.175.223:5000/api", json=dct)
-            print(f"AI server error, {x.status_code}")
+    #    if x.status_code == 200:
    #        try:
    #            for el_id, dat in x.json().items():
    #                type_id, score = dat
    #                Paragraph.objects.create(
    #                    type_id=type_id, docx=file, text=str(groups[g_c])+dct[int(el_id)]+str(groups[g_c]), score=score
    #                )
    #                g_c += 1
    #
    #            counter += len(vals)
    #            print(f"processing {uuid}, {counter}/{len_c}")
    #            file.paragraphs_processed = counter
    #            file.save(update_fields=["paragraphs_processed"])
    #        except InvalidJSONError:
    #            print("json pars error")
    #    else:
    #        print(f"AI server error, {x.status_code}")
-    return f"ok, {pk}"
+
    return pk
@shared_task()
-def process_word(pk: int):
+def process_word(pk: int, *args, **kwargs):
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
@ -53,7 +67,7 @@ def process_word(pk: int):
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])
-    cut = 150
+    cut = 10
    len_c = len(paragraphs) + 1
    paragraphs = list(paragraphs.values())
    counter = 0
@ -63,6 +77,7 @@ def process_word(pk: int):
        x = requests.post("http://109.248.175.223:5000/api", json=dct)
        if x.status_code == 200:
            try:
                for el_id, dat in x.json().items():
                    type_id, score = dat
                    WordParagraph.objects.create(
@ -73,21 +88,23 @@ def process_word(pk: int):
                print(f"processing {uuid}, {counter}/{len_c}")
                file.paragraphs_processed = counter
                file.save(update_fields=["paragraphs_processed"])
            except InvalidJSONError:
                print("json pars error")
        else:
            print(f"AI server error, {x.status_code}")
-    return f"ok, {pk}"
+    return pk
@shared_task
-def highlight_file(pk: int):
+def highlight_file(pk: int, *args, **kwargs):
    c = 0
    lim = 0
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)
    paragraphs = document.paragraphs
-    cut = 100
+    cut = 10
    for paragraph in paragraphs:
        if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
@ -101,6 +118,7 @@ def highlight_file(pk: int):
            if dat:
                n_dct[el] = dat
        x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
        try:
            jsn = x.json()
            if x.status_code == 200:
                for j in range(len(paragraphs_sliced)):
@ -116,5 +134,7 @@ def highlight_file(pk: int):
                            c += 1
            else:
                print("AI server error")
        except InvalidJSONError:
            print("json pars error")
    document.save(file.file.path)
-    return f"highlighted {c}, {pk}"
+    return pk