Fixed the paragraph parser; added per-file processing progress state

2025-09-06 12:04:49 +03:00 · 2022-08-27 10:16:21 +03:00 · 2022-08-27 10:16:21 +03:00 · f06651d1a9
commit f06651d1a9
parent 477afd4278
4 changed files with 36 additions and 38 deletions
--- a/checker/services/file.py
+++ b/checker/services/file.py
@@ -4,19 +4,18 @@ from checker.services.generators import generate_charset
 def process_paragraphs(text):
    text = text.split("\n")
    paragraphs = {}
-    c = 0
+    c = 1
    title = True
    for line in text:
-        ind = line[:2]
+        if title:
-        if len(ind) == 2 and ind[1] == ".":
+            if line and len(line) > 2 and line[:2] == "1.":
-            try:
+                title = False
-                ind = int(ind[0])
+        else:
-                c = ind
+            if line:
-                paragraphs[c] = ""
+                paragraphs[c] = line
-            except ValueError:
+                c += 1
                print()
            if c:
                paragraphs[c] += line
    return paragraphs
--- a/checker/signals.py
+++ b/checker/signals.py
@@ -1,18 +1,12 @@
 import asyncio
 import docx2txt
 from django.db.models.signals import post_save
 from django.dispatch import receiver
-from checker.models import Docx, Paragraph
+from checker.models import Docx
 from checker.services.file import process_paragraphs
 from checker.tasks import process_file
 import threading
 import asyncio
@receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
    if created:
-        process_file.apply_async((instance.pk))
+        process_file.apply_async(kwargs={"pk": instance.pk})
        return
--- a/checker/tasks.py
+++ b/checker/tasks.py
@@ -1,11 +1,7 @@
 from time import sleep
 import docx2txt
 import requests
 from celery import shared_task
 from django.conf import settings
 from checker.models import Paragraph, Docx
 from checker.services.file import process_paragraphs
@@ -13,20 +9,28 @@ from checker.services.file import process_paragraphs
@shared_task()
 def process_file(pk: int):
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document.split("\n"))
+    paragraphs = process_paragraphs(document)
    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])
-    x = requests.post("http://185.244.175.164:5000/api", json=paragraphs)
+    cut = 100
-    for el_id, type_id in x.json().items():
+    counter = 0
-        Paragraph.objects.create(
+    len_c = len(paragraphs)
-            type_id=type_id, docx=file, text=paragraphs[el_id]
+    paragraphs = list(paragraphs.values())
-        )
+    for i in range(0, len(paragraphs) // cut + 1):
        vals = paragraphs[i * cut : (i + 1) * cut + 1]
        dct = {x: vals[x] for x in range(len(vals))}
-    file.paragraphs_processed = len(paragraphs)
+        x = requests.post("http://109.248.175.223:5000/api", json=dct)
-    file.save(update_fields=["paragraphs_processed"])
+        for el_id, type_id in x.json().items():
            Paragraph.objects.create(type_id=type_id, docx=file, text=dct[int(el_id)])
-    return file
+        counter += len(vals)
        print(f"processing {uuid}, {counter}/{len_c}")
        file.paragraphs_processed = counter
        file.save(update_fields=["paragraphs_processed"])
    return f"ok, {pk}"
--- a/conf/settings/base.py
+++ b/conf/settings/base.py
@@ -4,7 +4,7 @@ from pathlib import Path
 ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
-AI_URL = "http://185.244.175.164:5000/api"
+AI_URL = "http://109.248.175.223:5000/api"
 # AI_URL = "http://127.0.0.1:5000"
 APPS_DIR = ROOT_DIR
@@ -64,7 +64,7 @@ THIRD_PARTY_APPS = [
    "rest_framework",
    "drf_yasg",
    "corsheaders",
-    "django_celery_results"
+    "django_celery_results",
 ]
 HEALTH_CHECKS = [
@@ -199,10 +199,11 @@ CORS_ALLOW_ALL_ORIGINS = True
 # Celery
-CELERY_BROKER_URL = 'redis://localhost:6379/0'
+CELERY_BROKER_URL = "redis://localhost:6379/0"
 CELERY_TIMEZONE = "Europe/Moscow"
 CELERY_TASK_TRACK_STARTED = True
 CELERY_TASK_TIME_LIMIT = 30 * 60
-CELERY_ACCEPT_CONTENT = ['json']
+CELERY_ACCEPT_CONTENT = ["json"]
-CELERY_TASK_SERIALIZER = 'json'
+CELERY_TASK_SERIALIZER = "json"
-CELERY_RESULT_SERIALIZER = 'json'
+CELERY_RESULT_SERIALIZER = "json"
 CELERY_RESULT_BACKEND = "django-db"