diff --git a/checker/services/file.py b/checker/services/file.py index c51480a..4cfaa67 100644 --- a/checker/services/file.py +++ b/checker/services/file.py @@ -4,19 +4,18 @@ from checker.services.generators import generate_charset def process_paragraphs(text): + text = text.split("\n") paragraphs = {} - c = 0 + c = 1 + title = True for line in text: - ind = line[:2] - if len(ind) == 2 and ind[1] == ".": - try: - ind = int(ind[0]) - c = ind - paragraphs[c] = "" - except ValueError: - print() - if c: - paragraphs[c] += line + if title: + if line and len(line) > 2 and line[:2] == "1.": + title = False + else: + if line: + paragraphs[c] = line + c += 1 return paragraphs diff --git a/checker/signals.py b/checker/signals.py index c1f88f1..83064e7 100644 --- a/checker/signals.py +++ b/checker/signals.py @@ -1,18 +1,12 @@ -import asyncio - -import docx2txt from django.db.models.signals import post_save from django.dispatch import receiver -from checker.models import Docx, Paragraph -from checker.services.file import process_paragraphs +from checker.models import Docx from checker.tasks import process_file -import threading -import asyncio @receiver(post_save, sender=Docx) def create_docs(sender, instance, created, **kwargs): if created: - process_file.apply_async((instance.pk)) + process_file.apply_async(kwargs={"pk": instance.pk}) return diff --git a/checker/tasks.py b/checker/tasks.py index c50e967..e443f72 100644 --- a/checker/tasks.py +++ b/checker/tasks.py @@ -1,11 +1,7 @@ -from time import sleep - import docx2txt import requests from celery import shared_task -from django.conf import settings - from checker.models import Paragraph, Docx from checker.services.file import process_paragraphs @@ -13,20 +9,28 @@ from checker.services.file import process_paragraphs @shared_task() def process_file(pk: int): file = Docx.objects.get(pk=pk) + uuid = file.uuid document = docx2txt.process(file.file.path) - paragraphs = process_paragraphs(document.split("\n")) + paragraphs = process_paragraphs(document) file.paragraphs_loaded = len(paragraphs) file.save(update_fields=["paragraphs_loaded"]) - x = requests.post("http://185.244.175.164:5000/api", json=paragraphs) - for el_id, type_id in x.json().items(): - Paragraph.objects.create( - type_id=type_id, docx=file, text=paragraphs[el_id] - ) + cut = 100 + counter = 0 + len_c = len(paragraphs) + paragraphs = list(paragraphs.values()) + for i in range(0, len(paragraphs) // cut + 1): + vals = paragraphs[i * cut : (i + 1) * cut + 1] + dct = {x: vals[x] for x in range(len(vals))} - file.paragraphs_processed = len(paragraphs) - file.save(update_fields=["paragraphs_processed"]) + x = requests.post("http://109.248.175.223:5000/api", json=dct) + for el_id, type_id in x.json().items(): + Paragraph.objects.create(type_id=type_id, docx=file, text=dct[int(el_id)]) - return file + counter += len(vals) + print(f"processing {uuid}, {counter}/{len_c}") + file.paragraphs_processed = counter + file.save(update_fields=["paragraphs_processed"]) + return f"ok, {pk}" diff --git a/conf/settings/base.py b/conf/settings/base.py index 443f804..7b87556 100644 --- a/conf/settings/base.py +++ b/conf/settings/base.py @@ -4,7 +4,7 @@ from pathlib import Path ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent -AI_URL = "http://185.244.175.164:5000/api" +AI_URL = "http://109.248.175.223:5000/api" # AI_URL = "http://127.0.0.1:5000" APPS_DIR = ROOT_DIR @@ -64,7 +64,7 @@ THIRD_PARTY_APPS = [ "rest_framework", "drf_yasg", "corsheaders", - "django_celery_results" + "django_celery_results", ] HEALTH_CHECKS = [ @@ -199,10 +199,11 @@ CORS_ALLOW_ALL_ORIGINS = True # Celery -CELERY_BROKER_URL = 'redis://localhost:6379/0' +CELERY_BROKER_URL = "redis://localhost:6379/0" CELERY_TIMEZONE = "Europe/Moscow" CELERY_TASK_TRACK_STARTED = True CELERY_TASK_TIME_LIMIT = 30 * 60 -CELERY_ACCEPT_CONTENT = ['json'] -CELERY_TASK_SERIALIZER = 'json' -CELERY_RESULT_SERIALIZER = 'json' +CELERY_ACCEPT_CONTENT = ["json"] +CELERY_TASK_SERIALIZER = "json" +CELERY_RESULT_SERIALIZER = "json" +CELERY_RESULT_BACKEND = "django-db"