From 477afd4278adf4f2a532ab3578976dc649096dae Mon Sep 17 00:00:00 2001
From: Alexander-D-Karpov
Date: Sat, 27 Aug 2022 07:38:54 +0300
Subject: [PATCH] added async task worker, state save

---
 Review notes (text between the "---" separator and the diffstat is
 commentary; it does not become part of the commit message).

 Layout
   This patch had been flattened: line breaks, indentation and blank
   context lines were lost, and angle-bracketed text appears to have been
   stripped (the From: header carries no e-mail address). Line breaks and
   indentation were rebuilt from the hunk headers and the diffstat counts.
   The position of blank context lines and the indentation of unchanged
   context lines are a best-effort reconstruction - run
   `git apply --check` against the target tree before relying on it.

 Changes made to added lines (hunk sizes are unchanged)
   * checker/signals.py: apply_async() now receives `(instance.pk,)`.
     `(instance.pk)` is a bare value, not an args tuple.
   * checker/tasks.py: the request goes to `settings.AI_URL`, the setting
     the removed signals.py code used, instead of a hard-coded address.
   * checker/tasks.py: the task returns `str(file.pk)`. This patch sets the
     Celery result serializer to JSON, which cannot encode a model
     instance.
   * checker/tasks.py: `pk` is annotated `str` instead of `int`; Docx uses
     a UUID primary key. NOTE(review): depending on the kombu version the
     worker may receive a uuid.UUID object instead - the lookup accepts
     both.
   * checker/api/views.py: `.distinct()` on the ParagraphType query, so a
     type with several paragraphs in the document is visited once. The
     response body is unchanged.
   * conf/api_router.py: the two new routes capture `<uuid:uuid>`. Both
     views need a `uuid` URL argument, and the flattened text registered
     "docx/" twice. NOTE(review): the captured segment is assumed to have
     been stripped together with other angle-bracketed text - confirm the
     converter and the trailing-slash convention.

 Not changed, needs confirmation
   * checker/tasks.py indexes `paragraphs[el_id]` with keys taken from the
     JSON response, which are always strings. If process_paragraphs()
     builds integer keys, this lookup raises KeyError.
   * requirements/base.txt adds requests-async while tasks.py imports the
     synchronous `requests` package.

 checker/api/serializers.py     |  6 ++++++
 checker/api/views.py           | 24 ++++++++++++++++++++++--
 checker/models.py              | 12 ++++++++++--
 checker/services/file.py       | 11 ++++++++++-
 checker/services/generators.py |  7 +++++++
 checker/signals.py             | 13 ++++++-------
 checker/tasks.py               | 32 +++++++++++++++++++++++++++-----
 conf/api_router.py             |  6 ++++--
 conf/celery.py                 |  2 +-
 conf/settings/base.py          |  3 +++
 requirements/base.txt          |  3 ++-
 11 files changed, 98 insertions(+), 21 deletions(-)
 create mode 100644 checker/services/generators.py

diff --git a/checker/api/serializers.py b/checker/api/serializers.py
index 763ef10..c03c649 100644
--- a/checker/api/serializers.py
+++ b/checker/api/serializers.py
@@ -8,3 +8,9 @@ class DocxSerializer(serializers.ModelSerializer):
         model = Docx
         fields = ["uuid", "file"]
         extra_kwargs = {"uuid": {"read_only": True}}
+
+
+class DocxStateSerializer(serializers.ModelSerializer):
+    class Meta:
+        model = Docx
+        fields = ["paragraphs_loaded", "paragraphs_processed"]
diff --git a/checker/api/views.py b/checker/api/views.py
index 871e153..2ca4479 100644
--- a/checker/api/views.py
+++ b/checker/api/views.py
@@ -1,11 +1,31 @@
 from rest_framework import generics
+from rest_framework.response import Response
+from rest_framework.views import APIView
+from rest_framework.generics import get_object_or_404
 from rest_framework.parsers import MultiPartParser, FormParser
 
-from checker.api.serializers import DocxSerializer
-from checker.models import Docx
+from checker.api.serializers import DocxSerializer, DocxStateSerializer
+from checker.models import Docx, ParagraphType
 
 
 class ListCreateDocxApiView(generics.ListCreateAPIView):
     parser_classes = [FormParser, MultiPartParser]
     serializer_class = DocxSerializer
     queryset = Docx.objects.all()
+
+
+class GetDocxState(generics.RetrieveAPIView):
+    lookup_field = "uuid"
+    queryset = Docx.objects.all()
+    serializer_class = DocxStateSerializer
+
+
+class RetireDocxSerializer(APIView):
+    def get(self, request, uuid):
+        doc = get_object_or_404(Docx, uuid=uuid)
+        res = {}
+        paragraphs = ParagraphType.objects.filter(paragraphs__docx=doc).distinct()
+        for p in paragraphs:
+            res[p.name] = [x.text for x in p.paragraphs.filter(docx=doc)]
+        return Response(res)
+
diff --git a/checker/models.py b/checker/models.py
index 9d713b3..e9c0c25 100644
--- a/checker/models.py
+++ b/checker/models.py
@@ -2,16 +2,24 @@ import uuid as uuid
 from django.db import models
 
 # Create your models here.
+from checker.services.file import media_upload_path
 
 
 class Docx(models.Model):
     uuid = models.UUIDField(
         default=uuid.uuid4, editable=False, unique=True, primary_key=True
     )
-    file = models.FileField(upload_to="")
+    file = models.FileField(upload_to=media_upload_path)
+    created = models.DateTimeField(auto_now_add=True)
+
+    paragraphs_processed = models.IntegerField(default=0)
+    paragraphs_loaded = models.IntegerField(default=0)
 
     def __str__(self):
-        return self.uuid
+        return str(self.uuid)
+
+    class Meta:
+        ordering = ["-created"]
 
 
 class ParagraphType(models.Model):
diff --git a/checker/services/file.py b/checker/services/file.py
index 0c0dc0a..c51480a 100644
--- a/checker/services/file.py
+++ b/checker/services/file.py
@@ -1,3 +1,8 @@
+import os
+
+from checker.services.generators import generate_charset
+
+
 def process_paragraphs(text):
     paragraphs = {}
     c = 0
@@ -12,4 +17,8 @@ def process_paragraphs(text):
             print()
         if c:
             paragraphs[c] += line
-    return paragraphs
\ No newline at end of file
+    return paragraphs
+
+
+def media_upload_path(instance, filename):
+    return os.path.join(f"uploads/{generate_charset(7)}/", filename)
diff --git a/checker/services/generators.py b/checker/services/generators.py
new file mode 100644
index 0000000..eeebfb8
--- /dev/null
+++ b/checker/services/generators.py
@@ -0,0 +1,7 @@
+import random
+import string
+
+
+def generate_charset(length: int) -> str:
+    """Generate a random string of characters of a given length."""
+    return "".join(random.choice(string.ascii_letters) for _ in range(length))
diff --git a/checker/signals.py b/checker/signals.py
index 15cc591..c1f88f1 100644
--- a/checker/signals.py
+++ b/checker/signals.py
@@ -1,19 +1,18 @@
-from pprint import pprint
-import requests
+import asyncio
 import docx2txt
-from django.conf import settings
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 
-from checker.models import Docx
+from checker.models import Docx, Paragraph
 from checker.services.file import process_paragraphs
+from checker.tasks import process_file
+import threading
+import asyncio
 
 
 @receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
     if created:
-        document = docx2txt.process(instance.file.path)
-        paragraphs = process_paragraphs(document.split("\n"))
-        x = requests.post(settings.AI_URL, json=paragraphs)
+        process_file.apply_async((instance.pk,))
 
     return
diff --git a/checker/tasks.py b/checker/tasks.py
index d1f44a0..c50e967 100644
--- a/checker/tasks.py
+++ b/checker/tasks.py
@@ -1,10 +1,32 @@
+from time import sleep
+
+import docx2txt
+import requests
 from celery import shared_task
-from uuid import uuid4
-from checker.models import Docx
+from django.conf import settings
+
+from checker.models import Paragraph, Docx
+from checker.services.file import process_paragraphs
 
 
-@shared_task(name="process_file")
-def process_file(file: uuid4):
-    print(file)
+@shared_task()
+def process_file(pk: str):
+    file = Docx.objects.get(pk=pk)
+    document = docx2txt.process(file.file.path)
+    paragraphs = process_paragraphs(document.split("\n"))
+
+    file.paragraphs_loaded = len(paragraphs)
+    file.save(update_fields=["paragraphs_loaded"])
+
+    x = requests.post(settings.AI_URL, json=paragraphs)
+    for el_id, type_id in x.json().items():
+        Paragraph.objects.create(
+            type_id=type_id, docx=file, text=paragraphs[el_id]
+        )
+
+    file.paragraphs_processed = len(paragraphs)
+    file.save(update_fields=["paragraphs_processed"])
+    return str(file.pk)
+
 
 
diff --git a/conf/api_router.py b/conf/api_router.py
index b6cdc86..9d1ae8f 100644
--- a/conf/api_router.py
+++ b/conf/api_router.py
@@ -1,8 +1,10 @@
 from django.urls import path, include
 
-from checker.api.views import ListCreateDocxApiView
+from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState
 
 urlpatterns = [
     path("health/", include("health_check.urls")),
-    path("docx/", ListCreateDocxApiView.as_view(), name="list_create_docx")
+    path("docx/", ListCreateDocxApiView.as_view(), name="list_create_docx"),
+    path("docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_docx"),
+    path("state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_docx"),
 ]
diff --git a/conf/celery.py b/conf/celery.py
index bedd42d..ba8c6fa 100644
--- a/conf/celery.py
+++ b/conf/celery.py
@@ -5,7 +5,7 @@ from celery import Celery
 # Set the default Django settings module for the 'celery' program.
 os.environ.setdefault("DJANGO_SETTINGS_MODULE", "conf.settings.local")
 
-app = Celery("mistake_checker_hack_backend")
+app = Celery("conf")
 
 # Using a string here means the worker doesn't have to serialize
 # the configuration object to child processes.
diff --git a/conf/settings/base.py b/conf/settings/base.py
index 9e666f3..443f804 100644
--- a/conf/settings/base.py
+++ b/conf/settings/base.py
@@ -203,3 +203,6 @@ CELERY_BROKER_URL = 'redis://localhost:6379/0'
 CELERY_TIMEZONE = "Europe/Moscow"
 CELERY_TASK_TRACK_STARTED = True
 CELERY_TASK_TIME_LIMIT = 30 * 60
+CELERY_ACCEPT_CONTENT = ['json']
+CELERY_TASK_SERIALIZER = 'json'
+CELERY_RESULT_SERIALIZER = 'json'
diff --git a/requirements/base.txt b/requirements/base.txt
index b61cff5..365ca44 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -14,4 +14,5 @@ django_celery_results==2.4.0
 psutil
 dj-database-url
 uuid
-docx2txt
\ No newline at end of file
+docx2txt
+requests-async
\ No newline at end of file