added async task worker, state save

This commit is contained in:
Alexander Karpov 2022-08-27 07:38:54 +03:00
parent 0c1d881dff
commit 477afd4278
11 changed files with 98 additions and 21 deletions

View File

@ -8,3 +8,9 @@ class DocxSerializer(serializers.ModelSerializer):
model = Docx
fields = ["uuid", "file"]
extra_kwargs = {"uuid": {"read_only": True}}
class DocxStateSerializer(serializers.ModelSerializer):
    """Serialize only the paragraph progress counters of a ``Docx``."""

    class Meta:
        # Only the two counters are exposed; no other Docx field is listed.
        fields = ["paragraphs_loaded", "paragraphs_processed"]
        model = Docx

View File

@ -1,11 +1,31 @@
from rest_framework import generics
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.generics import get_object_or_404
from rest_framework.parsers import MultiPartParser, FormParser
from checker.api.serializers import DocxSerializer
from checker.models import Docx
from checker.api.serializers import DocxSerializer, DocxStateSerializer
from checker.models import Docx, ParagraphType
class ListCreateDocxApiView(generics.ListCreateAPIView):
    # List/create endpoint over every Docx row, serialized with
    # DocxSerializer. Request bodies are parsed as form or multipart data.
    queryset = Docx.objects.all()
    serializer_class = DocxSerializer
    parser_classes = [FormParser, MultiPartParser]
class GetDocxState(generics.RetrieveAPIView):
    # Retrieve endpoint returning the DocxStateSerializer representation of
    # a single Docx, looked up by its ``uuid`` instead of the default ``pk``.
    serializer_class = DocxStateSerializer
    queryset = Docx.objects.all()
    lookup_field = "uuid"
class RetireDocxSerializer(APIView):
    # Returns the stored paragraph texts of one document grouped by
    # paragraph type name: {"<type name>": ["<paragraph text>", ...], ...}.
    # Responds with 404 when no Docx has the given uuid.

    def get(self, request, uuid):
        doc = get_object_or_404(Docx, uuid=uuid)
        # Filtering across the ``paragraphs`` relation joins one row per
        # matching paragraph, so without distinct() a type with N paragraphs
        # is yielded N times and its texts are re-queried N times.
        paragraph_types = ParagraphType.objects.filter(
            paragraphs__docx=doc
        ).distinct()
        res = {
            p_type.name: [par.text for par in p_type.paragraphs.filter(docx=doc)]
            for p_type in paragraph_types
        }
        return Response(res)

View File

@ -2,16 +2,24 @@ import uuid as uuid
from django.db import models
# Create your models here.
from checker.services.file import media_upload_path
class Docx(models.Model):
    """An uploaded document file together with its paragraph counters."""

    uuid = models.UUIDField(
        default=uuid.uuid4, editable=False, unique=True, primary_key=True
    )
    # The storage path is computed per upload by ``media_upload_path``; the
    # earlier ``upload_to=""`` definition of this field was dead (overwritten).
    file = models.FileField(upload_to=media_upload_path)
    created = models.DateTimeField(auto_now_add=True)
    # Progress counters; both start at 0 for a new row.
    paragraphs_processed = models.IntegerField(default=0)
    paragraphs_loaded = models.IntegerField(default=0)

    def __str__(self):
        # __str__ must return a str; returning the UUID object itself makes
        # str(instance) raise TypeError.
        return str(self.uuid)

    class Meta:
        # Newest documents first.
        ordering = ["-created"]
class ParagraphType(models.Model):

View File

@ -1,3 +1,8 @@
import os
from checker.services.generators import generate_charset
def process_paragraphs(text):
paragraphs = {}
c = 0
@ -13,3 +18,7 @@ def process_paragraphs(text):
if c:
paragraphs[c] += line
return paragraphs
def media_upload_path(instance, filename):
    """Build the storage path for an upload.

    The file keeps its original ``filename`` but is placed under
    ``uploads/<7 generated characters>/``. ``instance`` is accepted but
    not used.
    """
    folder = f"uploads/{generate_charset(7)}/"
    return os.path.join(folder, filename)

View File

@ -0,0 +1,7 @@
import random
import string
def generate_charset(length: int) -> str:
    """Return ``length`` randomly chosen ASCII letters as one string.

    Letters are drawn independently (upper- and lower-case) with
    ``random.choice``; a non-positive ``length`` yields an empty string.
    """
    alphabet = string.ascii_letters
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)

View File

@ -1,19 +1,18 @@
from pprint import pprint
import requests
import asyncio
import docx2txt
from django.conf import settings
from django.db.models.signals import post_save
from django.dispatch import receiver
from checker.models import Docx
from checker.models import Docx, Paragraph
from checker.services.file import process_paragraphs
from checker.tasks import process_file
import threading
import asyncio
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """Queue background processing for a newly created ``Docx``.

    Runs on every ``post_save`` of ``Docx`` and does nothing for updates of
    existing rows. Text extraction and the AI request are performed by the
    ``process_file`` task, so they are not repeated synchronously here.
    """
    if not created:
        return
    # ``apply_async`` takes the positional arguments as a sequence;
    # ``(instance.pk)`` is just the bare pk, so the trailing comma is required.
    process_file.apply_async((instance.pk,))

View File

@ -1,10 +1,32 @@
from time import sleep
import docx2txt
import requests
from celery import shared_task
from uuid import uuid4
from checker.models import Docx
from django.conf import settings
from checker.models import Paragraph, Docx
from checker.services.file import process_paragraphs
@shared_task()
def process_file(pk):
    """Extract, classify and store the paragraphs of one uploaded document.

    Steps:
      1. Load the ``Docx`` row and extract its text with ``docx2txt``.
      2. Split the text into paragraphs and record ``paragraphs_loaded``.
      3. POST the paragraphs to the AI service, which answers with a mapping
         of paragraph key -> paragraph type id.
      4. Create one ``Paragraph`` per answered key and record
         ``paragraphs_processed``.

    Args:
        pk: Primary key of the ``Docx`` to process (its UUID, or the string
            form of it).

    Returns:
        The primary key as a string. A model instance cannot be encoded by
        a JSON result serializer, so the instance itself is not returned.

    Raises:
        Docx.DoesNotExist: No document with this primary key exists.
        requests.HTTPError: The AI service answered with an error status.
    """
    file = Docx.objects.get(pk=pk)
    document = docx2txt.process(file.file.path)
    paragraphs = process_paragraphs(document.split("\n"))

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

    # Prefer the configured endpoint; fall back to the previously hard-coded
    # address so deployments without AI_URL keep working.
    ai_url = getattr(settings, "AI_URL", "http://185.244.175.164:5000/api")
    # NOTE(review): no timeout is set because the service's worst-case latency
    # is unknown here -- consider adding one below the Celery task time limit.
    response = requests.post(ai_url, json=paragraphs)
    # Fail with a clear HTTP error instead of a JSON decoding error.
    response.raise_for_status()

    # JSON object keys are always strings on the way back, whatever key type
    # was sent, so look paragraphs up by the string form of their key.
    text_by_key = {str(key): text for key, text in paragraphs.items()}
    for el_id, type_id in response.json().items():
        Paragraph.objects.create(
            type_id=type_id, docx=file, text=text_by_key[str(el_id)]
        )

    file.paragraphs_processed = len(paragraphs)
    file.save(update_fields=["paragraphs_processed"])
    return str(file.pk)

View File

@ -1,8 +1,10 @@
from django.urls import path, include
from checker.api.views import ListCreateDocxApiView
from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState
# Routes of the checker API. Each route is registered exactly once and every
# entry ends with a comma so the list literal stays valid.
urlpatterns = [
    path("health/", include("health_check.urls")),
    # Upload a document / list uploaded documents.
    path("docx/", ListCreateDocxApiView.as_view(), name="list_create_docx"),
    # Paragraph texts of one document grouped by paragraph type name.
    path("docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_docx"),
    # Paragraph counters (loaded / processed) of one document.
    path("state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_docx"),
]

View File

@ -5,7 +5,7 @@ from celery import Celery
# Set the default Django settings module for the 'celery' program.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "conf.settings.local")
app = Celery("mistake_checker_hack_backend")
app = Celery("conf")
# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.

View File

@ -203,3 +203,6 @@ CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_TIMEZONE = "Europe/Moscow"
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'

View File

@ -15,3 +15,4 @@ psutil
dj-database-url
uuid
docx2txt
requests-async