added async task worker, state save

This commit is contained in:
Alexander Karpov 2022-08-27 07:38:54 +03:00
parent 0c1d881dff
commit 477afd4278
11 changed files with 98 additions and 21 deletions

View File

@ -8,3 +8,9 @@ class DocxSerializer(serializers.ModelSerializer):
model = Docx model = Docx
fields = ["uuid", "file"] fields = ["uuid", "file"]
extra_kwargs = {"uuid": {"read_only": True}} extra_kwargs = {"uuid": {"read_only": True}}
class DocxStateSerializer(serializers.ModelSerializer):
    """Serialize only the paragraph counters of a ``Docx`` record."""

    class Meta:
        model = Docx
        # Exactly these two model fields are exposed.
        fields = [
            "paragraphs_loaded",
            "paragraphs_processed",
        ]

View File

@ -1,11 +1,31 @@
from rest_framework import generics from rest_framework import generics
from rest_framework.response import Response
from rest_framework.views import APIView
from rest_framework.generics import get_object_or_404
from rest_framework.parsers import MultiPartParser, FormParser from rest_framework.parsers import MultiPartParser, FormParser
from checker.api.serializers import DocxSerializer from checker.api.serializers import DocxSerializer, DocxStateSerializer
from checker.models import Docx from checker.models import Docx, ParagraphType
class ListCreateDocxApiView(generics.ListCreateAPIView):
    """List ``Docx`` records and create new ones via ``DocxSerializer``."""

    queryset = Docx.objects.all()
    serializer_class = DocxSerializer
    # Only form-encoded and multipart request bodies are parsed.
    parser_classes = [FormParser, MultiPartParser]
class GetDocxState(generics.RetrieveAPIView):
    """Retrieve one ``Docx``, looked up by ``uuid``, as ``DocxStateSerializer`` data."""

    serializer_class = DocxStateSerializer
    queryset = Docx.objects.all()
    # The URL keyword argument is matched against the model's ``uuid`` field.
    lookup_field = "uuid"
class RetireDocxSerializer(APIView):
    """Return the stored paragraphs of one document grouped by paragraph type."""

    def get(self, request, uuid):
        """Build ``{type name: [paragraph text, ...]}`` for the document ``uuid``.

        Responds with 404 when no ``Docx`` has that uuid.
        """
        doc = get_object_or_404(Docx, uuid=uuid)
        # Filtering across the multi-valued ``paragraphs`` relation joins one
        # row per matching paragraph, so without ``distinct()`` every type is
        # yielded (and its paragraphs re-queried) once per paragraph.
        paragraph_types = ParagraphType.objects.filter(
            paragraphs__docx=doc
        ).distinct()
        return Response(
            {
                p_type.name: [p.text for p in p_type.paragraphs.filter(docx=doc)]
                for p_type in paragraph_types
            }
        )

View File

@ -2,16 +2,24 @@ import uuid as uuid
from django.db import models from django.db import models
# Create your models here. # Create your models here.
from checker.services.file import media_upload_path
class Docx(models.Model):
    """An uploaded file plus two integer paragraph counters."""

    # The UUID is the primary key and is generated on creation.
    uuid = models.UUIDField(
        primary_key=True, unique=True, editable=False, default=uuid.uuid4
    )
    # The storage path is computed by ``media_upload_path``.
    file = models.FileField(upload_to=media_upload_path)
    created = models.DateTimeField(auto_now_add=True)
    paragraphs_processed = models.IntegerField(default=0)
    paragraphs_loaded = models.IntegerField(default=0)

    class Meta:
        # Default ordering: most recently created first.
        ordering = ["-created"]

    def __str__(self):
        return f"{self.uuid}"
class ParagraphType(models.Model): class ParagraphType(models.Model):

View File

@ -1,3 +1,8 @@
import os
from checker.services.generators import generate_charset
def process_paragraphs(text): def process_paragraphs(text):
paragraphs = {} paragraphs = {}
c = 0 c = 0
@ -13,3 +18,7 @@ def process_paragraphs(text):
if c: if c:
paragraphs[c] += line paragraphs[c] += line
return paragraphs return paragraphs
def media_upload_path(instance, filename):
    """Return the storage path for an uploaded file.

    The file keeps its name and is placed under ``uploads/`` in a
    sub-directory named by ``generate_charset(7)``. ``instance`` is unused.
    """
    folder = f"uploads/{generate_charset(7)}/"
    return os.path.join(folder, filename)

View File

@ -0,0 +1,7 @@
import random
import string
def generate_charset(length: int) -> str:
    """Return ``length`` random ASCII letters joined into one string."""
    alphabet = string.ascii_letters
    # One ``random.choice`` call per character, in order.
    picked = [random.choice(alphabet) for _ in range(length)]
    return "".join(picked)

View File

@ -1,19 +1,18 @@
from pprint import pprint import asyncio
import requests
import docx2txt import docx2txt
from django.conf import settings
from django.db.models.signals import post_save from django.db.models.signals import post_save
from django.dispatch import receiver from django.dispatch import receiver
from checker.models import Docx from checker.models import Docx, Paragraph
from checker.services.file import process_paragraphs from checker.services.file import process_paragraphs
from checker.tasks import process_file
import threading
import asyncio
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """Queue background processing for a newly created ``Docx``.

    Runs on every ``post_save`` of ``Docx``; only the initial creation
    enqueues the ``process_file`` task, later saves are ignored.
    """
    if not created:
        return
    # ``(instance.pk)`` is just a parenthesised value, not a tuple, while
    # ``apply_async`` expects a sequence of positional arguments. The key is
    # sent as a string so it survives JSON task serialization.
    process_file.apply_async(args=(str(instance.pk),))

View File

@ -1,10 +1,32 @@
from time import sleep
import docx2txt
import requests
from celery import shared_task from celery import shared_task
from uuid import uuid4
from checker.models import Docx from django.conf import settings
from checker.models import Paragraph, Docx
from checker.services.file import process_paragraphs
@shared_task()
def process_file(pk: str):
    """Split an uploaded document into paragraphs and store their types.

    Loads the ``Docx`` with primary key ``pk``, extracts its text, posts the
    paragraphs to the external service and creates one ``Paragraph`` per
    entry of the response. ``paragraphs_loaded`` is saved before the request
    and ``paragraphs_processed`` after all rows are created.

    Returns the primary key as a string, so the task result stays
    JSON-serializable (a model instance is not).

    Raises ``Docx.DoesNotExist`` for an unknown ``pk``, ``requests``
    exceptions on transport or HTTP failure, and ``KeyError`` when the
    service answers with a paragraph id that was not sent.
    """
    file = Docx.objects.get(pk=pk)
    document = docx2txt.process(file.file.path)
    paragraphs = process_paragraphs(document.split("\n"))

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

    # Prefer the configured endpoint; fall back to the previously hard-coded one.
    url = getattr(settings, "AI_URL", "http://185.244.175.164:5000/api")
    # Without a timeout a stalled service would block the worker until the
    # Celery task time limit kills it.
    response = requests.post(url, json=paragraphs, timeout=(10, 600))
    response.raise_for_status()

    # JSON object keys are always strings, so the ids coming back cannot be
    # used directly against paragraphs keyed by another type (e.g. int).
    text_by_id = {str(key): text for key, text in paragraphs.items()}
    for el_id, type_id in response.json().items():
        Paragraph.objects.create(
            type_id=type_id, docx=file, text=text_by_id[str(el_id)]
        )

    file.paragraphs_processed = len(paragraphs)
    file.save(update_fields=["paragraphs_processed"])
    return str(file.pk)

View File

@ -1,8 +1,10 @@
from django.urls import path, include from django.urls import path, include
from checker.api.views import ListCreateDocxApiView from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState
urlpatterns = [ urlpatterns = [
path("health/", include("health_check.urls")), path("health/", include("health_check.urls")),
path("docx/", ListCreateDocxApiView.as_view(), name="list_create_docx") path("docx/", ListCreateDocxApiView.as_view(), name="list_create_docx"),
path("docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_docx"),
path("state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_docx"),
] ]

View File

@ -5,7 +5,7 @@ from celery import Celery
# Set the default Django settings module for the 'celery' program. # Set the default Django settings module for the 'celery' program.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "conf.settings.local") os.environ.setdefault("DJANGO_SETTINGS_MODULE", "conf.settings.local")
app = Celery("mistake_checker_hack_backend") app = Celery("conf")
# Using a string here means the worker doesn't have to serialize # Using a string here means the worker doesn't have to serialize
# the configuration object to child processes. # the configuration object to child processes.

View File

@ -203,3 +203,6 @@ CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_TIMEZONE = "Europe/Moscow" CELERY_TIMEZONE = "Europe/Moscow"
CELERY_TASK_TRACK_STARTED = True CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60 CELERY_TASK_TIME_LIMIT = 30 * 60
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'

View File

@ -15,3 +15,4 @@ psutil
dj-database-url dj-database-url
uuid uuid
docx2txt docx2txt
requests-async