From 1d859084d1c65e987181a038a0d1ad2854a543fc Mon Sep 17 00:00:00 2001 From: Alexander-D-Karpov Date: Sat, 27 Aug 2022 11:59:23 +0300 Subject: [PATCH] implemented word endpoints --- checker/api/serializers.py | 21 ++++++++++++++++++- checker/api/views.py | 31 ++++++++++++++++++++++++++-- checker/models.py | 34 ++++++++++++++++++++++++++++++- checker/services/file.py | 14 +++++++++++-- checker/signals.py | 12 +++++++++-- checker/tasks.py | 41 ++++++++++++++++++++++++++++++++++++-- conf/api_router.py | 9 +++++---- 7 files changed, 148 insertions(+), 14 deletions(-) diff --git a/checker/api/serializers.py b/checker/api/serializers.py index c03c649..b28bfa5 100644 --- a/checker/api/serializers.py +++ b/checker/api/serializers.py @@ -1,6 +1,6 @@ from rest_framework import serializers -from checker.models import Docx +from checker.models import Docx, WordDocx class DocxSerializer(serializers.ModelSerializer): @@ -14,3 +14,22 @@ class DocxStateSerializer(serializers.ModelSerializer): class Meta: model = Docx fields = ["paragraphs_loaded", "paragraphs_processed"] + + +class WordDocxSerializer(serializers.ModelSerializer): + text = serializers.CharField() + + class Meta: + model = WordDocx + fields = ["text", "uuid"] + extra_kwargs = {"uuid": {"read_only": True}, "text": {"write_only": True}} + write_only = ["text"] + + def validate_text(self, val): + return str(val).encode() + + +class WordDocxStateSerializer(serializers.ModelSerializer): + class Meta: + model = WordDocx + fields = ["paragraphs_loaded", "paragraphs_processed"] diff --git a/checker/api/views.py b/checker/api/views.py index ce80d4e..5861ce6 100644 --- a/checker/api/views.py +++ b/checker/api/views.py @@ -4,8 +4,13 @@ from rest_framework.views import APIView from rest_framework.generics import get_object_or_404 from rest_framework.parsers import MultiPartParser, FormParser -from checker.api.serializers import DocxSerializer, DocxStateSerializer -from checker.models import Docx, ParagraphType +from checker.api.serializers import ( + DocxSerializer, + DocxStateSerializer, + WordDocxSerializer, + WordDocxStateSerializer, +) +from checker.models import Docx, ParagraphType, WordDocx class ListCreateDocxApiView(generics.ListCreateAPIView): @@ -29,3 +34,25 @@ class RetireDocxSerializer(APIView): res[p.name] = [(x.text, x.score) for x in p.paragraphs.filter(docx=doc)] return Response(res) + +class ListCreateWordDocxApiView(generics.ListCreateAPIView): + parser_classes = [FormParser, MultiPartParser] + serializer_class = WordDocxSerializer + queryset = WordDocx.objects.all() + + +class GetWordDocxState(generics.RetrieveAPIView): + lookup_field = "uuid" + queryset = WordDocx.objects.all() + serializer_class = WordDocxStateSerializer + + +class RetireWordDocxSerializer(APIView): + # TODO create base class + def get(self, request, uuid): + doc = get_object_or_404(WordDocx, uuid=uuid) + res = {} + paragraphs = ParagraphType.objects.all() + for p in paragraphs: + res[p.name] = [(x.text, x.score) for x in p.word_paragraphs.filter(docx=doc)] + return Response(res) diff --git a/checker/models.py b/checker/models.py index 83d2027..e8b42fb 100644 --- a/checker/models.py +++ b/checker/models.py @@ -24,6 +24,23 @@ class Docx(models.Model): ordering = ["-created"] +class WordDocx(models.Model): + uuid = models.UUIDField( + default=uuid.uuid4, editable=False, unique=True, primary_key=True + ) + text = models.BinaryField() + created = models.DateTimeField(auto_now_add=True) + + paragraphs_processed = models.IntegerField(default=0) + paragraphs_loaded = models.IntegerField(default=0) + + def __str__(self): + return str(self.uuid) + + class Meta: + ordering = ["-created"] + + class ParagraphType(models.Model): name = models.CharField(max_length=200) @@ -32,9 +49,24 @@ class ParagraphType(models.Model): class Paragraph(models.Model): - score = models.IntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]) + score = models.IntegerField( + default=0, validators=[MinValueValidator(0), MaxValueValidator(100)] + ) text = models.TextField() type = models.ForeignKey( ParagraphType, related_name="paragraphs", on_delete=models.CASCADE ) docx = models.ForeignKey(Docx, related_name="paragraphs", on_delete=models.CASCADE) + + +class WordParagraph(models.Model): + text = models.TextField() + type = models.ForeignKey( + ParagraphType, related_name="word_paragraphs", on_delete=models.CASCADE + ) + score = models.IntegerField( + default=0, validators=[MinValueValidator(0), MaxValueValidator(100)] + ) + docx = models.ForeignKey( + WordDocx, related_name="paragraphs", on_delete=models.CASCADE + ) diff --git a/checker/services/file.py b/checker/services/file.py index 4cfaa67..b6c95f6 100644 --- a/checker/services/file.py +++ b/checker/services/file.py @@ -3,8 +3,7 @@ import os from checker.services.generators import generate_charset -def process_paragraphs(text): - text = text.split("\n") +def _base_process(text): paragraphs = {} c = 1 title = True @@ -19,5 +18,16 @@ def process_paragraphs(text): return paragraphs +def process_paragraphs(text): + text = text.split("\n") + return _base_process(text) + + +def process_word_paragraphs(text): + text = text.split("\\r") + print(text) + return _base_process(text) + + def media_upload_path(instance, filename): return os.path.join(f"uploads/{generate_charset(7)}/", filename) diff --git a/checker/signals.py b/checker/signals.py index 83064e7..a7d02e4 100644 --- a/checker/signals.py +++ b/checker/signals.py @@ -1,8 +1,8 @@ from django.db.models.signals import post_save from django.dispatch import receiver -from checker.models import Docx -from checker.tasks import process_file +from checker.models import Docx, WordDocx +from checker.tasks import process_file, process_word @receiver(post_save, sender=Docx) @@ -10,3 +10,11 @@ def create_docs(sender, instance, created, **kwargs): if created: process_file.apply_async(kwargs={"pk": instance.pk}) return + + +@receiver(post_save, sender=WordDocx) +def create_docs(sender, instance, created, **kwargs): + if created: + process_word.apply_async(kwargs={"pk": instance.pk}) + return + diff --git a/checker/tasks.py b/checker/tasks.py index a3f0325..3c90719 100644 --- a/checker/tasks.py +++ b/checker/tasks.py @@ -2,8 +2,8 @@ import docx2txt import requests from celery import shared_task -from checker.models import Paragraph, Docx -from checker.services.file import process_paragraphs +from checker.models import Paragraph, Docx, WordDocx, WordParagraph +from checker.services.file import process_paragraphs, process_word_paragraphs @shared_task() @@ -40,3 +40,40 @@ def process_file(pk: int): print(f"AI server error, {x.status_code}") return f"ok, {pk}" + + +@shared_task() +def process_word(pk: int): + file = WordDocx.objects.get(pk=pk) + uuid = file.uuid + paragraphs = process_word_paragraphs(file.text.tobytes().decode()) + print(paragraphs) + + file.paragraphs_loaded = len(paragraphs) + file.save(update_fields=["paragraphs_loaded"]) + + cut = 100 + counter = 0 + len_c = len(paragraphs) + paragraphs = list(paragraphs.values()) + for i in range(0, len(paragraphs) // cut + 1): + vals = paragraphs[i * cut : (i + 1) * cut + 1] + dct = {x: vals[x] for x in range(len(vals))} + + x = requests.post("http://109.248.175.223:5000/api", json=dct) + if x.status_code == 200: + for el_id, dat in x.json().items(): + type_id, score = dat + WordParagraph.objects.create( + type_id=type_id, docx=file, text=dct[int(el_id)], score=score + ) + + counter += len(vals) + print(f"processing {uuid}, {counter}/{len_c}") + file.paragraphs_processed = counter + file.save(update_fields=["paragraphs_processed"]) + else: + print(f"AI server error, {x.status_code}") + + return f"ok, {pk}" + diff --git a/conf/api_router.py b/conf/api_router.py index 9c1407f..020a79b 100644 --- a/conf/api_router.py +++ b/conf/api_router.py @@ -1,6 +1,7 @@ from django.urls import path, include -from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState +from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState, ListCreateWordDocxApiView, \ + GetWordDocxState, RetireWordDocxSerializer urlpatterns = [ path("health/", include("health_check.urls")), @@ -22,12 +23,12 @@ urlpatterns = [ "word/", include( [ - path("docx/", ListCreateDocxApiView.as_view(), name="list_create_word"), + path("docx/", ListCreateWordDocxApiView.as_view(), name="list_create_word"), path( - "docx/", RetireDocxSerializer.as_view(), name="get_word" + "docx/", GetWordDocxState.as_view(), name="get_word" ), path( - "state/", GetDocxState.as_view(), name="get_state_word" + "state/", RetireWordDocxSerializer.as_view(), name="get_state_word" ), ] ),