Mirror of https://github.com/Ai-hack-MAGNUM-OPUS/backend.git (synced 2024-11-24 01:03:44 +03:00)

implemented word endpoints

Commit 1d859084d1, parent cdf082415c
checker/api/serializers.py

@@ -1,6 +1,6 @@
 from rest_framework import serializers
 
-from checker.models import Docx
+from checker.models import Docx, WordDocx
 
 
 class DocxSerializer(serializers.ModelSerializer):
@@ -14,3 +14,22 @@ class DocxStateSerializer(serializers.ModelSerializer):
     class Meta:
         model = Docx
         fields = ["paragraphs_loaded", "paragraphs_processed"]
+
+
+class WordDocxSerializer(serializers.ModelSerializer):
+    text = serializers.CharField()
+
+    class Meta:
+        model = WordDocx
+        fields = ["text", "uuid"]
+        extra_kwargs = {"uuid": {"read_only": True}, "text": {"write_only": True}}
+        write_only = ["text"]
+
+    def validate_text(self, val):
+        return str(val).encode()
+
+
+class WordDocxStateSerializer(serializers.ModelSerializer):
+    class Meta:
+        model = WordDocx
+        fields = ["paragraphs_loaded", "paragraphs_processed"]
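The serializer accepts plain text but stores it as bytes: validate_text() encodes the submitted string so it fits the BinaryField added in checker/models.py, and process_word() in checker/tasks.py later decodes it before splitting. A minimal round-trip sketch of that convention (plain Python, no Django setup needed; the sample text is made up):

# Round-trip sketch: what validate_text() stores and what process_word() reads back.
raw = "Title\\rFirst paragraph\\rSecond paragraph"   # separators are a literal backslash + r

stored = str(raw).encode()    # validate_text(): str -> UTF-8 bytes for the BinaryField
restored = stored.decode()    # process_word(): bytes -> str before splitting
assert restored.split("\\r") == ["Title", "First paragraph", "Second paragraph"]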
checker/api/views.py

@@ -4,8 +4,13 @@ from rest_framework.views import APIView
 from rest_framework.generics import get_object_or_404
 from rest_framework.parsers import MultiPartParser, FormParser
 
-from checker.api.serializers import DocxSerializer, DocxStateSerializer
-from checker.models import Docx, ParagraphType
+from checker.api.serializers import (
+    DocxSerializer,
+    DocxStateSerializer,
+    WordDocxSerializer,
+    WordDocxStateSerializer,
+)
+from checker.models import Docx, ParagraphType, WordDocx
 
 
 class ListCreateDocxApiView(generics.ListCreateAPIView):
@@ -29,3 +34,25 @@ class RetireDocxSerializer(APIView):
             res[p.name] = [(x.text, x.score) for x in p.paragraphs.filter(docx=doc)]
         return Response(res)
+
+
+class ListCreateWordDocxApiView(generics.ListCreateAPIView):
+    parser_classes = [FormParser, MultiPartParser]
+    serializer_class = WordDocxSerializer
+    queryset = WordDocx.objects.all()
+
+
+class GetWordDocxState(generics.RetrieveAPIView):
+    lookup_field = "uuid"
+    queryset = WordDocx.objects.all()
+    serializer_class = WordDocxStateSerializer
+
+
+class RetireWordDocxSerializer(APIView):
+    # TODO create base class
+    def get(self, request, uuid):
+        doc = get_object_or_404(WordDocx, uuid=uuid)
+        res = {}
+        paragraphs = ParagraphType.objects.all()
+        for p in paragraphs:
+            res[p.name] = [(x.text, x.score) for x in p.word_paragraphs.filter(docx=doc)]
+        return Response(res)
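RetireWordDocxSerializer mirrors the existing Docx result view: for one document it groups scored paragraphs by ParagraphType name through the word_paragraphs reverse relation. A sketch of the dict it builds (values are made up; DRF's Response renders the tuples as JSON arrays):

# Illustrative shape of `res`: {paragraph type name: [(text, score), ...]}
res = {
    "Introduction": [("First paragraph of the document ...", 87)],
    "Conclusion": [("Closing paragraph ...", 64), ("Another closing paragraph ...", 91)],
}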
checker/models.py

@@ -24,6 +24,23 @@ class Docx(models.Model):
         ordering = ["-created"]
 
 
+class WordDocx(models.Model):
+    uuid = models.UUIDField(
+        default=uuid.uuid4, editable=False, unique=True, primary_key=True
+    )
+    text = models.BinaryField()
+    created = models.DateTimeField(auto_now_add=True)
+
+    paragraphs_processed = models.IntegerField(default=0)
+    paragraphs_loaded = models.IntegerField(default=0)
+
+    def __str__(self):
+        return str(self.uuid)
+
+    class Meta:
+        ordering = ["-created"]
+
+
 class ParagraphType(models.Model):
     name = models.CharField(max_length=200)
 
@@ -32,9 +49,24 @@ class ParagraphType(models.Model):
 
 
 class Paragraph(models.Model):
-    score = models.IntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(100)])
+    score = models.IntegerField(
+        default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]
+    )
     text = models.TextField()
     type = models.ForeignKey(
         ParagraphType, related_name="paragraphs", on_delete=models.CASCADE
     )
     docx = models.ForeignKey(Docx, related_name="paragraphs", on_delete=models.CASCADE)
+
+
+class WordParagraph(models.Model):
+    text = models.TextField()
+    type = models.ForeignKey(
+        ParagraphType, related_name="word_paragraphs", on_delete=models.CASCADE
+    )
+    score = models.IntegerField(
+        default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]
+    )
+    docx = models.ForeignKey(
+        WordDocx, related_name="paragraphs", on_delete=models.CASCADE
+    )
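WordDocx and WordParagraph parallel Docx and Paragraph, with the document body kept in a BinaryField and the UUID doubling as primary key. A Django shell sketch of the relations (assumes migrations for these models exist and that Celery tasks run eagerly or a broker is reachable, since saving a WordDocx also fires the post_save receiver added below):

from checker.models import ParagraphType, WordDocx, WordParagraph

doc = WordDocx.objects.create(text="Title\\rBody".encode())   # pk is the generated uuid
intro = ParagraphType.objects.create(name="Introduction")
WordParagraph.objects.create(type=intro, docx=doc, text="Body", score=75)

print(doc.uuid, doc.paragraphs_loaded, doc.paragraphs_processed)  # <uuid> 0 0
print(doc.paragraphs.count())          # reverse name on WordDocx is "paragraphs"
print(intro.word_paragraphs.count())   # reverse name on ParagraphType is "word_paragraphs"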
checker/services/file.py

@@ -3,8 +3,7 @@ import os
 from checker.services.generators import generate_charset
 
 
-def process_paragraphs(text):
-    text = text.split("\n")
+def _base_process(text):
     paragraphs = {}
     c = 1
     title = True
@@ -19,5 +18,16 @@ def process_paragraphs(text):
     return paragraphs
 
 
+def process_paragraphs(text):
+    text = text.split("\n")
+    return _base_process(text)
+
+
+def process_word_paragraphs(text):
+    text = text.split("\\r")
+    print(text)
+    return _base_process(text)
+
+
 def media_upload_path(instance, filename):
     return os.path.join(f"uploads/{generate_charset(7)}/", filename)
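The shared parsing logic moves into _base_process(); the two public helpers differ only in how they split the raw text first. Note that, as written, process_word_paragraphs() splits on the literal two-character sequence backslash + r, not on a carriage return. A small sketch of the difference (sample strings are made up):

# process_paragraphs(): split on real newlines
print("one\ntwo".split("\n"))      # ['one', 'two']

# process_word_paragraphs(): split on a literal backslash followed by "r"
print(r"one\rtwo".split("\\r"))    # ['one', 'two']
print("one\rtwo".split("\\r"))     # ['one\rtwo'], a real carriage return is NOT split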
Signal receivers (checker app)

@@ -1,8 +1,8 @@
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 
-from checker.models import Docx
-from checker.tasks import process_file
+from checker.models import Docx, WordDocx
+from checker.tasks import process_file, process_word
 
 
 @receiver(post_save, sender=Docx)
@@ -10,3 +10,11 @@ def create_docs(sender, instance, created, **kwargs):
     if created:
         process_file.apply_async(kwargs={"pk": instance.pk})
     return
+
+
+@receiver(post_save, sender=WordDocx)
+def create_docs(sender, instance, created, **kwargs):
+    if created:
+        process_word.apply_async(kwargs={"pk": instance.pk})
+    return
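The new receiver mirrors the Docx one: creating a WordDocx enqueues process_word. A hypothetical pytest-django sketch of that wiring (the patch target assumes the receivers live in a checker.signals module, which this excerpt does not confirm):

from unittest import mock

from checker.models import WordDocx


def test_creating_word_docx_enqueues_process_word(db):
    # Patch target is an assumption about the signals module path.
    with mock.patch("checker.signals.process_word.apply_async") as enqueue:
        doc = WordDocx.objects.create(text=b"Title")
    enqueue.assert_called_once_with(kwargs={"pk": doc.pk})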
checker/tasks.py

@@ -2,8 +2,8 @@ import docx2txt
 import requests
 from celery import shared_task
 
-from checker.models import Paragraph, Docx
-from checker.services.file import process_paragraphs
+from checker.models import Paragraph, Docx, WordDocx, WordParagraph
+from checker.services.file import process_paragraphs, process_word_paragraphs
 
 
 @shared_task()
@@ -40,3 +40,40 @@ def process_file(pk: int):
             print(f"AI server error, {x.status_code}")
 
     return f"ok, {pk}"
+
+
+@shared_task()
+def process_word(pk: int):
+    file = WordDocx.objects.get(pk=pk)
+    uuid = file.uuid
+    paragraphs = process_word_paragraphs(file.text.tobytes().decode())
+    print(paragraphs)
+
+    file.paragraphs_loaded = len(paragraphs)
+    file.save(update_fields=["paragraphs_loaded"])
+
+    cut = 100
+    counter = 0
+    len_c = len(paragraphs)
+    paragraphs = list(paragraphs.values())
+    for i in range(0, len(paragraphs) // cut + 1):
+        vals = paragraphs[i * cut : (i + 1) * cut + 1]
+        dct = {x: vals[x] for x in range(len(vals))}
+
+        x = requests.post("http://109.248.175.223:5000/api", json=dct)
+        if x.status_code == 200:
+            for el_id, dat in x.json().items():
+                type_id, score = dat
+                WordParagraph.objects.create(
+                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+                )
+
+            counter += len(vals)
+            print(f"processing {uuid}, {counter}/{len_c}")
+            file.paragraphs_processed = counter
+            file.save(update_fields=["paragraphs_processed"])
+        else:
+            print(f"AI server error, {x.status_code}")
+
+    return f"ok, {pk}"
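process_word() mirrors process_file(): it decodes the stored bytes, splits them into paragraphs, and posts them to the scoring service in chunks of `cut` items keyed by their index within the chunk, expecting {index: (type_id, score)} back. A standalone sketch of the chunking as written (five made-up paragraphs and cut=2 for readability):

paragraphs = ["p0", "p1", "p2", "p3", "p4"]
cut = 2
for i in range(0, len(paragraphs) // cut + 1):
    vals = paragraphs[i * cut : (i + 1) * cut + 1]
    dct = {x: vals[x] for x in range(len(vals))}
    print(dct)
# {0: 'p0', 1: 'p1', 2: 'p2'}   <- the "+ 1" in the slice makes neighbouring chunks overlap by one item
# {0: 'p2', 1: 'p3', 2: 'p4'}
# {0: 'p4'}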
URL patterns (word document routes)

@@ -1,6 +1,7 @@
 from django.urls import path, include
 
-from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState
+from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState, ListCreateWordDocxApiView, \
+    GetWordDocxState, RetireWordDocxSerializer
 
 urlpatterns = [
     path("health/", include("health_check.urls")),
@@ -22,12 +23,12 @@ urlpatterns = [
         "word/",
         include(
             [
-                path("docx/", ListCreateDocxApiView.as_view(), name="list_create_word"),
+                path("docx/", ListCreateWordDocxApiView.as_view(), name="list_create_word"),
                 path(
-                    "docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_word"
+                    "docx/<uuid:uuid>", GetWordDocxState.as_view(), name="get_word"
                 ),
                 path(
-                    "state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_word"
+                    "state/<uuid:uuid>", RetireWordDocxSerializer.as_view(), name="get_state_word"
                 ),
             ]
         ),
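With these routes in place, the word document flow is: POST the text to the list/create endpoint, then poll the other two routes for state and results by uuid. A request sketch for the create step (assumes a local dev server; the prefix in front of "word/" is outside this diff and is a guess):

import requests

resp = requests.post(
    "http://localhost:8000/word/docx/",         # hypothetical prefix; only "word/docx/" is defined here
    data={"text": "Title\\rFirst paragraph"},   # FormParser accepts form-encoded bodies
)
print(resp.status_code, resp.json())            # expected: 201 with the new uuid; "text" is write-only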