From 1d859084d1c65e987181a038a0d1ad2854a543fc Mon Sep 17 00:00:00 2001
From: Alexander-D-Karpov <alexandr.d.karpov@gmail.com>
Date: Sat, 27 Aug 2022 11:59:23 +0300
Subject: [PATCH] implemented word endpoints

---
 checker/api/serializers.py | 21 ++++++++++++++++++-
 checker/api/views.py       | 31 ++++++++++++++++++++++++++--
 checker/models.py          | 34 ++++++++++++++++++++++++++++++-
 checker/services/file.py   | 14 +++++++++++--
 checker/signals.py         | 12 +++++++++--
 checker/tasks.py           | 41 ++++++++++++++++++++++++++++++++++++--
 conf/api_router.py         |  9 +++++----
 7 files changed, 148 insertions(+), 14 deletions(-)

diff --git a/checker/api/serializers.py b/checker/api/serializers.py
index c03c649..b28bfa5 100644
--- a/checker/api/serializers.py
+++ b/checker/api/serializers.py
@@ -1,6 +1,6 @@
 from rest_framework import serializers
 
-from checker.models import Docx
+from checker.models import Docx, WordDocx
 
 
 class DocxSerializer(serializers.ModelSerializer):
@@ -14,3 +14,22 @@ class DocxStateSerializer(serializers.ModelSerializer):
     class Meta:
         model = Docx
         fields = ["paragraphs_loaded", "paragraphs_processed"]
+
+
+class WordDocxSerializer(serializers.ModelSerializer):
+    """Accept posted ``text`` to create a WordDocx; responses expose ``uuid``."""
+    text = serializers.CharField(write_only=True)  # extra_kwargs skips declared fields
+
+    class Meta:
+        model = WordDocx
+        fields = ["text", "uuid"]
+        extra_kwargs = {"uuid": {"read_only": True}}
+
+    def validate_text(self, val):
+        return str(val).encode()
+
+
+class WordDocxStateSerializer(serializers.ModelSerializer):
+    class Meta:
+        model = WordDocx
+        fields = ["paragraphs_loaded", "paragraphs_processed"]
diff --git a/checker/api/views.py b/checker/api/views.py
index ce80d4e..5861ce6 100644
--- a/checker/api/views.py
+++ b/checker/api/views.py
@@ -4,8 +4,13 @@ from rest_framework.views import APIView
 from rest_framework.generics import get_object_or_404
 from rest_framework.parsers import MultiPartParser, FormParser
 
-from checker.api.serializers import DocxSerializer, DocxStateSerializer
-from checker.models import Docx, ParagraphType
+from checker.api.serializers import (
+    DocxSerializer,
+    DocxStateSerializer,
+    WordDocxSerializer,
+    WordDocxStateSerializer,
+)
+from checker.models import Docx, ParagraphType, WordDocx
 
 
 class ListCreateDocxApiView(generics.ListCreateAPIView):
@@ -29,3 +34,25 @@ class RetireDocxSerializer(APIView):
             res[p.name] = [(x.text, x.score) for x in p.paragraphs.filter(docx=doc)]
         return Response(res)
 
+
+class ListCreateWordDocxApiView(generics.ListCreateAPIView):
+    parser_classes = [FormParser, MultiPartParser]
+    serializer_class = WordDocxSerializer
+    queryset = WordDocx.objects.all()
+
+
+class GetWordDocxState(generics.RetrieveAPIView):
+    lookup_field = "uuid"
+    queryset = WordDocx.objects.all()
+    serializer_class = WordDocxStateSerializer
+
+
+class RetireWordDocxSerializer(APIView):
+    # TODO create base class
+    def get(self, request, uuid):
+        doc = get_object_or_404(WordDocx, uuid=uuid)
+        res = {}
+        paragraphs = ParagraphType.objects.all()
+        for p in paragraphs:
+            res[p.name] = [(x.text, x.score) for x in p.word_paragraphs.filter(docx=doc)]
+        return Response(res)
diff --git a/checker/models.py b/checker/models.py
index 83d2027..e8b42fb 100644
--- a/checker/models.py
+++ b/checker/models.py
@@ -24,6 +24,23 @@ class Docx(models.Model):
         ordering = ["-created"]
 
 
+class WordDocx(models.Model):
+    uuid = models.UUIDField(
+        default=uuid.uuid4, editable=False, unique=True, primary_key=True
+    )
+    text = models.BinaryField()
+    created = models.DateTimeField(auto_now_add=True)
+
+    paragraphs_processed = models.IntegerField(default=0)
+    paragraphs_loaded = models.IntegerField(default=0)
+
+    def __str__(self):
+        return str(self.uuid)
+
+    class Meta:
+        ordering = ["-created"]
+
+
 class ParagraphType(models.Model):
     name = models.CharField(max_length=200)
 
@@ -32,9 +49,24 @@ class ParagraphType(models.Model):
 
 
 class Paragraph(models.Model):
-    score = models.IntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(100)])
+    score = models.IntegerField(
+        default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]
+    )
     text = models.TextField()
     type = models.ForeignKey(
         ParagraphType, related_name="paragraphs", on_delete=models.CASCADE
     )
     docx = models.ForeignKey(Docx, related_name="paragraphs", on_delete=models.CASCADE)
+
+
+class WordParagraph(models.Model):
+    text = models.TextField()
+    type = models.ForeignKey(
+        ParagraphType, related_name="word_paragraphs", on_delete=models.CASCADE
+    )
+    score = models.IntegerField(
+        default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]
+    )
+    docx = models.ForeignKey(
+        WordDocx, related_name="paragraphs", on_delete=models.CASCADE
+    )
diff --git a/checker/services/file.py b/checker/services/file.py
index 4cfaa67..b6c95f6 100644
--- a/checker/services/file.py
+++ b/checker/services/file.py
@@ -3,8 +3,7 @@ import os
 from checker.services.generators import generate_charset
 
 
-def process_paragraphs(text):
-    text = text.split("\n")
+def _base_process(text):
     paragraphs = {}
     c = 1
     title = True
@@ -19,5 +18,16 @@ def process_paragraphs(text):
     return paragraphs
 
 
+def process_paragraphs(text):
+    text = text.split("\n")
+    return _base_process(text)
+
+
+def process_word_paragraphs(text):
+    """Split Word text into pieces and hand them to ``_base_process``."""
+    # NOTE(review): the delimiter is the two characters '\' 'r', not a CR byte.
+    return _base_process(text.split("\\r"))
+
+
 def media_upload_path(instance, filename):
     return os.path.join(f"uploads/{generate_charset(7)}/", filename)
diff --git a/checker/signals.py b/checker/signals.py
index 83064e7..a7d02e4 100644
--- a/checker/signals.py
+++ b/checker/signals.py
@@ -1,8 +1,8 @@
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 
-from checker.models import Docx
-from checker.tasks import process_file
+from checker.models import Docx, WordDocx
+from checker.tasks import process_file, process_word
 
 
 @receiver(post_save, sender=Docx)
@@ -10,3 +10,11 @@ def create_docs(sender, instance, created, **kwargs):
     if created:
         process_file.apply_async(kwargs={"pk": instance.pk})
         return
+
+
+@receiver(post_save, sender=WordDocx)
+def create_word_docs(sender, instance, created, **kwargs):
+    """Queue ``process_word`` for a newly created WordDocx (unique handler name)."""
+    if created:
+        process_word.apply_async(kwargs={"pk": instance.pk})
+
diff --git a/checker/tasks.py b/checker/tasks.py
index a3f0325..3c90719 100644
--- a/checker/tasks.py
+++ b/checker/tasks.py
@@ -2,8 +2,8 @@ import docx2txt
 import requests
 from celery import shared_task
 
-from checker.models import Paragraph, Docx
-from checker.services.file import process_paragraphs
+from checker.models import Paragraph, Docx, WordDocx, WordParagraph
+from checker.services.file import process_paragraphs, process_word_paragraphs
 
 
 @shared_task()
@@ -40,3 +40,40 @@ def process_file(pk: int):
             print(f"AI server error, {x.status_code}")
 
     return f"ok, {pk}"
+
+
+@shared_task()
+def process_word(pk: int):
+    """Classify the paragraphs of a WordDocx in batches via the AI service.
+
+    Stores one WordParagraph per scored paragraph and keeps the
+    ``paragraphs_loaded`` / ``paragraphs_processed`` counters up to date.
+    """
+    file = WordDocx.objects.get(pk=pk)
+    # BinaryField yields bytes or memoryview depending on the DB backend.
+    paragraphs = process_word_paragraphs(bytes(file.text).decode())
+    paragraphs = list(paragraphs.values())
+    len_c = len(paragraphs)
+
+    file.paragraphs_loaded = len_c
+    file.save(update_fields=["paragraphs_loaded"])
+
+    cut = 100
+    counter = 0
+    # Step by ``cut`` so batches neither overlap nor end with an empty request.
+    for start in range(0, len_c, cut):
+        dct = dict(enumerate(paragraphs[start : start + cut]))
+        x = requests.post("http://109.248.175.223:5000/api", json=dct)
+        if x.status_code != 200:
+            print(f"AI server error, {x.status_code}")
+            continue
+        for el_id, (type_id, score) in x.json().items():
+            WordParagraph.objects.create(
+                type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+            )
+        counter += len(dct)
+        print(f"processing {file.uuid}, {counter}/{len_c}")
+        file.paragraphs_processed = counter
+        file.save(update_fields=["paragraphs_processed"])
+    return f"ok, {pk}"
+
diff --git a/conf/api_router.py b/conf/api_router.py
index 9c1407f..020a79b 100644
--- a/conf/api_router.py
+++ b/conf/api_router.py
@@ -1,6 +1,7 @@
 from django.urls import path, include
 
-from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState
+from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState, ListCreateWordDocxApiView, \
+    GetWordDocxState, RetireWordDocxSerializer
 
 urlpatterns = [
     path("health/", include("health_check.urls")),
@@ -22,12 +23,12 @@ urlpatterns = [
         "word/",
         include(
             [
-                path("docx/", ListCreateDocxApiView.as_view(), name="list_create_word"),
+                path("docx/", ListCreateWordDocxApiView.as_view(), name="list_create_word"),
                 path(
-                    "docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_word"
+                    "docx/<uuid:uuid>", RetireWordDocxSerializer.as_view(), name="get_word"
                 ),
                 path(
-                    "state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_word"
+                    "state/<uuid:uuid>", GetWordDocxState.as_view(), name="get_state_word"
                 ),
             ]
         ),