implemented word endpoints

This commit is contained in:
Alexander Karpov 2022-08-27 11:59:23 +03:00
parent cdf082415c
commit 1d859084d1
7 changed files with 148 additions and 14 deletions

View File

@ -1,6 +1,6 @@
from rest_framework import serializers from rest_framework import serializers
from checker.models import Docx from checker.models import Docx, WordDocx
class DocxSerializer(serializers.ModelSerializer): class DocxSerializer(serializers.ModelSerializer):
@ -14,3 +14,22 @@ class DocxStateSerializer(serializers.ModelSerializer):
class Meta: class Meta:
model = Docx model = Docx
fields = ["paragraphs_loaded", "paragraphs_processed"] fields = ["paragraphs_loaded", "paragraphs_processed"]
class WordDocxSerializer(serializers.ModelSerializer):
    """Serializer used to create a ``WordDocx`` from submitted text.

    Accepts ``text`` on input only and exposes the generated ``uuid`` on
    output. The incoming text is converted to ``bytes`` by ``validate_text``
    before it is handed to the model.
    """

    # ``write_only`` has to be set on the field itself: DRF ignores
    # ``Meta.extra_kwargs`` entries for fields that are declared explicitly
    # on the serializer, so the previous ``extra_kwargs["text"]`` entry (and
    # the unrecognised ``Meta.write_only`` option) had no effect.
    text = serializers.CharField(write_only=True)

    class Meta:
        model = WordDocx
        fields = ["text", "uuid"]
        extra_kwargs = {"uuid": {"read_only": True}}

    def validate_text(self, val):
        """Return the submitted text encoded to ``bytes``."""
        return str(val).encode()
class WordDocxStateSerializer(serializers.ModelSerializer):
    """Expose only the paragraph progress counters of a ``WordDocx``."""

    class Meta:
        fields = [
            "paragraphs_loaded",
            "paragraphs_processed",
        ]
        model = WordDocx

View File

@ -4,8 +4,13 @@ from rest_framework.views import APIView
from rest_framework.generics import get_object_or_404 from rest_framework.generics import get_object_or_404
from rest_framework.parsers import MultiPartParser, FormParser from rest_framework.parsers import MultiPartParser, FormParser
from checker.api.serializers import DocxSerializer, DocxStateSerializer from checker.api.serializers import (
from checker.models import Docx, ParagraphType DocxSerializer,
DocxStateSerializer,
WordDocxSerializer,
WordDocxStateSerializer,
)
from checker.models import Docx, ParagraphType, WordDocx
class ListCreateDocxApiView(generics.ListCreateAPIView): class ListCreateDocxApiView(generics.ListCreateAPIView):
@ -29,3 +34,25 @@ class RetireDocxSerializer(APIView):
res[p.name] = [(x.text, x.score) for x in p.paragraphs.filter(docx=doc)] res[p.name] = [(x.text, x.score) for x in p.paragraphs.filter(docx=doc)]
return Response(res) return Response(res)
class ListCreateWordDocxApiView(generics.ListCreateAPIView):
    """List and create ``WordDocx`` objects from form or multipart requests."""

    queryset = WordDocx.objects.all()
    serializer_class = WordDocxSerializer
    parser_classes = [FormParser, MultiPartParser]
class GetWordDocxState(generics.RetrieveAPIView):
    """Retrieve a single ``WordDocx``, looked up by ``uuid``, as its state."""

    queryset = WordDocx.objects.all()
    serializer_class = WordDocxStateSerializer
    lookup_field = "uuid"
class RetireWordDocxSerializer(APIView):
    """Return the stored paragraphs of one ``WordDocx`` grouped by type name."""

    # TODO create base class
    def get(self, request, uuid):
        """Respond with ``{type name: [(text, score), ...]}`` for the document.

        The document is fetched with ``get_object_or_404`` using ``uuid``;
        every ``ParagraphType`` contributes one key, whose list holds the
        ``word_paragraphs`` of that type that belong to the document.
        """
        document = get_object_or_404(WordDocx, uuid=uuid)
        grouped = {
            kind.name: [
                (item.text, item.score)
                for item in kind.word_paragraphs.filter(docx=document)
            ]
            for kind in ParagraphType.objects.all()
        }
        return Response(grouped)

View File

@ -24,6 +24,23 @@ class Docx(models.Model):
ordering = ["-created"] ordering = ["-created"]
class WordDocx(models.Model):
    """Binary text payload keyed by UUID, with paragraph progress counters."""

    # Primary key; generated with uuid4 and not editable.
    uuid = models.UUIDField(
        primary_key=True,
        unique=True,
        editable=False,
        default=uuid.uuid4,
    )
    # Raw content stored as binary data.
    text = models.BinaryField()
    created = models.DateTimeField(auto_now_add=True)

    # Progress counters, both starting at zero.
    paragraphs_processed = models.IntegerField(default=0)
    paragraphs_loaded = models.IntegerField(default=0)

    class Meta:
        # Newest records first.
        ordering = ["-created"]

    def __str__(self):
        return f"{self.uuid}"
class ParagraphType(models.Model): class ParagraphType(models.Model):
name = models.CharField(max_length=200) name = models.CharField(max_length=200)
@ -32,9 +49,24 @@ class ParagraphType(models.Model):
class Paragraph(models.Model): class Paragraph(models.Model):
score = models.IntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]) score = models.IntegerField(
default=0, validators=[MinValueValidator(0), MaxValueValidator(100)]
)
text = models.TextField() text = models.TextField()
type = models.ForeignKey( type = models.ForeignKey(
ParagraphType, related_name="paragraphs", on_delete=models.CASCADE ParagraphType, related_name="paragraphs", on_delete=models.CASCADE
) )
docx = models.ForeignKey(Docx, related_name="paragraphs", on_delete=models.CASCADE) docx = models.ForeignKey(Docx, related_name="paragraphs", on_delete=models.CASCADE)
class WordParagraph(models.Model):
    """One scored paragraph of a ``WordDocx``, linked to a ``ParagraphType``."""

    text = models.TextField()
    # Reverse accessor on ParagraphType is ``word_paragraphs``.
    type = models.ForeignKey(
        ParagraphType,
        on_delete=models.CASCADE,
        related_name="word_paragraphs",
    )
    # Validated to lie within 0..100.
    score = models.IntegerField(
        validators=[MinValueValidator(0), MaxValueValidator(100)],
        default=0,
    )
    # Reverse accessor on WordDocx is ``paragraphs``.
    docx = models.ForeignKey(
        WordDocx,
        on_delete=models.CASCADE,
        related_name="paragraphs",
    )

View File

@ -3,8 +3,7 @@ import os
from checker.services.generators import generate_charset from checker.services.generators import generate_charset
def process_paragraphs(text): def _base_process(text):
text = text.split("\n")
paragraphs = {} paragraphs = {}
c = 1 c = 1
title = True title = True
@ -19,5 +18,16 @@ def process_paragraphs(text):
return paragraphs return paragraphs
def process_paragraphs(text):
    """Split ``text`` on newlines and return ``_base_process`` of the lines."""
    return _base_process(text.split("\n"))
def process_word_paragraphs(text):
    """Split Word-sourced ``text`` into chunks and run ``_base_process`` on them.

    The separator is the two-character sequence of a literal backslash
    followed by the letter ``r``, not an actual carriage-return character.

    Returns whatever ``_base_process`` returns for the list of chunks.
    """
    import logging

    # NOTE(review): presumably the client sends carriage returns in escaped
    # form, which is why the literal two-character sequence is used here;
    # confirm against the caller before changing the separator.
    chunks = text.split("\\r")
    # Log only the chunk count; the previous debug ``print`` wrote the whole
    # document content to stdout.
    logging.getLogger(__name__).debug("word text split into %d chunks", len(chunks))
    return _base_process(chunks)
def media_upload_path(instance, filename): def media_upload_path(instance, filename):
return os.path.join(f"uploads/{generate_charset(7)}/", filename) return os.path.join(f"uploads/{generate_charset(7)}/", filename)

View File

@ -1,8 +1,8 @@
from django.db.models.signals import post_save from django.db.models.signals import post_save
from django.dispatch import receiver from django.dispatch import receiver
from checker.models import Docx from checker.models import Docx, WordDocx
from checker.tasks import process_file from checker.tasks import process_file, process_word
@receiver(post_save, sender=Docx) @receiver(post_save, sender=Docx)
@ -10,3 +10,11 @@ def create_docs(sender, instance, created, **kwargs):
if created: if created:
process_file.apply_async(kwargs={"pk": instance.pk}) process_file.apply_async(kwargs={"pk": instance.pk})
return return
@receiver(post_save, sender=WordDocx)
def create_word_docs(sender, instance, created, **kwargs):
    """Queue the ``process_word`` task when a new ``WordDocx`` row is created.

    The function deliberately does not reuse the name ``create_docs``: the
    ``Docx`` receiver in this module already has that name, and signals keep
    only weak references to receivers by default, so rebinding the name would
    let the ``Docx`` receiver be garbage-collected and silently disconnected.
    """
    if created:
        process_word.apply_async(kwargs={"pk": instance.pk})

View File

@ -2,8 +2,8 @@ import docx2txt
import requests import requests
from celery import shared_task from celery import shared_task
from checker.models import Paragraph, Docx from checker.models import Paragraph, Docx, WordDocx, WordParagraph
from checker.services.file import process_paragraphs from checker.services.file import process_paragraphs, process_word_paragraphs
@shared_task() @shared_task()
@ -40,3 +40,40 @@ def process_file(pk: int):
print(f"AI server error, {x.status_code}") print(f"AI server error, {x.status_code}")
return f"ok, {pk}" return f"ok, {pk}"
@shared_task()
def process_word(pk: int):
    """Split a stored ``WordDocx`` into paragraphs and score them in batches.

    The document text is decoded, split with ``process_word_paragraphs`` and
    sent to the AI service in batches of ``cut`` paragraphs. For every batch
    answered with HTTP 200 one ``WordParagraph`` row is created per returned
    item and ``paragraphs_processed`` is advanced; other status codes are
    reported and the batch is skipped.

    Args:
        pk: Primary key of the ``WordDocx`` to process.
            NOTE(review): the model's primary key is a UUID field, so the
            ``int`` annotation looks inaccurate; confirm before changing it.

    Returns:
        The string ``"ok, <pk>"``.
    """
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    # bytes() accepts both ``bytes`` and ``memoryview``, whereas ``.tobytes()``
    # exists only on ``memoryview``.
    paragraphs = process_word_paragraphs(bytes(file.text).decode())

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

    cut = 100
    counter = 0
    paragraphs = list(paragraphs.values())
    len_c = len(paragraphs)
    # Step through non-overlapping windows. The previous slice ended at
    # ``(i + 1) * cut + 1`` and therefore re-sent the first paragraph of each
    # following batch, and the ``len // cut + 1`` range produced an empty
    # trailing batch whenever the count was a multiple of ``cut``.
    for start in range(0, len_c, cut):
        vals = paragraphs[start : start + cut]
        # Keys are batch-local indices; the service echoes them back as strings.
        dct = dict(enumerate(vals))
        response = requests.post("http://109.248.175.223:5000/api", json=dct)
        if response.status_code != 200:
            print(f"AI server error, {response.status_code}")
            continue

        for el_id, dat in response.json().items():
            type_id, score = dat
            WordParagraph.objects.create(
                type_id=type_id, docx=file, text=dct[int(el_id)], score=score
            )

        counter += len(vals)
        print(f"processing {uuid}, {counter}/{len_c}")
        file.paragraphs_processed = counter
        file.save(update_fields=["paragraphs_processed"])

    return f"ok, {pk}"

View File

@ -1,6 +1,7 @@
from django.urls import path, include from django.urls import path, include
from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState from checker.api.views import ListCreateDocxApiView, RetireDocxSerializer, GetDocxState, ListCreateWordDocxApiView, \
GetWordDocxState, RetireWordDocxSerializer
urlpatterns = [ urlpatterns = [
path("health/", include("health_check.urls")), path("health/", include("health_check.urls")),
@ -22,12 +23,12 @@ urlpatterns = [
"word/", "word/",
include( include(
[ [
path("docx/", ListCreateDocxApiView.as_view(), name="list_create_word"), path("docx/", ListCreateWordDocxApiView.as_view(), name="list_create_word"),
path( path(
"docx/<uuid:uuid>", RetireDocxSerializer.as_view(), name="get_word" "docx/<uuid:uuid>", GetWordDocxState.as_view(), name="get_word"
), ),
path( path(
"state/<uuid:uuid>", GetDocxState.as_view(), name="get_state_word" "state/<uuid:uuid>", RetireWordDocxSerializer.as_view(), name="get_state_word"
), ),
] ]
), ),