fixed parser, added file processing state

This commit is contained in:
Alexander Karpov 2022-08-27 10:16:21 +03:00
parent 477afd4278
commit f06651d1a9
4 changed files with 36 additions and 38 deletions

View File

@ -4,19 +4,18 @@ from checker.services.generators import generate_charset
def process_paragraphs(text):
    """Split extracted document text into numbered body paragraphs.

    Every line up to and including the first line that starts with ``"1."``
    (and is longer than that bare prefix) is treated as the title block and
    skipped. Each non-empty line after that marker becomes one paragraph.

    Args:
        text: The document text as one newline-separated string. A sequence
            of lines that has already been split is accepted as well, so
            callers that split the text themselves keep working.

    Returns:
        dict: Paragraph texts keyed by consecutive integers starting at 1,
        in document order. Empty when no ``"1."`` marker line is found.
    """
    lines = text.split("\n") if isinstance(text, str) else text
    paragraphs = {}
    in_title = True
    for line in lines:
        if in_title:
            # The marker line only ends the title block; it is not stored
            # as a paragraph itself.
            if len(line) > 2 and line.startswith("1."):
                in_title = False
            continue
        if line:
            paragraphs[len(paragraphs) + 1] = line
    return paragraphs

View File

@ -1,18 +1,12 @@
import asyncio
import docx2txt
from django.db.models.signals import post_save from django.db.models.signals import post_save
from django.dispatch import receiver from django.dispatch import receiver
from checker.models import Docx, Paragraph from checker.models import Docx
from checker.services.file import process_paragraphs
from checker.tasks import process_file from checker.tasks import process_file
import threading
import asyncio
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """Queue background processing for a newly created ``Docx``.

    Connected to ``post_save`` for ``Docx``. Updates to existing rows are
    ignored; only the initial creation enqueues ``process_file``.

    Args:
        sender: The model class that sent the signal.
        instance: The ``Docx`` instance that was saved.
        created: ``True`` when the save inserted a new row.
        **kwargs: Remaining signal arguments (unused).
    """
    if not created:
        return

    from django.db import transaction

    pk = instance.pk
    # Enqueue only after the surrounding transaction (if any) commits, so the
    # worker cannot run ``Docx.objects.get(pk=pk)`` before the row is visible.
    # Without an open transaction the callback runs immediately.
    transaction.on_commit(lambda: process_file.apply_async(kwargs={"pk": pk}))

View File

@ -1,11 +1,7 @@
from time import sleep
import docx2txt import docx2txt
import requests import requests
from celery import shared_task from celery import shared_task
from django.conf import settings
from checker.models import Paragraph, Docx from checker.models import Paragraph, Docx
from checker.services.file import process_paragraphs from checker.services.file import process_paragraphs
@ -13,20 +9,28 @@ from checker.services.file import process_paragraphs
@shared_task()
def process_file(pk: int):
    """Classify the paragraphs of an uploaded document in batches.

    Loads the ``Docx`` row, extracts its text with ``docx2txt``, splits the
    text into paragraphs and posts them to the classification API in batches
    of ``cut`` items. One ``Paragraph`` row is created for every entry of
    each response, and ``paragraphs_processed`` is saved after every batch so
    progress is visible while the task is still running.

    Args:
        pk: Primary key of the ``Docx`` row to process.

    Returns:
        str: ``"ok, <pk>"`` once every batch has been handled.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
    paragraphs = list(process_paragraphs(document).values())
    total = len(paragraphs)

    file.paragraphs_loaded = total
    file.paragraphs_processed = 0
    file.save(update_fields=["paragraphs_loaded", "paragraphs_processed"])

    cut = 100
    counter = 0
    # Step by whole batches: slices never overlap, and no empty batch is sent
    # when the paragraph count is zero or an exact multiple of ``cut``.
    for start in range(0, total, cut):
        # Keys are positions inside the batch; JSON turns them into strings,
        # hence the ``int(...)`` when mapping the response back.
        batch = dict(enumerate(paragraphs[start : start + cut]))
        # NOTE(review): this address duplicates AI_URL from the project
        # settings; consider reading it from there.
        response = requests.post("http://109.248.175.223:5000/api", json=batch)
        response.raise_for_status()
        for el_id, type_id in response.json().items():
            Paragraph.objects.create(
                type_id=type_id, docx=file, text=batch[int(el_id)]
            )
        counter += len(batch)
        print(f"processing {uuid}, {counter}/{total}")
        file.paragraphs_processed = counter
        file.save(update_fields=["paragraphs_processed"])
    return f"ok, {pk}"

View File

@ -4,7 +4,7 @@ from pathlib import Path
# Third ancestor directory of this settings module's resolved path.
ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
# URL of the AI service API.
AI_URL = "http://109.248.175.223:5000/api"
# AI_URL = "http://127.0.0.1:5000"
APPS_DIR = ROOT_DIR
@ -64,7 +64,7 @@ THIRD_PARTY_APPS = [
"rest_framework", "rest_framework",
"drf_yasg", "drf_yasg",
"corsheaders", "corsheaders",
"django_celery_results" "django_celery_results",
] ]
HEALTH_CHECKS = [ HEALTH_CHECKS = [
@ -199,10 +199,11 @@ CORS_ALLOW_ALL_ORIGINS = True
# Celery
CELERY_BROKER_URL = "redis://localhost:6379/0"
CELERY_TIMEZONE = "Europe/Moscow"
CELERY_TASK_TRACK_STARTED = True
# 30 minutes, expressed in seconds.
CELERY_TASK_TIME_LIMIT = 30 * 60
# Only JSON is accepted and produced for task messages and results.
CELERY_ACCEPT_CONTENT = ["json"]
CELERY_TASK_SERIALIZER = "json"
CELERY_RESULT_SERIALIZER = "json"
# Presumably backed by the "django_celery_results" app — confirm it is installed.
CELERY_RESULT_BACKEND = "django-db"