Mirror of https://github.com/Ai-hack-MAGNUM-OPUS/backend.git, synced 2024-11-22 08:16:35 +03:00
fixed parser, added file processing state
This commit is contained in:
parent 477afd4278 · commit f06651d1a9
@@ -4,19 +4,18 @@ from checker.services.generators import generate_charset
 def process_paragraphs(text):
+    text = text.split("\n")
     paragraphs = {}
-    c = 0
+    c = 1
+    title = True
     for line in text:
-        ind = line[:2]
-        if len(ind) == 2 and ind[1] == ".":
-            try:
-                ind = int(ind[0])
-                c = ind
-                paragraphs[c] = ""
-            except ValueError:
-                print()
-        if c:
-            paragraphs[c] += line
+        if title:
+            if line and len(line) > 2 and line[:2] == "1.":
+                title = False
+        else:
+            if line:
+                paragraphs[c] = line
+                c += 1
     return paragraphs
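The rewritten parser now takes the raw docx2txt string and splits it on newlines itself (the Celery task drops its own .split("\n") accordingly, see below). Read with the nesting suggested by the rendered diff, it skips everything up to the first line that starts with "1." and then stores each following non-empty line as its own numbered paragraph. A rough usage sketch; the sample text and the assumed indentation are not taken from the repository:

    # Usage sketch only; the sample document and the if/else nesting are
    # assumptions, since the mirrored diff drops leading whitespace.
    from checker.services.file import process_paragraphs

    raw = (
        "Document title\n"
        "Table of contents\n"
        "1. Scope\n"
        "The first real paragraph of the body.\n"
        "\n"
        "Another paragraph.\n"
    )

    result = process_paragraphs(raw)
    # Under that reading: {1: "The first real paragraph of the body.",
    #                      2: "Another paragraph."}
    print(result)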
@@ -1,18 +1,12 @@
-import asyncio
-
-import docx2txt
 from django.db.models.signals import post_save
 from django.dispatch import receiver
 
-from checker.models import Docx, Paragraph
-from checker.services.file import process_paragraphs
+from checker.models import Docx
 from checker.tasks import process_file
-import threading
-import asyncio
 
 
 @receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
     if created:
-        process_file.apply_async((instance.pk))
+        process_file.apply_async(kwargs={"pk": instance.pk})
     return
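The signal handler no longer needs the processing imports, since all the work happens in the Celery task; only the dispatch call changes. The old call looked like it passed an args tuple but did not: (instance.pk) without a trailing comma is just the integer itself. A minimal sketch of the two forms against a stub task; the broker URL and the task body are placeholders, not the project's configuration:

    # Stub task to illustrate the apply_async call-site fix; everything
    # here except the two call forms is assumed, not taken from the repo.
    from celery import Celery

    app = Celery("sketch", broker="memory://")

    @app.task
    def process_file(pk: int):
        return f"ok, {pk}"

    pk = 42
    # before: (pk) is not a tuple, so this hands Celery a bare int where it
    # expects a list/tuple of positional arguments
    # process_file.apply_async((pk))
    # after, as in the commit: explicit keyword arguments
    process_file.apply_async(kwargs={"pk": pk})
    # equivalent spellings: process_file.apply_async(args=(pk,)) or process_file.delay(pk)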
@@ -1,11 +1,7 @@
-from time import sleep
-
 import docx2txt
 import requests
 from celery import shared_task
 
-from django.conf import settings
-
 from checker.models import Paragraph, Docx
 from checker.services.file import process_paragraphs
@@ -13,20 +9,28 @@ from checker.services.file import process_paragraphs
 @shared_task()
 def process_file(pk: int):
     file = Docx.objects.get(pk=pk)
+    uuid = file.uuid
     document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document.split("\n"))
+    paragraphs = process_paragraphs(document)
 
     file.paragraphs_loaded = len(paragraphs)
     file.save(update_fields=["paragraphs_loaded"])
 
-    x = requests.post("http://185.244.175.164:5000/api", json=paragraphs)
-    for el_id, type_id in x.json().items():
-        Paragraph.objects.create(
-            type_id=type_id, docx=file, text=paragraphs[el_id]
-        )
+    cut = 100
+    counter = 0
+    len_c = len(paragraphs)
+    paragraphs = list(paragraphs.values())
+    for i in range(0, len(paragraphs) // cut + 1):
+        vals = paragraphs[i * cut : (i + 1) * cut + 1]
+        dct = {x: vals[x] for x in range(len(vals))}
 
-    file.paragraphs_processed = len(paragraphs)
+        x = requests.post("http://109.248.175.223:5000/api", json=dct)
+        for el_id, type_id in x.json().items():
+            Paragraph.objects.create(type_id=type_id, docx=file, text=dct[int(el_id)])
+
+        counter += len(vals)
+        print(f"processing {uuid}, {counter}/{len_c}")
+        file.paragraphs_processed = counter
     file.save(update_fields=["paragraphs_processed"])
 
-    return file
+    return f"ok, {pk}"
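Instead of one POST with the whole document, the task now walks the parsed paragraphs in batches of cut = 100, re-keys each batch from 0, posts it to the AI service, and creates Paragraph rows from the returned {index: type_id} mapping while counting progress. A standalone sketch of just the batching arithmetic, with stub data in place of the parser output and the HTTP call; the response format is inferred from the diff:

    # Batching arithmetic only; the paragraph contents, the service call and
    # the progress line are stubbed or paraphrased from the diff above.
    cut = 100
    parsed = {i: f"paragraph {i}" for i in range(1, 251)}  # stand-in for process_paragraphs()

    counter = 0
    len_c = len(parsed)
    values = list(parsed.values())

    for i in range(0, len(values) // cut + 1):
        # note: with the upper bound (i + 1) * cut + 1, as written in the diff,
        # neighbouring batches share one element
        vals = values[i * cut : (i + 1) * cut + 1]
        dct = {x: vals[x] for x in range(len(vals))}  # 0-based keys sent as JSON
        # the real task does: x = requests.post(<AI service URL>, json=dct), then
        # Paragraph.objects.create(...) for each (el_id, type_id) in x.json().items()
        counter += len(vals)
        print(f"processing batch {i}: {counter}/{len_c}")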
@@ -4,7 +4,7 @@ from pathlib import Path
 ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
 
-AI_URL = "http://185.244.175.164:5000/api"
+AI_URL = "http://109.248.175.223:5000/api"
 # AI_URL = "http://127.0.0.1:5000"
 
 APPS_DIR = ROOT_DIR
@@ -64,7 +64,7 @@ THIRD_PARTY_APPS = [
     "rest_framework",
     "drf_yasg",
     "corsheaders",
-    "django_celery_results"
+    "django_celery_results",
 ]
 
 HEALTH_CHECKS = [
@@ -199,10 +199,11 @@ CORS_ALLOW_ALL_ORIGINS = True
 
 
 # Celery
-CELERY_BROKER_URL = 'redis://localhost:6379/0'
+CELERY_BROKER_URL = "redis://localhost:6379/0"
 CELERY_TIMEZONE = "Europe/Moscow"
 CELERY_TASK_TRACK_STARTED = True
 CELERY_TASK_TIME_LIMIT = 30 * 60
-CELERY_ACCEPT_CONTENT = ['json']
-CELERY_TASK_SERIALIZER = 'json'
-CELERY_RESULT_SERIALIZER = 'json'
+CELERY_ACCEPT_CONTENT = ["json"]
+CELERY_TASK_SERIALIZER = "json"
+CELERY_RESULT_SERIALIZER = "json"
+CELERY_RESULT_BACKEND = "django-db"
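The new CELERY_RESULT_BACKEND = "django-db" stores task results through django_celery_results, which is why that app gains its entry (and a trailing comma) in THIRD_PARTY_APPS above. These CELERY_-prefixed settings only take effect if the Celery app reads Django settings under that namespace; the project's celery module is not part of this commit, so the wiring below is the usual pattern and the settings module path is an assumption:

    # Assumed config/celery.py wiring (standard Celery + Django pattern,
    # not shown in this commit); the settings module path is a guess.
    import os

    from celery import Celery

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")

    app = Celery("config")
    # the CELERY_ prefix used in settings.py matches this namespace
    app.config_from_object("django.conf:settings", namespace="CELERY")
    app.autodiscover_tasks()

With the django-db backend, django_celery_results also needs its migrations applied (python manage.py migrate) before results can be stored.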