Optimised file processing, added doc, odt -> docx converter, minor changes

This commit is contained in:
Alexander Karpov 2022-08-28 12:37:28 +03:00
parent 2bf771e3a2
commit 6cffc965a8
3 changed files with 121 additions and 56 deletions

View File

@ -1,4 +1,7 @@
import os
import re
import convertapi
from checker.services.generators import generate_charset
@ -28,5 +31,31 @@ def process_word_paragraphs(text):
return _base_process(text)
def doc_to_docx(file_path):
    """Convert a legacy Word ``.doc`` file to ``.docx`` through ConvertAPI.

    The converted document is saved next to the source file, using the same
    path with its final extension replaced by ``.docx``.

    Args:
        file_path: Path of the ``.doc`` file to convert.

    Returns:
        Path of the saved ``.docx`` file.
    """
    # SECURITY(review): the API secret is hard-coded in source. It should be
    # loaded from settings / the environment and the exposed value rotated.
    convertapi.api_secret = '0fp22XFRPwKmNJql'
    # Replace only the final extension. Splitting on the first "." truncated
    # any path containing a dot in a directory or in the file's base name
    # (e.g. "/srv/media.v2/a.doc" became "/srv/media.docx").
    target_path = os.path.splitext(file_path)[0] + ".docx"
    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
    result.file.save(target_path)
    return target_path
def doc_to_odt(file_path):
    """Convert an OpenDocument ``.odt`` file to ``.docx`` through ConvertAPI.

    NOTE: despite its name, this function converts *from* ODT *to* DOCX
    (``from_format='odt'``, target format ``'docx'``). The name is kept
    because callers import it under this identifier.

    The converted document is saved next to the source file, using the same
    path with its final extension replaced by ``.docx``.

    Args:
        file_path: Path of the ``.odt`` file to convert.

    Returns:
        Path of the saved ``.docx`` file.
    """
    # SECURITY(review): the API secret is hard-coded in source. It should be
    # loaded from settings / the environment and the exposed value rotated.
    convertapi.api_secret = '0fp22XFRPwKmNJql'
    # Replace only the final extension. Splitting on the first "." truncated
    # any path containing a dot in a directory or in the file's base name.
    target_path = os.path.splitext(file_path)[0] + ".docx"
    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
    result.file.save(target_path)
    return target_path
def media_upload_path(instance, filename):
    """Build the relative storage path for an uploaded file.

    The file is placed under ``uploads/<charset>/``, where ``<charset>`` is
    whatever ``generate_charset(7)`` returns (presumably a 7-character
    random string; confirm against ``checker.services.generators``).

    Args:
        instance: Model instance the file belongs to. Not used here.
        filename: Original name of the uploaded file.

    Returns:
        The upload path joined with ``filename``.
    """
    charset_dir = generate_charset(7)
    upload_dir = f"uploads/{charset_dir}/"
    return os.path.join(upload_dir, filename)
def split_text(text):
texts, groups = [], []
regt = re.findall(r"{(.*?)}(.*?){(.*?)}", text.replace('\n', ''))
for t in regt:
if t[0] == t[-1]:
return texts, groups

View File

@ -1,15 +1,31 @@
import magic
from django.db.models.signals import post_save
from django.core.files import File
from django.dispatch import receiver
from celery import chain
from checker.models import Docx, WordDocx
from checker.services.file import doc_to_docx, doc_to_odt
from checker.tasks import process_file, process_word, highlight_file
# Signal receiver for post_save on Docx: when a new row is created, convert
# .doc / .odt uploads to .docx and queue the processing tasks.
# NOTE(review): this span is rendered from a diff with markers and indentation
# stripped, so it may contain both the pre-change and post-change lines.
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
# Only act on newly created rows, not on later saves.
if created:
# NOTE(review): these two direct dispatches and the chain at the end of the
# function are both present in this view. If all of them run, every file is
# processed and highlighted twice. Confirm which variant is intended.
process_file.apply_async(kwargs={"pk": instance.pk})
highlight_file.apply_async(kwargs={"pk": instance.pk})
# Detect the MIME type of the stored file from its content.
# NOTE(review): the local name `type` shadows the builtin.
type = magic.from_file(instance.file.path, mime=True)
if type == "application/msword":
# Legacy Word document: convert, then point instance.file at the result.
pth = doc_to_docx(instance.file.path)
with open(pth, 'rb') as f:
# NOTE(review): no instance.save() is visible after this assignment, and the
# handle is closed when the with-block exits. Confirm the new file is
# actually persisted somewhere.
instance.file = File(f, name=pth.split("/")[-1])
elif type == "application/vnd.oasis.opendocument.text":
# OpenDocument text: handled the same way via doc_to_odt.
pth = doc_to_odt(instance.file.path)
with open(pth, 'rb') as f:
# NOTE(review): same persistence concern as in the msword branch above.
instance.file = File(f, name=pth.split("/")[-1])
# Queue process_file followed by highlight_file as a Celery chain.
# highlight_file.s() takes no explicit arguments here, so it presumably
# receives process_file's return value as its first argument. Confirm.
chain(process_file.s(instance.pk), highlight_file.s()).apply_async()

View File

@ -3,49 +3,63 @@ import requests
from celery import shared_task
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from requests.exceptions import InvalidJSONError
from checker.models import Paragraph, Docx, WordDocx, WordParagraph
from checker.services.file import process_paragraphs, process_word_paragraphs
from checker.services.file import process_paragraphs, process_word_paragraphs, split_text
def process_file(pk: int):
def process_file(pk: int, *args, **kwargs):
file = Docx.objects.get(pk=pk)
uuid = file.uuid
document = docx2txt.process(file.file.path)
paragraphs = process_paragraphs(document)
# paragraphs = process_paragraphs(document)
paragraphs, groups = split_text(document)
file.paragraphs_loaded = len(paragraphs)
cut = 100
counter = 0
len_c = len(paragraphs) + 1
paragraphs = list(paragraphs.values())
for i in range(0, len(paragraphs) // cut + 1):
vals = paragraphs[i * cut : (i + 1) * cut + 1]
dct = {x: vals[x] for x in range(len(vals))}
cut = 10
for i in range(len(paragraphs) // cut):
vals = [x for x in range(i * cut, (i+ 1) * cut)]
dct = {x: paragraphs[x] for x in vals}
x = requests.post("", json=dct, timeout=1)
for el_id, dat in x.json().items():
type_id, score = dat
type_id=type_id, docx=file, text=str(groups[int(el_id)]) + dct[int(el_id)] + str(groups[int(el_id)]), score=score
x = requests.post("", json=dct)
if x.status_code == 200:
for el_id, dat in x.json().items():
type_id, score = dat
type_id=type_id, docx=file, text=dct[int(el_id)], score=score
#for i in range(0, len(paragraphs) // cut + 1):
# vals = paragraphs[i * cut : (i + 1) * cut + 1]
# dct = {x: vals[x] for x in range(len(vals))}
# x = requests.post("", json=dct)
# if x.status_code == 200:
# try:
# for el_id, dat in x.json().items():
# type_id, score = dat
# Paragraph.objects.create(
# type_id=type_id, docx=file, text=str(groups[g_c])+dct[int(el_id)]+str(groups[g_c]), score=score
# )
# g_c += 1
# counter += len(vals)
# print(f"processing {uuid}, {counter}/{len_c}")
# file.paragraphs_processed = counter
# file.save(update_fields=["paragraphs_processed"])
# except InvalidJSONError:
# print("json pars error")
# else:
# print(f"AI server error, {x.status_code}")
counter += len(vals)
print(f"processing {uuid}, {counter}/{len_c}")
file.paragraphs_processed = counter
print(f"AI server error, {x.status_code}")
return f"ok, {pk}"
return pk
def process_word(pk: int):
def process_word(pk: int, *args, **kwargs):
file = WordDocx.objects.get(pk=pk)
uuid = file.uuid
paragraphs = process_word_paragraphs(file.text.tobytes().decode())
@ -53,7 +67,7 @@ def process_word(pk: int):
file.paragraphs_loaded = len(paragraphs)
cut = 150
cut = 10
len_c = len(paragraphs) + 1
paragraphs = list(paragraphs.values())
counter = 0
@ -63,31 +77,34 @@ def process_word(pk: int):
x = requests.post("", json=dct)
if x.status_code == 200:
for el_id, dat in x.json().items():
type_id, score = dat
type_id=type_id, docx=file, text=dct[int(el_id)], score=score
for el_id, dat in x.json().items():
type_id, score = dat
type_id=type_id, docx=file, text=dct[int(el_id)], score=score
counter += len(vals)
print(f"processing {uuid}, {counter}/{len_c}")
file.paragraphs_processed = counter
counter += len(vals)
print(f"processing {uuid}, {counter}/{len_c}")
file.paragraphs_processed = counter
except InvalidJSONError:
print("json pars error")
print(f"AI server error, {x.status_code}")
return f"ok, {pk}"
return pk
def highlight_file(pk: int):
def highlight_file(pk: int, *args, **kwargs):
c = 0
lim = 0
file = Docx.objects.get(pk=pk)
document = Document(file.file.path)
paragraphs = document.paragraphs
cut = 100
cut = 10
for paragraph in paragraphs:
if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
@ -101,20 +118,23 @@ def highlight_file(pk: int):
if dat:
n_dct[el] = dat
x = requests.post("", json=n_dct)
jsn = x.json()
if x.status_code == 200:
for j in range(len(paragraphs_sliced)):
if j in n_dct:
paragraph = paragraphs_sliced[j]
el_id, dat = jsn[str(j)]
if dat < 50:
text = paragraph.text
run = paragraph.add_run()
run.font.highlight_color = WD_COLOR_INDEX.RED
c += 1
print("AI server error")
jsn = x.json()
if x.status_code == 200:
for j in range(len(paragraphs_sliced)):
if j in n_dct:
paragraph = paragraphs_sliced[j]
el_id, dat = jsn[str(j)]
if dat < 50:
text = paragraph.text
run = paragraph.add_run()
run.font.highlight_color = WD_COLOR_INDEX.RED
c += 1
print("AI server error")
except InvalidJSONError:
print("json pars error")
return f"highlighted {c}, {pk}"
return pk