Optimised file processing, added doc/odt -> docx converters, minor changes

This commit is contained in:
Alexander Karpov 2022-08-28 12:37:28 +03:00
parent 2bf771e3a2
commit 6cffc965a8
3 changed files with 121 additions and 56 deletions

View File

@@ -1,4 +1,7 @@
 import os
+import re
+
+import convertapi
 
 from checker.services.generators import generate_charset
@@ -28,5 +31,31 @@ def process_word_paragraphs(text):
     return _base_process(text)
 
 
+def doc_to_docx(file_path):
+    convertapi.api_secret = '0fp22XFRPwKmNJql'
+    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
+    result.file.save(file_path.split(".")[0] + ".docx")
+    return file_path.split(".")[0] + ".docx"
+
+
+def doc_to_odt(file_path):
+    convertapi.api_secret = '0fp22XFRPwKmNJql'
+    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
+    result.file.save(file_path.split(".")[0] + ".docx")
+    return file_path.split(".")[0] + ".docx"
+
+
 def media_upload_path(instance, filename):
     return os.path.join(f"uploads/{generate_charset(7)}/", filename)
+
+
+def split_text(text):
+    texts, groups = [], []
+    regt = re.findall(r"{(.*?)}(.*?){(.*?)}", text.replace('\n', ''))
+    for t in regt:
+        if t[0] == t[-1]:
+            texts.append(t[1])
+            groups.append(int(t[0]))
+        else:
+            print(t)
+    return texts, groups
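For reference, the new split_text helper keeps only spans whose opening and closing {group} markers match, and returns the paragraph texts and their group ids as two parallel lists. A quick standalone illustration (the marked-up input is made up):

from checker.services.file import split_text

texts, groups = split_text("{1}First paragraph{1}\n{2}Second paragraph{2}")
# texts  -> ['First paragraph', 'Second paragraph']
# groups -> [1, 2]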

View File

@@ -1,15 +1,31 @@
+import magic
 from django.db.models.signals import post_save
+from django.core.files import File
 from django.dispatch import receiver
+from celery import chain
 
 from checker.models import Docx, WordDocx
+from checker.services.file import doc_to_docx, doc_to_odt
 from checker.tasks import process_file, process_word, highlight_file
 
 
 @receiver(post_save, sender=Docx)
 def create_docs(sender, instance, created, **kwargs):
     if created:
-        process_file.apply_async(kwargs={"pk": instance.pk})
-        highlight_file.apply_async(kwargs={"pk": instance.pk})
+        type = magic.from_file(instance.file.path, mime=True)
+        if type == "application/msword":
+            pth = doc_to_docx(instance.file.path)
+            with open(pth, 'rb') as f:
+                instance.file = File(f, name=pth.split("/")[-1])
+                instance.save(update_fields=["file"])
+        elif type == "application/vnd.oasis.opendocument.text":
+            pth = doc_to_odt(instance.file.path)
+            with open(pth, 'rb') as f:
+                instance.file = File(f, name=pth.split("/")[-1])
+                instance.save(update_fields=["file"])
+        chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
     return
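The dispatch now relies on the Celery chain contract: each task's return value is passed as the first positional argument of the next task, which is why the tasks below return the bare pk and accept *args/**kwargs. A minimal sketch of that contract (the pk value 42 is illustrative):

from celery import chain

from checker.tasks import highlight_file, process_file

# process_file(42) runs first; its return value (the Docx pk) becomes the
# first positional argument of highlight_file.
chain(process_file.s(42), highlight_file.s()).apply_async()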

View File

@@ -3,49 +3,63 @@ import requests
 from celery import shared_task
 from docx import Document
 from docx.enum.text import WD_COLOR_INDEX
+from requests.exceptions import InvalidJSONError
 
 from checker.models import Paragraph, Docx, WordDocx, WordParagraph
-from checker.services.file import process_paragraphs, process_word_paragraphs
+from checker.services.file import process_paragraphs, process_word_paragraphs, split_text
 
 
 @shared_task()
-def process_file(pk: int):
+def process_file(pk: int, *args, **kwargs):
     file = Docx.objects.get(pk=pk)
     uuid = file.uuid
     document = docx2txt.process(file.file.path)
-    paragraphs = process_paragraphs(document)
+    # paragraphs = process_paragraphs(document)
+    paragraphs, groups = split_text(document)
     file.paragraphs_loaded = len(paragraphs)
     file.save(update_fields=["paragraphs_loaded"])
-    cut = 100
-    counter = 0
-    len_c = len(paragraphs) + 1
-    paragraphs = list(paragraphs.values())
-    for i in range(0, len(paragraphs) // cut + 1):
-        vals = paragraphs[i * cut : (i + 1) * cut + 1]
-        dct = {x: vals[x] for x in range(len(vals))}
-
-        x = requests.post("http://109.248.175.223:5000/api", json=dct)
-        if x.status_code == 200:
-            for el_id, dat in x.json().items():
-                type_id, score = dat
-                Paragraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
-                )
-
-            counter += len(vals)
-            print(f"processing {uuid}, {counter}/{len_c}")
-            file.paragraphs_processed = counter
-            file.save(update_fields=["paragraphs_processed"])
-        else:
-            print(f"AI server error, {x.status_code}")
-    return f"ok, {pk}"
+    cut = 10
+    for i in range(len(paragraphs) // cut):
+        vals = [x for x in range(i * cut, (i + 1) * cut)]
+        dct = {x: paragraphs[x] for x in vals}
+        x = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
+        for el_id, dat in x.json().items():
+            type_id, score = dat
+            Paragraph.objects.create(
+                type_id=type_id, docx=file, text=str(groups[int(el_id)]) + dct[int(el_id)] + str(groups[int(el_id)]), score=score
+            )
+    # for i in range(0, len(paragraphs) // cut + 1):
+    #     vals = paragraphs[i * cut : (i + 1) * cut + 1]
+    #     dct = {x: vals[x] for x in range(len(vals))}
+    #
+    #     x = requests.post("http://109.248.175.223:5000/api", json=dct)
+    #     if x.status_code == 200:
+    #         try:
+    #             for el_id, dat in x.json().items():
+    #                 type_id, score = dat
+    #                 Paragraph.objects.create(
+    #                     type_id=type_id, docx=file, text=str(groups[g_c]) + dct[int(el_id)] + str(groups[g_c]), score=score
+    #                 )
+    #                 g_c += 1
+    #
+    #             counter += len(vals)
+    #             print(f"processing {uuid}, {counter}/{len_c}")
+    #             file.paragraphs_processed = counter
+    #             file.save(update_fields=["paragraphs_processed"])
+    #         except InvalidJSONError:
+    #             print("json pars error")
+    #     else:
+    #         print(f"AI server error, {x.status_code}")
+    return pk
 
 
 @shared_task()
-def process_word(pk: int):
+def process_word(pk: int, *args, **kwargs):
     file = WordDocx.objects.get(pk=pk)
     uuid = file.uuid
     paragraphs = process_word_paragraphs(file.text.tobytes().decode())
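Both process_file above and process_word below post each batch to the scoring service as a JSON object mapping paragraph indices to text, and unpack the reply as the same keys mapped to [type_id, score] pairs. The shape below is inferred from how the responses are consumed, so treat it as an illustrative sketch rather than a documented API:

import requests

# Hypothetical batch of two paragraphs keyed by index.
batch = {0: "First paragraph text", 1: "Second paragraph text"}
resp = requests.post("http://109.248.175.223:5000/api", json=batch, timeout=1)
# JSON object keys come back as strings, hence the int(el_id) lookups in the tasks.
for el_id, (type_id, score) in resp.json().items():
    print(int(el_id), type_id, score)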
@@ -53,7 +67,7 @@ def process_word(pk: int):
     file.paragraphs_loaded = len(paragraphs)
     file.save(update_fields=["paragraphs_loaded"])
-    cut = 150
+    cut = 10
     len_c = len(paragraphs) + 1
     paragraphs = list(paragraphs.values())
     counter = 0
@@ -63,31 +77,34 @@ def process_word(pk: int):
         x = requests.post("http://109.248.175.223:5000/api", json=dct)
         if x.status_code == 200:
-            for el_id, dat in x.json().items():
-                type_id, score = dat
-                WordParagraph.objects.create(
-                    type_id=type_id, docx=file, text=dct[int(el_id)], score=score
-                )
-
-            counter += len(vals)
-            print(f"processing {uuid}, {counter}/{len_c}")
-            file.paragraphs_processed = counter
-            file.save(update_fields=["paragraphs_processed"])
+            try:
+                for el_id, dat in x.json().items():
+                    type_id, score = dat
+                    WordParagraph.objects.create(
+                        type_id=type_id, docx=file, text=dct[int(el_id)], score=score
+                    )
+
+                counter += len(vals)
+                print(f"processing {uuid}, {counter}/{len_c}")
+                file.paragraphs_processed = counter
+                file.save(update_fields=["paragraphs_processed"])
+            except InvalidJSONError:
+                print("json pars error")
         else:
             print(f"AI server error, {x.status_code}")
-    return f"ok, {pk}"
+    return pk
 
 
 @shared_task
-def highlight_file(pk: int):
+def highlight_file(pk: int, *args, **kwargs):
     c = 0
     lim = 0
     file = Docx.objects.get(pk=pk)
     document = Document(file.file.path)
     paragraphs = document.paragraphs
-    cut = 100
+    cut = 10
     for paragraph in paragraphs:
         if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
@@ -101,20 +118,23 @@ def highlight_file(pk: int):
             if dat:
                 n_dct[el] = dat
         x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
-        jsn = x.json()
-        if x.status_code == 200:
-            for j in range(len(paragraphs_sliced)):
-                if j in n_dct:
-                    paragraph = paragraphs_sliced[j]
-                    el_id, dat = jsn[str(j)]
-                    if dat < 50:
-                        text = paragraph.text
-                        paragraph.clear()
-                        run = paragraph.add_run()
-                        run.font.highlight_color = WD_COLOR_INDEX.RED
-                        run.add_text(text)
-                        c += 1
-        else:
-            print("AI server error")
+        try:
+            jsn = x.json()
+            if x.status_code == 200:
+                for j in range(len(paragraphs_sliced)):
+                    if j in n_dct:
+                        paragraph = paragraphs_sliced[j]
+                        el_id, dat = jsn[str(j)]
+                        if dat < 50:
+                            text = paragraph.text
+                            paragraph.clear()
+                            run = paragraph.add_run()
+                            run.font.highlight_color = WD_COLOR_INDEX.RED
+                            run.add_text(text)
+                            c += 1
+            else:
+                print("AI server error")
+        except InvalidJSONError:
+            print("json pars error")
     document.save(file.file.path)
-    return f"highlighted {c}, {pk}"
+    return pk