mirror of
https://github.com/Ai-hack-MAGNUM-OPUS/backend.git
synced 2024-11-22 00:06:34 +03:00
Optimised file processing, added doc, odt -> docx converter, minor changes
This commit is contained in:
parent
2bf771e3a2
commit
6cffc965a8
|
@ -1,4 +1,7 @@
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
import convertapi
|
||||||
|
|
||||||
|
|
||||||
from checker.services.generators import generate_charset
|
from checker.services.generators import generate_charset
|
||||||
|
|
||||||
|
@ -28,5 +31,31 @@ def process_word_paragraphs(text):
|
||||||
return _base_process(text)
|
return _base_process(text)
|
||||||
|
|
||||||
|
|
||||||
|
def doc_to_docx(file_path):
    """Convert a legacy ``.doc`` file to ``.docx`` through the ``convertapi`` client.

    The converted file is saved next to the source file, using the same
    path with only the final extension replaced by ``.docx``.

    Args:
        file_path: Path of the ``.doc`` file to convert.

    Returns:
        The path of the saved ``.docx`` file.
    """
    # SECURITY: the API secret used to be hard-coded here only. Prefer the
    # CONVERTAPI_SECRET environment variable; the literal is kept purely as a
    # backward-compatible fallback and should be rotated and removed.
    convertapi.api_secret = os.environ.get("CONVERTAPI_SECRET", '0fp22XFRPwKmNJql')

    # splitext strips only the last extension of the final path component.
    # Splitting on the first "." broke paths such as "./uploads/a.doc" or
    # directories containing dots.
    target_path = os.path.splitext(file_path)[0] + ".docx"

    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
    result.file.save(target_path)
    return target_path
|
||||||
|
|
||||||
|
|
||||||
|
def doc_to_odt(file_path):
    """Convert an OpenDocument ``.odt`` file to ``.docx`` through ``convertapi``.

    NOTE: despite its name, this function converts *from* ODT *to* DOCX
    (``from_format='odt'``, target ``'docx'``). The name is kept unchanged
    because other modules import it.

    The converted file is saved next to the source file, using the same
    path with only the final extension replaced by ``.docx``.

    Args:
        file_path: Path of the ``.odt`` file to convert.

    Returns:
        The path of the saved ``.docx`` file.
    """
    # SECURITY: the API secret used to be hard-coded here only. Prefer the
    # CONVERTAPI_SECRET environment variable; the literal is kept purely as a
    # backward-compatible fallback and should be rotated and removed.
    convertapi.api_secret = os.environ.get("CONVERTAPI_SECRET", '0fp22XFRPwKmNJql')

    # splitext strips only the last extension of the final path component.
    # Splitting on the first "." broke paths such as "./uploads/a.odt" or
    # directories containing dots.
    target_path = os.path.splitext(file_path)[0] + ".docx"

    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
    result.file.save(target_path)
    return target_path
|
||||||
|
|
||||||
|
|
||||||
def media_upload_path(instance, filename):
|
def media_upload_path(instance, filename):
|
||||||
return os.path.join(f"uploads/{generate_charset(7)}/", filename)
|
return os.path.join(f"uploads/{generate_charset(7)}/", filename)
|
||||||
|
|
||||||
|
|
||||||
|
def split_text(text):
    """Extract ``{N}segment{N}`` spans from *text*.

    Newlines are removed first, then every ``{open}body{close}`` triple is
    scanned left to right. A triple is kept only when the opening and
    closing markers are identical and the marker parses as an integer.

    Args:
        text: Raw document text containing brace-delimited markers.

    Returns:
        A ``(texts, groups)`` tuple of two parallel lists: the segment
        bodies and their integer marker values, in order of appearance.
        Segments with mismatched or non-numeric markers are skipped and
        logged instead of aborting the whole extraction.
    """
    import logging

    logger = logging.getLogger(__name__)

    texts, groups = [], []
    # Braces are escaped for clarity; the pattern is equivalent to the
    # unescaped form. Non-greedy groups keep each match as short as possible.
    pattern = r"\{(.*?)\}(.*?)\{(.*?)\}"
    for opening, body, closing in re.findall(pattern, text.replace('\n', '')):
        if opening != closing:
            logger.warning(
                "split_text: mismatched markers %r / %r, segment skipped",
                opening, closing,
            )
            continue
        try:
            group = int(opening)
        except ValueError:
            # A matching but non-numeric marker (e.g. "{x}...{x}") used to
            # raise and abort the caller; treat it as ordinary text instead.
            logger.warning(
                "split_text: non-numeric marker %r, segment skipped", opening
            )
            continue
        texts.append(body)
        groups.append(group)
    return texts, groups
|
||||||
|
|
|
@ -1,15 +1,31 @@
|
||||||
|
import magic
|
||||||
|
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
|
from django.core.files import File
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
|
from celery import chain
|
||||||
|
|
||||||
from checker.models import Docx, WordDocx
|
from checker.models import Docx, WordDocx
|
||||||
|
from checker.services.file import doc_to_docx, doc_to_odt
|
||||||
from checker.tasks import process_file, process_word, highlight_file
|
from checker.tasks import process_file, process_word, highlight_file
|
||||||
|
|
||||||
|
|
||||||
@receiver(post_save, sender=Docx)
|
@receiver(post_save, sender=Docx)
|
||||||
def create_docs(sender, instance, created, **kwargs):
|
def create_docs(sender, instance, created, **kwargs):
|
||||||
if created:
|
if created:
|
||||||
process_file.apply_async(kwargs={"pk": instance.pk})
|
type = magic.from_file(instance.file.path, mime=True)
|
||||||
highlight_file.apply_async(kwargs={"pk": instance.pk})
|
if type == "application/msword":
|
||||||
|
pth = doc_to_docx(instance.file.path)
|
||||||
|
with open(pth, 'rb') as f:
|
||||||
|
instance.file = File(f, name=pth.split("/")[-1])
|
||||||
|
instance.save(update_fields=["file"])
|
||||||
|
elif type == "application/vnd.oasis.opendocument.text":
|
||||||
|
pth = doc_to_odt(instance.file.path)
|
||||||
|
with open(pth, 'rb') as f:
|
||||||
|
instance.file = File(f, name=pth.split("/")[-1])
|
||||||
|
instance.save(update_fields=["file"])
|
||||||
|
|
||||||
|
chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,49 +3,63 @@ import requests
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.enum.text import WD_COLOR_INDEX
|
from docx.enum.text import WD_COLOR_INDEX
|
||||||
|
from requests.exceptions import InvalidJSONError
|
||||||
|
|
||||||
from checker.models import Paragraph, Docx, WordDocx, WordParagraph
|
from checker.models import Paragraph, Docx, WordDocx, WordParagraph
|
||||||
from checker.services.file import process_paragraphs, process_word_paragraphs
|
from checker.services.file import process_paragraphs, process_word_paragraphs, split_text
|
||||||
|
|
||||||
|
|
||||||
@shared_task()
|
@shared_task()
|
||||||
def process_file(pk: int):
|
def process_file(pk: int, *args, **kwargs):
|
||||||
file = Docx.objects.get(pk=pk)
|
file = Docx.objects.get(pk=pk)
|
||||||
uuid = file.uuid
|
uuid = file.uuid
|
||||||
document = docx2txt.process(file.file.path)
|
document = docx2txt.process(file.file.path)
|
||||||
paragraphs = process_paragraphs(document)
|
# paragraphs = process_paragraphs(document)
|
||||||
|
paragraphs, groups = split_text(document)
|
||||||
|
|
||||||
file.paragraphs_loaded = len(paragraphs)
|
file.paragraphs_loaded = len(paragraphs)
|
||||||
file.save(update_fields=["paragraphs_loaded"])
|
file.save(update_fields=["paragraphs_loaded"])
|
||||||
|
|
||||||
cut = 100
|
cut = 10
|
||||||
counter = 0
|
for i in range(len(paragraphs) // cut):
|
||||||
len_c = len(paragraphs) + 1
|
vals = [x for x in range(i * cut, (i+ 1) * cut)]
|
||||||
paragraphs = list(paragraphs.values())
|
dct = {x: paragraphs[x] for x in vals}
|
||||||
for i in range(0, len(paragraphs) // cut + 1):
|
x = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
|
||||||
vals = paragraphs[i * cut : (i + 1) * cut + 1]
|
|
||||||
dct = {x: vals[x] for x in range(len(vals))}
|
|
||||||
|
|
||||||
x = requests.post("http://109.248.175.223:5000/api", json=dct)
|
|
||||||
if x.status_code == 200:
|
|
||||||
for el_id, dat in x.json().items():
|
for el_id, dat in x.json().items():
|
||||||
type_id, score = dat
|
type_id, score = dat
|
||||||
Paragraph.objects.create(
|
Paragraph.objects.create(
|
||||||
type_id=type_id, docx=file, text=dct[int(el_id)], score=score
|
type_id=type_id, docx=file, text=str(groups[int(el_id)]) + dct[int(el_id)] + str(groups[int(el_id)]), score=score
|
||||||
)
|
)
|
||||||
|
|
||||||
counter += len(vals)
|
#for i in range(0, len(paragraphs) // cut + 1):
|
||||||
print(f"processing {uuid}, {counter}/{len_c}")
|
# vals = paragraphs[i * cut : (i + 1) * cut + 1]
|
||||||
file.paragraphs_processed = counter
|
# dct = {x: vals[x] for x in range(len(vals))}
|
||||||
file.save(update_fields=["paragraphs_processed"])
|
#
|
||||||
else:
|
# x = requests.post("http://109.248.175.223:5000/api", json=dct)
|
||||||
print(f"AI server error, {x.status_code}")
|
# if x.status_code == 200:
|
||||||
|
# try:
|
||||||
|
# for el_id, dat in x.json().items():
|
||||||
|
# type_id, score = dat
|
||||||
|
# Paragraph.objects.create(
|
||||||
|
# type_id=type_id, docx=file, text=str(groups[g_c])+dct[int(el_id)]+str(groups[g_c]), score=score
|
||||||
|
# )
|
||||||
|
# g_c += 1
|
||||||
|
#
|
||||||
|
# counter += len(vals)
|
||||||
|
# print(f"processing {uuid}, {counter}/{len_c}")
|
||||||
|
# file.paragraphs_processed = counter
|
||||||
|
# file.save(update_fields=["paragraphs_processed"])
|
||||||
|
# except InvalidJSONError:
|
||||||
|
# print("json pars error")
|
||||||
|
# else:
|
||||||
|
# print(f"AI server error, {x.status_code}")
|
||||||
|
|
||||||
return f"ok, {pk}"
|
|
||||||
|
return pk
|
||||||
|
|
||||||
|
|
||||||
@shared_task()
|
@shared_task()
|
||||||
def process_word(pk: int):
|
def process_word(pk: int, *args, **kwargs):
|
||||||
file = WordDocx.objects.get(pk=pk)
|
file = WordDocx.objects.get(pk=pk)
|
||||||
uuid = file.uuid
|
uuid = file.uuid
|
||||||
paragraphs = process_word_paragraphs(file.text.tobytes().decode())
|
paragraphs = process_word_paragraphs(file.text.tobytes().decode())
|
||||||
|
@ -53,7 +67,7 @@ def process_word(pk: int):
|
||||||
file.paragraphs_loaded = len(paragraphs)
|
file.paragraphs_loaded = len(paragraphs)
|
||||||
file.save(update_fields=["paragraphs_loaded"])
|
file.save(update_fields=["paragraphs_loaded"])
|
||||||
|
|
||||||
cut = 150
|
cut = 10
|
||||||
len_c = len(paragraphs) + 1
|
len_c = len(paragraphs) + 1
|
||||||
paragraphs = list(paragraphs.values())
|
paragraphs = list(paragraphs.values())
|
||||||
counter = 0
|
counter = 0
|
||||||
|
@ -63,6 +77,7 @@ def process_word(pk: int):
|
||||||
|
|
||||||
x = requests.post("http://109.248.175.223:5000/api", json=dct)
|
x = requests.post("http://109.248.175.223:5000/api", json=dct)
|
||||||
if x.status_code == 200:
|
if x.status_code == 200:
|
||||||
|
try:
|
||||||
for el_id, dat in x.json().items():
|
for el_id, dat in x.json().items():
|
||||||
type_id, score = dat
|
type_id, score = dat
|
||||||
WordParagraph.objects.create(
|
WordParagraph.objects.create(
|
||||||
|
@ -73,21 +88,23 @@ def process_word(pk: int):
|
||||||
print(f"processing {uuid}, {counter}/{len_c}")
|
print(f"processing {uuid}, {counter}/{len_c}")
|
||||||
file.paragraphs_processed = counter
|
file.paragraphs_processed = counter
|
||||||
file.save(update_fields=["paragraphs_processed"])
|
file.save(update_fields=["paragraphs_processed"])
|
||||||
|
except InvalidJSONError:
|
||||||
|
print("json pars error")
|
||||||
else:
|
else:
|
||||||
print(f"AI server error, {x.status_code}")
|
print(f"AI server error, {x.status_code}")
|
||||||
|
|
||||||
return f"ok, {pk}"
|
return pk
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
def highlight_file(pk: int):
|
def highlight_file(pk: int, *args, **kwargs):
|
||||||
c = 0
|
c = 0
|
||||||
lim = 0
|
lim = 0
|
||||||
file = Docx.objects.get(pk=pk)
|
file = Docx.objects.get(pk=pk)
|
||||||
document = Document(file.file.path)
|
document = Document(file.file.path)
|
||||||
|
|
||||||
paragraphs = document.paragraphs
|
paragraphs = document.paragraphs
|
||||||
cut = 100
|
cut = 10
|
||||||
|
|
||||||
for paragraph in paragraphs:
|
for paragraph in paragraphs:
|
||||||
if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
|
if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.":
|
||||||
|
@ -101,6 +118,7 @@ def highlight_file(pk: int):
|
||||||
if dat:
|
if dat:
|
||||||
n_dct[el] = dat
|
n_dct[el] = dat
|
||||||
x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
|
x = requests.post("http://109.248.175.223:5000/api", json=n_dct)
|
||||||
|
try:
|
||||||
jsn = x.json()
|
jsn = x.json()
|
||||||
if x.status_code == 200:
|
if x.status_code == 200:
|
||||||
for j in range(len(paragraphs_sliced)):
|
for j in range(len(paragraphs_sliced)):
|
||||||
|
@ -116,5 +134,7 @@ def highlight_file(pk: int):
|
||||||
c += 1
|
c += 1
|
||||||
else:
|
else:
|
||||||
print("AI server error")
|
print("AI server error")
|
||||||
|
except InvalidJSONError:
|
||||||
|
print("json pars error")
|
||||||
document.save(file.file.path)
|
document.save(file.file.path)
|
||||||
return f"highlighted {c}, {pk}"
|
return pk
|
||||||
|
|
Loading…
Reference in New Issue
Block a user