2022-08-27 07:38:54 +03:00
|
|
|
import docx2txt
|
|
|
|
import requests
|
2022-08-26 20:04:45 +03:00
|
|
|
from celery import shared_task
|
2022-08-27 14:30:23 +03:00
|
|
|
from docx import Document
|
|
|
|
from docx.enum.text import WD_COLOR_INDEX
|
2022-08-26 20:04:45 +03:00
|
|
|
|
2022-08-27 11:59:23 +03:00
|
|
|
from checker.models import Paragraph, Docx, WordDocx, WordParagraph
|
|
|
|
from checker.services.file import process_paragraphs, process_word_paragraphs
|
2022-08-27 07:38:54 +03:00
|
|
|
|
|
|
|
|
|
|
|
@shared_task()
def process_file(pk: int):
    """Score every paragraph of an uploaded ``Docx`` through the AI endpoint.

    The file is converted to text with ``docx2txt``, split by
    ``process_paragraphs`` and posted to the scoring endpoint in batches.
    Each entry of a successful response becomes a ``Paragraph`` row, and
    ``paragraphs_processed`` on the ``Docx`` is saved after every successful
    batch. A batch that gets a non-200 reply is reported and skipped.

    Args:
        pk: Primary key of the ``Docx`` record to process.

    Returns:
        The string ``"ok, <pk>"``.
    """
    file = Docx.objects.get(pk=pk)
    uuid = file.uuid
    document = docx2txt.process(file.file.path)
    # Only the values of the mapping returned by process_paragraphs are used.
    paragraphs = list(process_paragraphs(document).values())
    total = len(paragraphs)

    file.paragraphs_loaded = total
    file.save(update_fields=["paragraphs_loaded"])

    cut = 100  # paragraphs per request
    counter = 0
    # Step by exactly `cut` so batches never overlap (an overlap would create
    # duplicate Paragraph rows) and no empty trailing batch is posted.
    for start in range(0, total, cut):
        vals = paragraphs[start : start + cut]
        # Keys are positions inside the batch; JSON turns them into strings,
        # which is why the response keys are converted back with int().
        dct = dict(enumerate(vals))

        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # worker indefinitely - consider adding one suited to the AI server.
        response = requests.post("http://109.248.175.223:5000/api", json=dct)
        if response.status_code != 200:
            print(f"AI server error, {response.status_code}")
            continue

        for el_id, (type_id, score) in response.json().items():
            Paragraph.objects.create(
                type_id=type_id, docx=file, text=dct[int(el_id)], score=score
            )

        counter += len(vals)
        print(f"processing {uuid}, {counter}/{total}")
        file.paragraphs_processed = counter
        file.save(update_fields=["paragraphs_processed"])

    return f"ok, {pk}"
|
2022-08-27 11:59:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
@shared_task()
def process_word(pk: int):
    """Score every paragraph of a stored ``WordDocx`` through the AI endpoint.

    The stored bytes in ``file.text`` are decoded, split by
    ``process_word_paragraphs`` and posted to the scoring endpoint in
    batches. Each entry of a successful response becomes a ``WordParagraph``
    row, and ``paragraphs_processed`` on the ``WordDocx`` is saved after
    every successful batch. A batch that gets a non-200 reply is reported
    and skipped.

    Args:
        pk: Primary key of the ``WordDocx`` record to process.

    Returns:
        The string ``"ok, <pk>"``.
    """
    file = WordDocx.objects.get(pk=pk)
    uuid = file.uuid
    # Only the values of the mapping returned by the splitter are used.
    paragraphs = list(
        process_word_paragraphs(file.text.tobytes().decode()).values()
    )
    total = len(paragraphs)

    file.paragraphs_loaded = total
    file.save(update_fields=["paragraphs_loaded"])

    cut = 150  # paragraphs per request
    counter = 0
    # Step by exactly `cut` so batches never overlap (an overlap would create
    # duplicate WordParagraph rows) and no empty trailing batch is posted.
    for start in range(0, total, cut):
        vals = paragraphs[start : start + cut]
        # Keys are positions inside the batch; JSON turns them into strings,
        # which is why the response keys are converted back with int().
        dct = dict(enumerate(vals))

        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # worker indefinitely - consider adding one suited to the AI server.
        response = requests.post("http://109.248.175.223:5000/api", json=dct)
        if response.status_code != 200:
            print(f"AI server error, {response.status_code}")
            continue

        for el_id, (type_id, score) in response.json().items():
            WordParagraph.objects.create(
                type_id=type_id, docx=file, text=dct[int(el_id)], score=score
            )

        counter += len(vals)
        print(f"processing {uuid}, {counter}/{total}")
        file.paragraphs_processed = counter
        file.save(update_fields=["paragraphs_processed"])

    return f"ok, {pk}"
|
|
|
|
|
2022-08-27 14:30:23 +03:00
|
|
|
|
|
|
|
@shared_task
def highlight_file(pk: int):
    """Highlight low-scoring paragraphs of a ``Docx`` file in red.

    Every paragraph before the first one whose text starts with ``"1."``
    (and is longer than two characters) is skipped. The remaining non-empty
    paragraphs are posted to the scoring endpoint in batches. Each paragraph
    whose returned score is below 50 has its content replaced by a single
    red-highlighted run holding the same text. The document is then saved
    back to the path it was loaded from.

    Args:
        pk: Primary key of the ``Docx`` record to highlight.

    Returns:
        The string ``"highlighted <count>, <pk>"`` where ``<count>`` is the
        number of paragraphs that were highlighted.
    """
    file = Docx.objects.get(pk=pk)
    document = Document(file.file.path)
    paragraphs = document.paragraphs
    cut = 100  # paragraphs per request
    c = 0

    # Count the paragraphs that precede the first one starting with "1.".
    lim = 0
    for paragraph in paragraphs:
        text = paragraph.text
        if len(text) > 2 and text.startswith("1."):
            break
        lim += 1

    # Step by exactly `cut` from the first relevant paragraph so batches
    # never overlap (an overlap would count a paragraph twice).
    for start in range(lim, len(paragraphs), cut):
        paragraphs_sliced = paragraphs[start : start + cut]
        texts = [p.text for p in paragraphs_sliced]
        # Only non-empty paragraphs are sent; keys are positions in the batch.
        n_dct = {idx: text for idx, text in enumerate(texts) if text}
        if not n_dct:
            # Nothing to score in this batch - avoid an empty request.
            continue

        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # worker indefinitely - consider adding one suited to the AI server.
        response = requests.post("http://109.248.175.223:5000/api", json=n_dct)
        if response.status_code != 200:
            print("AI server error")
            continue

        # Parse the body only after the status check: an error reply is not
        # guaranteed to be JSON.
        jsn = response.json()
        for idx, text in n_dct.items():
            # Each response value is a pair; the second item is the score.
            _type_id, score = jsn[str(idx)]
            if score < 50:
                paragraph = paragraphs_sliced[idx]
                # Rebuild the paragraph as one highlighted run.
                paragraph.clear()
                run = paragraph.add_run()
                run.font.highlight_color = WD_COLOR_INDEX.RED
                run.add_text(text)
                c += 1

    document.save(file.file.path)
    return f"highlighted {c}, {pk}"
|