updated doc process

This commit is contained in:
Alexander Karpov 2023-09-09 19:59:39 +03:00
parent 65ac64e48a
commit f1b5fff3b3
3 changed files with 78 additions and 23 deletions

View File

@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
def get(self, request, *args, **kwargs):
type = self.request.query_params.get("type")
text = get_object_or_404(Text, id=self.kwargs["id"])
run_mth = ["f", "bert"]
run_mth = ["f", "bert", "nearest"]
if type in run_mth:
run_mth = [type]
@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
if re.status_code == 200:
text.description["f"]["text"] = re.json()["text"]
if "nearest" not in text.description and "nearest" in run_mth:
e = True
text.description["nearest"] = {}
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
text.score["nearest"]["detailed"] = ""
if e:
text.save(update_fields=["description"])
text.save(update_fields=["description", "score"])
else:
text.description = {}
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
if re.status_code == 200:
text.description["f"]["text"] = re.json()["text"]
text.save(update_fields=["description"])
if "nearest" in run_mth:
text.description["nearest"] = {}
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
text.score["nearest"]["detailed"] = ""
text.save(update_fields=["description", "score"])
return Response(data=ProcessedTextSerializer().to_representation(instance=text))

View File

@ -20,26 +20,39 @@ def create_highlighted_document(pk: int, var: str) -> str:
text = Text.objects.get(pk=pk).description[var]["text"]
document = Document()
p = document.add_paragraph()
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
if len(e) == 1:
run = p.add_run()
run.add_text(e[0][0])
else:
for k, v in e:
if "data-value" in k:
if v:
run = p.add_run()
run.add_text(v)
w = float(k.split("=")[-1].replace('"', ""))
tag = run._r
run.font.size = Pt(11)
shd = OxmlElement("w:shd")
shd.set(qn("w:fill"), to_rgb(cmap(w)))
tag.rPr.append(shd)
else:
if v:
run = p.add_run()
run.add_text(v)
if var == "nearest":
for el in text:
w = el["features"][1] / 100 - 0.00001
v = el["text"]
if v:
run = p.add_run()
run.add_text(v)
tag = run._r
run.font.size = Pt(11)
shd = OxmlElement("w:shd")
shd.set(qn("w:fill"), to_rgb(cmap(w)))
tag.rPr.append(shd)
else:
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
if len(e) == 1:
run = p.add_run()
run.add_text(e[0][0])
else:
for k, v in e:
if "data-value" in k:
if v:
run = p.add_run()
run.add_text(v)
w = float(k.split("=")[-1].replace('"', ""))
tag = run._r
run.font.size = Pt(11)
shd = OxmlElement("w:shd")
shd.set(qn("w:fill"), to_rgb(cmap(w)))
tag.rPr.append(shd)
else:
if v:
run = p.add_run()
run.add_text(v)
f = settings.MEDIA_ROOT + f"/docx/{pk}_{randint(1, 1000)}.docx"
document.save(f)
return f.replace(settings.MEDIA_ROOT, "/media")

View File

@ -1,8 +1,10 @@
import subprocess
from time import sleep
import requests
import textract
from celery import shared_task
from django.conf import settings
from press_release_nl.processor.models import Entry, Text
from press_release_nl.processor.services import create_highlighted_document
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
for text in entry.texts.all():
re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
re_nearest = requests.post(
ML_HOST + "nearest/nearest", json={"data": text.text}
)
if re_bert.status_code != 200:
print(re_bert.status_code, "bert")
continue
if re_tf.status_code != 200:
print(re_tf.status_code, "tf-idf")
continue
if re_nearest.status_code != 200:
print(re_nearest.status_code, "nearest")
continue
text.refresh_from_db()
text.score = {
"bert": re_bert.json(),
"f": re_tf.json(),
"nearest": re_nearest.json(),
}
text.save(update_fields=["score"])
return pk
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Render the highlighted .docx for one description variant and queue its PDF conversion.

    Args:
        pk: Primary key of the ``Text`` row to render.
        var: Key into ``Text.description`` selecting which variant to render.

    Returns:
        ``pk`` after the document path has been stored and ``convert_to_pdf``
        has been queued; ``None`` when a document path is already recorded
        for this variant (nothing is regenerated or queued in that case).

    Raises:
        KeyError: If ``var`` is not a key of ``text.description``.
    """
    text = Text.objects.get(pk=pk)
    if "file" in text.description[var]:
        # A document was already produced for this variant; do not rebuild it.
        return
    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    # Only ``description`` was modified here. Writing just that column keeps
    # this task from overwriting other columns (e.g. ``score``) with the
    # stale values loaded before the document was generated.
    text.save(update_fields=["description"])
    # Small delay before the conversion task starts.
    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
@shared_task
def convert_to_pdf(pk: int, var: str):
    """Convert the stored .docx of one description variant to PDF via LibreOffice.

    The resulting ``/media/pdf/<name>.pdf`` path is stored under
    ``text.description[var]["pdf"]``. Nothing is stored when the variant
    already has a ``"pdf"`` entry or when the converter exits with a
    non-zero status.

    Args:
        pk: Primary key of the ``Text`` row.
        var: Key into ``Text.description`` selecting the variant to convert.

    Returns:
        None.

    Raises:
        KeyError: If ``var`` is missing from ``text.description`` or the
            variant has no ``"file"`` entry yet.
    """
    text = Text.objects.get(pk=pk)
    variant = text.description[var]
    if "pdf" in variant:
        return
    file_path = variant["file"]

    # ``file_path`` is stored in "/media/..." form; map it back to the
    # filesystem by swapping only that leading prefix for MEDIA_ROOT.
    # A blanket ``MEDIA_ROOT.replace("/media", "")`` would also strip any
    # other "/media" segment occurring inside MEDIA_ROOT.
    media_prefix = "/media"
    if file_path.startswith(media_prefix):
        docx_path = settings.MEDIA_ROOT + file_path[len(media_prefix):]
    else:
        # Unexpected shape: keep the previous path computation.
        docx_path = settings.MEDIA_ROOT.replace(media_prefix, "") + file_path

    result = subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            settings.MEDIA_ROOT + "/pdf",
            docx_path,
        ]
    )
    if result.returncode != 0:
        # Do not record a PDF path for a conversion that failed.
        print(result.returncode, "libreoffice")
        return

    # The converter is expected to write "<stem>.pdf" into the output
    # directory, so swap only the extension rather than every "docx"
    # substring of the file name.
    docx_name = file_path.split("/")[-1]
    stem, dot, _ = docx_name.rpartition(".")
    pdf_name = (stem if dot else docx_name) + ".pdf"

    variant["pdf"] = "/media/pdf/" + pdf_name
    # Only ``description`` changed; avoid rewriting other columns with the
    # values loaded at the start of the task.
    text.save(update_fields=["description"])