updated doc process

This commit is contained in:
Alexander Karpov 2023-09-09 19:59:39 +03:00
parent 65ac64e48a
commit f1b5fff3b3
3 changed files with 78 additions and 23 deletions

View File

@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
def get(self, request, *args, **kwargs): def get(self, request, *args, **kwargs):
type = self.request.query_params.get("type") type = self.request.query_params.get("type")
text = get_object_or_404(Text, id=self.kwargs["id"]) text = get_object_or_404(Text, id=self.kwargs["id"])
run_mth = ["f", "bert"] run_mth = ["f", "bert", "nearest"]
if type in run_mth: if type in run_mth:
run_mth = [type] run_mth = [type]
@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text}) re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
if re.status_code == 200: if re.status_code == 200:
text.description["f"]["text"] = re.json()["text"] text.description["f"]["text"] = re.json()["text"]
if "nearest" not in text.description and "nearest" in run_mth:
e = True
text.description["nearest"] = {}
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
text.score["nearest"]["detailed"] = ""
if e: if e:
text.save(update_fields=["description"]) text.save(update_fields=["description", "score"])
else: else:
text.description = {} text.description = {}
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text}) re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
if re.status_code == 200: if re.status_code == 200:
text.description["f"]["text"] = re.json()["text"] text.description["f"]["text"] = re.json()["text"]
text.save(update_fields=["description"])
if "nearest" in run_mth:
text.description["nearest"] = {}
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
text.score["nearest"]["detailed"] = ""
text.save(update_fields=["description", "score"])
return Response(data=ProcessedTextSerializer().to_representation(instance=text)) return Response(data=ProcessedTextSerializer().to_representation(instance=text))

View File

@ -20,6 +20,19 @@ def create_highlighted_document(pk: int, var: str) -> str:
text = Text.objects.get(pk=pk).description[var]["text"] text = Text.objects.get(pk=pk).description[var]["text"]
document = Document() document = Document()
p = document.add_paragraph() p = document.add_paragraph()
if var == "nearest":
for el in text:
w = el["features"][1] / 100 - 0.00001
v = el["text"]
if v:
run = p.add_run()
run.add_text(v)
tag = run._r
run.font.size = Pt(11)
shd = OxmlElement("w:shd")
shd.set(qn("w:fill"), to_rgb(cmap(w)))
tag.rPr.append(shd)
else:
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]: for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
if len(e) == 1: if len(e) == 1:
run = p.add_run() run = p.add_run()

View File

@ -1,8 +1,10 @@
import subprocess
from time import sleep from time import sleep
import requests import requests
import textract import textract
from celery import shared_task from celery import shared_task
from django.conf import settings
from press_release_nl.processor.models import Entry, Text from press_release_nl.processor.models import Entry, Text
from press_release_nl.processor.services import create_highlighted_document from press_release_nl.processor.services import create_highlighted_document
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
for text in entry.texts.all(): for text in entry.texts.all():
re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text}) re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text}) re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
re_nearest = requests.post(
ML_HOST + "nearest/nearest", json={"data": text.text}
)
if re_bert.status_code != 200: if re_bert.status_code != 200:
print(re_bert.status_code, "bert") print(re_bert.status_code, "bert")
continue continue
if re_tf.status_code != 200: if re_tf.status_code != 200:
print(re_tf.status_code, "tf-idf") print(re_tf.status_code, "tf-idf")
continue continue
if re_nearest.status_code != 200:
print(re_nearest.status_code, "nearest")
continue
text.refresh_from_db() text.refresh_from_db()
text.score = { text.score = {
"bert": re_bert.json(), "bert": re_bert.json(),
"f": re_tf.json(), "f": re_tf.json(),
"nearest": re_nearest.json(),
} }
text.save(update_fields=["score"]) text.save(update_fields=["score"])
return pk return pk
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Render the highlighted document for one description variant and queue its PDF conversion.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key into ``Text.description`` selecting which variant to render.

    Returns:
        ``pk`` once the document has been created and the PDF task queued,
        or ``None`` when the variant already has a ``"file"`` entry and
        nothing was done.
    """
    text = Text.objects.get(pk=pk)
    if "file" in text.description[var]:
        # A document path is already recorded for this variant; do not rebuild.
        return
    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    # Persist only the column this task modified: ``text`` was loaded before
    # the document was built, so a full-row save could overwrite values
    # written to other columns (e.g. ``score``) in the meantime.
    text.save(update_fields=["description"])
    # Conversion runs as a separate task, scheduled one second from now.
    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
@shared_task
def convert_to_pdf(pk: int, var: str):
    """Convert a variant's highlighted document to PDF with headless LibreOffice.

    The resulting path is stored under ``Text.description[var]["pdf"]``.
    Nothing is done when that entry already exists.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key into ``Text.description`` selecting which variant to convert.

    Raises:
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status; in that case no PDF path is recorded.
    """
    import os.path

    text = Text.objects.get(pk=pk)
    if "pdf" in text.description[var]:
        return
    file_path = text.description[var]["file"]
    # NOTE(review): the input path assumes MEDIA_ROOT ends in "/media" and
    # that ``file_path`` starts with "/media/..." — confirm against settings.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            settings.MEDIA_ROOT + "/pdf",
            settings.MEDIA_ROOT.replace("/media", "") + file_path,
        ],
        # Fail the task instead of recording a path to a PDF that was never
        # produced.
        check=True,
    )
    # Swap only the extension; a plain ``replace("docx", "pdf")`` would also
    # rewrite "docx" occurring elsewhere in the file name.
    stem = os.path.splitext(file_path.split("/")[-1])[0]
    text.description[var]["pdf"] = "/media/pdf/" + stem + ".pdf"
    # Persist only the column this task modified so a stale in-memory copy
    # does not overwrite other columns.
    text.save(update_fields=["description"])