updated doc process

2025-09-11 14:12:27 +03:00 · 2023-09-09 19:59:39 +03:00 · 2023-09-09 19:59:39 +03:00 · f1b5fff3b3
commit f1b5fff3b3
parent 65ac64e48a
3 changed files with 78 additions and 23 deletions
--- a/press_release_nl/processor/api/views.py
+++ b/press_release_nl/processor/api/views.py
@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
    def get(self, request, *args, **kwargs):
        type = self.request.query_params.get("type")
        text = get_object_or_404(Text, id=self.kwargs["id"])
-        run_mth = ["f", "bert"]
+        run_mth = ["f", "bert", "nearest"]
        if type in run_mth:
            run_mth = [type]
@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
                re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
                if re.status_code == 200:
                    text.description["f"]["text"] = re.json()["text"]
            if "nearest" not in text.description and "nearest" in run_mth:
                e = True
                text.description["nearest"] = {}
                text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
                text.score["nearest"]["detailed"] = ""
            if e:
-                text.save(update_fields=["description"])
+                text.save(update_fields=["description", "score"])
        else:
            text.description = {}
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
                re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
                if re.status_code == 200:
                    text.description["f"]["text"] = re.json()["text"]
-            text.save(update_fields=["description"])
+
            if "nearest" in run_mth:
                text.description["nearest"] = {}
                text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
                text.score["nearest"]["detailed"] = ""
            text.save(update_fields=["description", "score"])
        return Response(data=ProcessedTextSerializer().to_representation(instance=text))
--- a/press_release_nl/processor/services.py
+++ b/press_release_nl/processor/services.py
@ -20,6 +20,19 @@ def create_highlighted_document(pk: int, var: str) -> str:
    text = Text.objects.get(pk=pk).description[var]["text"]
    document = Document()
    p = document.add_paragraph()
    if var == "nearest":
        for el in text:
            w = el["features"][1] / 100 - 0.00001
            v = el["text"]
            if v:
                run = p.add_run()
                run.add_text(v)
                tag = run._r
                run.font.size = Pt(11)
                shd = OxmlElement("w:shd")
                shd.set(qn("w:fill"), to_rgb(cmap(w)))
                tag.rPr.append(shd)
    else:
        for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
            if len(e) == 1:
                run = p.add_run()
--- a/press_release_nl/processor/tasks.py
+++ b/press_release_nl/processor/tasks.py
@ -1,8 +1,10 @@
 import subprocess
 from time import sleep
 import requests
 import textract
 from celery import shared_task
 from django.conf import settings
 from press_release_nl.processor.models import Entry, Text
 from press_release_nl.processor.services import create_highlighted_document
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
    for text in entry.texts.all():
        re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
        re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
        re_nearest = requests.post(
            ML_HOST + "nearest/nearest", json={"data": text.text}
        )
        if re_bert.status_code != 200:
            print(re_bert.status_code, "bert")
            continue
        if re_tf.status_code != 200:
            print(re_tf.status_code, "tf-idf")
            continue
        if re_nearest.status_code != 200:
            print(re_nearest.status_code, "nearest")
            continue
        text.refresh_from_db()
        text.score = {
            "bert": re_bert.json(),
            "f": re_tf.json(),
            "nearest": re_nearest.json(),
        }
        text.save(update_fields=["score"])
    return pk
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Create the highlighted document for one description variant of a text.

    Loads the ``Text`` with primary key ``pk``, renders its ``var``
    description via ``create_highlighted_document``, stores the returned
    path under ``description[var]["file"]`` and then queues
    ``convert_to_pdf`` for the same ``pk``/``var`` with a one second delay.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key inside ``Text.description`` selecting the variant.

    Returns:
        ``pk`` after the file path is stored and the PDF task is queued, or
        ``None`` when a file is already recorded for ``var``.

    Raises:
        KeyError: If ``var`` is not present in ``text.description``.
    """
    text = Text.objects.get(pk=pk)
    if "file" in text.description[var]:
        # A document was already generated for this variant; do not redo it.
        return
    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    # Write back only the column this task modified so that a stale copy of
    # the other fields (e.g. ``score``) held by this instance is not saved
    # over newer values.
    text.save(update_fields=["description"])
    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
@shared_task
def convert_to_pdf(pk: int, var: str):
    """Convert the stored document of one description variant to PDF.

    Reads ``description[var]["file"]`` from the ``Text`` with primary key
    ``pk``, runs headless LibreOffice to convert that file into the
    ``pdf`` directory under ``settings.MEDIA_ROOT`` and records the
    resulting media path under ``description[var]["pdf"]``.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key inside ``Text.description`` selecting the variant.

    Returns:
        ``None`` in every case; the function exits early when a PDF is
        already recorded for ``var``.

    Raises:
        KeyError: If ``var`` or its ``"file"`` entry is missing from
            ``text.description``.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status, so that no path to a missing PDF is stored.
    """
    from pathlib import PurePosixPath

    text = Text.objects.get(pk=pk)
    if "pdf" in text.description[var]:
        # Conversion already recorded for this variant.
        return
    file_path = text.description[var]["file"]
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            settings.MEDIA_ROOT + "/pdf",
            # NOTE(review): presumably ``file_path`` starts with "/media",
            # which is why that part is stripped from MEDIA_ROOT before
            # joining - confirm against create_highlighted_document.
            settings.MEDIA_ROOT.replace("/media", "") + file_path,
        ],
        # Fail loudly instead of recording a PDF that was never produced.
        check=True,
    )
    # Swap only the extension: a blanket replace of "docx" would also rewrite
    # that substring elsewhere in the file name and point at a wrong file.
    f_path = "/media/pdf/" + PurePosixPath(file_path).stem + ".pdf"
    text.description[var]["pdf"] = f_path
    # Persist only the modified column; the instance was loaded before the
    # slow conversion, so its other fields may be out of date.
    text.save(update_fields=["description"])