updated doc process

2025-08-04 20:10:12 +03:00 · 2023-09-09 19:59:39 +03:00 · 2023-09-09 19:59:39 +03:00 · f1b5fff3b3
commit f1b5fff3b3
parent 65ac64e48a
3 changed files with 78 additions and 23 deletions
--- a/press_release_nl/processor/api/views.py
+++ b/press_release_nl/processor/api/views.py
@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
    def get(self, request, *args, **kwargs):
        type = self.request.query_params.get("type")
        text = get_object_or_404(Text, id=self.kwargs["id"])
-        run_mth = ["f", "bert"]
+        run_mth = ["f", "bert", "nearest"]
        if type in run_mth:
            run_mth = [type]

@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
                re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
                if re.status_code == 200:
                    text.description["f"]["text"] = re.json()["text"]
+
+            if "nearest" not in text.description and "nearest" in run_mth:
+                e = True
+                text.description["nearest"] = {}
+                text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
+                text.score["nearest"]["detailed"] = ""
            if e:
-                text.save(update_fields=["description"])
+                text.save(update_fields=["description", "score"])

        else:
            text.description = {}
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
                re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
                if re.status_code == 200:
                    text.description["f"]["text"] = re.json()["text"]
-            text.save(update_fields=["description"])
+
+            if "nearest" in run_mth:
+                text.description["nearest"] = {}
+                text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
+                text.score["nearest"]["detailed"] = ""
+            text.save(update_fields=["description", "score"])
        return Response(data=ProcessedTextSerializer().to_representation(instance=text))
--- a/press_release_nl/processor/services.py
+++ b/press_release_nl/processor/services.py
@ -20,26 +20,39 @@ def create_highlighted_document(pk: int, var: str) -> str:
    text = Text.objects.get(pk=pk).description[var]["text"]
    document = Document()
    p = document.add_paragraph()
-    for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
-        if len(e) == 1:
-            run = p.add_run()
-            run.add_text(e[0][0])
-        else:
-            for k, v in e:
-                if "data-value" in k:
-                    if v:
-                        run = p.add_run()
-                        run.add_text(v)
-                        w = float(k.split("=")[-1].replace('"', ""))
-                        tag = run._r
-                        run.font.size = Pt(11)
-                        shd = OxmlElement("w:shd")
-                        shd.set(qn("w:fill"), to_rgb(cmap(w)))
-                        tag.rPr.append(shd)
-                else:
-                    if v:
-                        run = p.add_run()
-                        run.add_text(v)
+    if var == "nearest":
+        for el in text:
+            w = el["features"][1] / 100 - 0.00001
+            v = el["text"]
+            if v:
+                run = p.add_run()
+                run.add_text(v)
+                tag = run._r
+                run.font.size = Pt(11)
+                shd = OxmlElement("w:shd")
+                shd.set(qn("w:fill"), to_rgb(cmap(w)))
+                tag.rPr.append(shd)
+    else:
+        for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
+            if len(e) == 1:
+                run = p.add_run()
+                run.add_text(e[0][0])
+            else:
+                for k, v in e:
+                    if "data-value" in k:
+                        if v:
+                            run = p.add_run()
+                            run.add_text(v)
+                            w = float(k.split("=")[-1].replace('"', ""))
+                            tag = run._r
+                            run.font.size = Pt(11)
+                            shd = OxmlElement("w:shd")
+                            shd.set(qn("w:fill"), to_rgb(cmap(w)))
+                            tag.rPr.append(shd)
+                    else:
+                        if v:
+                            run = p.add_run()
+                            run.add_text(v)
    f = settings.MEDIA_ROOT + f"/docx/{pk}_{randint(1, 1000)}.docx"
    document.save(f)
    return f.replace(settings.MEDIA_ROOT, "/media")
--- a/press_release_nl/processor/tasks.py
+++ b/press_release_nl/processor/tasks.py
@ -1,8 +1,10 @@
+import subprocess
 from time import sleep

 import requests
 import textract
 from celery import shared_task
+from django.conf import settings

 from press_release_nl.processor.models import Entry, Text
 from press_release_nl.processor.services import create_highlighted_document
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
    for text in entry.texts.all():
        re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
        re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
+        re_nearest = requests.post(
+            ML_HOST + "nearest/nearest", json={"data": text.text}
+        )
        if re_bert.status_code != 200:
            print(re_bert.status_code, "bert")
            continue
        if re_tf.status_code != 200:
            print(re_tf.status_code, "tf-idf")
            continue
+        if re_nearest.status_code != 200:
+            print(re_nearest.status_code, "nearest")
+            continue
+
        text.refresh_from_db()
        text.score = {
            "bert": re_bert.json(),
            "f": re_tf.json(),
+            "nearest": re_nearest.json(),
        }
        text.save(update_fields=["score"])
    return pk
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
@shared_task
 def run_create_highlighted_document(pk: int, var: str):
    text = Text.objects.get(pk=pk)
+    if "file" in text.description[var]:
+        return
    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    text.save()
+    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
+
+
+@shared_task
+def convert_to_pdf(pk: int, var: str):
+    """Convert the DOCX at ``description[var]["file"]`` to PDF and store its media path.
+
+    Raises ``subprocess.CalledProcessError`` on failure, so no dangling "pdf" entry is saved.
+    """
+    text = Text.objects.get(pk=pk)
+    if "pdf" in text.description[var]:
+        return
+    file_path = text.description[var]["file"]
+    # Map "/media/..." back to disk by swapping only its leading prefix for MEDIA_ROOT.
+    docx_path = file_path.replace("/media", settings.MEDIA_ROOT, 1)
+    cmd = ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir"]
+    subprocess.run(cmd + [settings.MEDIA_ROOT + "/pdf", docx_path], check=True)
+    text.description[var]["pdf"] = "/media/pdf/" + file_path.split("/")[-1].replace("docx", "pdf")
+    text.save(update_fields=["description"])