mirror of
https://github.com/magnum-opus-nn-cp/backend.git
synced 2024-12-04 04:33:44 +03:00
updated doc process
This commit is contained in:
parent
65ac64e48a
commit
f1b5fff3b3
|
@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
|
|||
def get(self, request, *args, **kwargs):
|
||||
type = self.request.query_params.get("type")
|
||||
text = get_object_or_404(Text, id=self.kwargs["id"])
|
||||
run_mth = ["f", "bert"]
|
||||
run_mth = ["f", "bert", "nearest"]
|
||||
if type in run_mth:
|
||||
run_mth = [type]
|
||||
|
||||
|
@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
|
|||
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
||||
if re.status_code == 200:
|
||||
text.description["f"]["text"] = re.json()["text"]
|
||||
|
||||
if "nearest" not in text.description and "nearest" in run_mth:
|
||||
e = True
|
||||
text.description["nearest"] = {}
|
||||
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
|
||||
text.score["nearest"]["detailed"] = ""
|
||||
if e:
|
||||
text.save(update_fields=["description"])
|
||||
text.save(update_fields=["description", "score"])
|
||||
|
||||
else:
|
||||
text.description = {}
|
||||
|
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
|
|||
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
||||
if re.status_code == 200:
|
||||
text.description["f"]["text"] = re.json()["text"]
|
||||
text.save(update_fields=["description"])
|
||||
|
||||
if "nearest" in run_mth:
|
||||
text.description["nearest"] = {}
|
||||
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
|
||||
text.score["nearest"]["detailed"] = ""
|
||||
text.save(update_fields=["description", "score"])
|
||||
return Response(data=ProcessedTextSerializer().to_representation(instance=text))
|
||||
|
|
|
@ -20,26 +20,39 @@ def create_highlighted_document(pk: int, var: str) -> str:
|
|||
text = Text.objects.get(pk=pk).description[var]["text"]
|
||||
document = Document()
|
||||
p = document.add_paragraph()
|
||||
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
|
||||
if len(e) == 1:
|
||||
run = p.add_run()
|
||||
run.add_text(e[0][0])
|
||||
else:
|
||||
for k, v in e:
|
||||
if "data-value" in k:
|
||||
if v:
|
||||
run = p.add_run()
|
||||
run.add_text(v)
|
||||
w = float(k.split("=")[-1].replace('"', ""))
|
||||
tag = run._r
|
||||
run.font.size = Pt(11)
|
||||
shd = OxmlElement("w:shd")
|
||||
shd.set(qn("w:fill"), to_rgb(cmap(w)))
|
||||
tag.rPr.append(shd)
|
||||
else:
|
||||
if v:
|
||||
run = p.add_run()
|
||||
run.add_text(v)
|
||||
if var == "nearest":
|
||||
for el in text:
|
||||
w = el["features"][1] / 100 - 0.00001
|
||||
v = el["text"]
|
||||
if v:
|
||||
run = p.add_run()
|
||||
run.add_text(v)
|
||||
tag = run._r
|
||||
run.font.size = Pt(11)
|
||||
shd = OxmlElement("w:shd")
|
||||
shd.set(qn("w:fill"), to_rgb(cmap(w)))
|
||||
tag.rPr.append(shd)
|
||||
else:
|
||||
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
|
||||
if len(e) == 1:
|
||||
run = p.add_run()
|
||||
run.add_text(e[0][0])
|
||||
else:
|
||||
for k, v in e:
|
||||
if "data-value" in k:
|
||||
if v:
|
||||
run = p.add_run()
|
||||
run.add_text(v)
|
||||
w = float(k.split("=")[-1].replace('"', ""))
|
||||
tag = run._r
|
||||
run.font.size = Pt(11)
|
||||
shd = OxmlElement("w:shd")
|
||||
shd.set(qn("w:fill"), to_rgb(cmap(w)))
|
||||
tag.rPr.append(shd)
|
||||
else:
|
||||
if v:
|
||||
run = p.add_run()
|
||||
run.add_text(v)
|
||||
f = settings.MEDIA_ROOT + f"/docx/{pk}_{randint(1, 1000)}.docx"
|
||||
document.save(f)
|
||||
return f.replace(settings.MEDIA_ROOT, "/media")
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import subprocess
|
||||
from time import sleep
|
||||
|
||||
import requests
|
||||
import textract
|
||||
from celery import shared_task
|
||||
from django.conf import settings
|
||||
|
||||
from press_release_nl.processor.models import Entry, Text
|
||||
from press_release_nl.processor.services import create_highlighted_document
|
||||
|
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
|
|||
for text in entry.texts.all():
|
||||
re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
|
||||
re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
|
||||
re_nearest = requests.post(
|
||||
ML_HOST + "nearest/nearest", json={"data": text.text}
|
||||
)
|
||||
if re_bert.status_code != 200:
|
||||
print(re_bert.status_code, "bert")
|
||||
continue
|
||||
if re_tf.status_code != 200:
|
||||
print(re_tf.status_code, "tf-idf")
|
||||
continue
|
||||
if re_nearest.status_code != 200:
|
||||
print(re_nearest.status_code, "nearest")
|
||||
continue
|
||||
|
||||
text.refresh_from_db()
|
||||
text.score = {
|
||||
"bert": re_bert.json(),
|
||||
"f": re_tf.json(),
|
||||
"nearest": re_nearest.json(),
|
||||
}
|
||||
text.save(update_fields=["score"])
|
||||
return pk
|
||||
|
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
|
|||
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Render the highlighted DOCX for one description variant and queue its PDF conversion.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key into ``Text.description`` selecting which variant to render.

    Returns:
        ``pk`` when a new document was generated; ``None`` when a document
        path is already recorded for this variant and nothing was done.

    Raises:
        Text.DoesNotExist: If no ``Text`` row has the given ``pk``.
        KeyError: If ``var`` is not present in ``Text.description``.
    """
    text = Text.objects.get(pk=pk)
    if "file" in text.description[var]:
        # A document path is already stored for this variant; do not render again.
        return

    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    # Only ``description`` was modified here. Writing just that column keeps a
    # slow document build from overwriting ``score`` (or other fields) that
    # another task or request saved after this row was loaded.
    text.save(update_fields=["description"])

    # Hand the PDF conversion to its own task, scheduled with a short countdown.
    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
|
||||
|
||||
|
||||
@shared_task
def convert_to_pdf(pk: int, var: str):
    """Convert the stored DOCX of one description variant to PDF via LibreOffice.

    The DOCX location is read from ``Text.description[var]["file"]``; on
    success the PDF URL is written to ``Text.description[var]["pdf"]``.

    Args:
        pk: Primary key of the ``Text`` row to process.
        var: Key into ``Text.description`` selecting which variant to convert.

    Returns:
        None. Nothing is recorded when a PDF path already exists or when the
        LibreOffice process exits with a non-zero status.

    Raises:
        Text.DoesNotExist: If no ``Text`` row has the given ``pk``.
        KeyError: If ``var`` or its ``"file"`` entry is missing from
            ``Text.description``.
    """
    text = Text.objects.get(pk=pk)
    if "pdf" in text.description[var]:
        # A PDF path is already stored for this variant; nothing to do.
        return

    file_path = text.description[var]["file"]

    # ``file_path`` is a URL-style path beginning with "/media" (it is produced
    # by replacing MEDIA_ROOT with "/media"). Undo exactly that mapping by
    # swapping the leading "/media" back for MEDIA_ROOT, instead of stripping
    # every "/media" occurrence out of MEDIA_ROOT itself.
    media_prefix = "/media"
    if file_path.startswith(media_prefix):
        source_path = settings.MEDIA_ROOT + file_path[len(media_prefix):]
    else:
        # Unexpected shape: fall back to the original path computation.
        source_path = settings.MEDIA_ROOT.replace("/media", "") + file_path

    command = [
        "libreoffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        settings.MEDIA_ROOT + "/pdf",
        source_path,
    ]
    result = subprocess.run(command)
    if result.returncode != 0:
        # Conversion failed: do not store a link to a file that was never
        # written, so a later run of this task can try again.
        print(result.returncode, "libreoffice")
        return

    # The PDF keeps the DOCX base name; swap only the extension.
    docx_name = file_path.split("/")[-1]
    f_path = "/media/pdf/" + docx_name.rsplit(".", 1)[0] + ".pdf"
    text.description[var]["pdf"] = f_path
    # Persist only the column that changed to avoid clobbering concurrent
    # updates to other fields of the row.
    text.save(update_fields=["description"])
|
||||
|
|
Loading…
Reference in New Issue
Block a user