mirror of
https://github.com/magnum-opus-nn-cp/backend.git
synced 2024-09-21 03:08:54 +03:00
updated doc process
This commit is contained in:
parent
65ac64e48a
commit
f1b5fff3b3
|
@ -52,7 +52,7 @@ class UpdateTextDescriptionApiView(generics.GenericAPIView):
|
||||||
def get(self, request, *args, **kwargs):
|
def get(self, request, *args, **kwargs):
|
||||||
type = self.request.query_params.get("type")
|
type = self.request.query_params.get("type")
|
||||||
text = get_object_or_404(Text, id=self.kwargs["id"])
|
text = get_object_or_404(Text, id=self.kwargs["id"])
|
||||||
run_mth = ["f", "bert"]
|
run_mth = ["f", "bert", "nearest"]
|
||||||
if type in run_mth:
|
if type in run_mth:
|
||||||
run_mth = [type]
|
run_mth = [type]
|
||||||
|
|
||||||
|
@ -71,8 +71,14 @@ def get(self, request, *args, **kwargs):
|
||||||
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
||||||
if re.status_code == 200:
|
if re.status_code == 200:
|
||||||
text.description["f"]["text"] = re.json()["text"]
|
text.description["f"]["text"] = re.json()["text"]
|
||||||
|
|
||||||
|
if "nearest" not in text.description and "nearest" in run_mth:
|
||||||
|
e = True
|
||||||
|
text.description["nearest"] = {}
|
||||||
|
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
|
||||||
|
text.score["nearest"]["detailed"] = ""
|
||||||
if e:
|
if e:
|
||||||
text.save(update_fields=["description"])
|
text.save(update_fields=["description", "score"])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
text.description = {}
|
text.description = {}
|
||||||
|
@ -87,5 +93,10 @@ def get(self, request, *args, **kwargs):
|
||||||
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
re = requests.post(ML_HOST + "tfidf/describe", json={"data": text.text})
|
||||||
if re.status_code == 200:
|
if re.status_code == 200:
|
||||||
text.description["f"]["text"] = re.json()["text"]
|
text.description["f"]["text"] = re.json()["text"]
|
||||||
text.save(update_fields=["description"])
|
|
||||||
|
if "nearest" in run_mth:
|
||||||
|
text.description["nearest"] = {}
|
||||||
|
text.description["nearest"]["text"] = text.score["nearest"]["detailed"]
|
||||||
|
text.score["nearest"]["detailed"] = ""
|
||||||
|
text.save(update_fields=["description", "score"])
|
||||||
return Response(data=ProcessedTextSerializer().to_representation(instance=text))
|
return Response(data=ProcessedTextSerializer().to_representation(instance=text))
|
||||||
|
|
|
@ -20,6 +20,19 @@ def create_highlighted_document(pk: int, var: str) -> str:
|
||||||
text = Text.objects.get(pk=pk).description[var]["text"]
|
text = Text.objects.get(pk=pk).description[var]["text"]
|
||||||
document = Document()
|
document = Document()
|
||||||
p = document.add_paragraph()
|
p = document.add_paragraph()
|
||||||
|
if var == "nearest":
|
||||||
|
for el in text:
|
||||||
|
w = el["features"][1] / 100 - 0.00001
|
||||||
|
v = el["text"]
|
||||||
|
if v:
|
||||||
|
run = p.add_run()
|
||||||
|
run.add_text(v)
|
||||||
|
tag = run._r
|
||||||
|
run.font.size = Pt(11)
|
||||||
|
shd = OxmlElement("w:shd")
|
||||||
|
shd.set(qn("w:fill"), to_rgb(cmap(w)))
|
||||||
|
tag.rPr.append(shd)
|
||||||
|
else:
|
||||||
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
|
for e in [[y.split(">") for y in x.split("<")] for x in text.split("<span ")]:
|
||||||
if len(e) == 1:
|
if len(e) == 1:
|
||||||
run = p.add_run()
|
run = p.add_run()
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
|
import subprocess
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import textract
|
import textract
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
from press_release_nl.processor.models import Entry, Text
|
from press_release_nl.processor.models import Entry, Text
|
||||||
from press_release_nl.processor.services import create_highlighted_document
|
from press_release_nl.processor.services import create_highlighted_document
|
||||||
|
@ -41,16 +43,24 @@ def run_ml(pk: int, f=True):
|
||||||
for text in entry.texts.all():
|
for text in entry.texts.all():
|
||||||
re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
|
re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})
|
||||||
re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
|
re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})
|
||||||
|
re_nearest = requests.post(
|
||||||
|
ML_HOST + "nearest/nearest", json={"data": text.text}
|
||||||
|
)
|
||||||
if re_bert.status_code != 200:
|
if re_bert.status_code != 200:
|
||||||
print(re_bert.status_code, "bert")
|
print(re_bert.status_code, "bert")
|
||||||
continue
|
continue
|
||||||
if re_tf.status_code != 200:
|
if re_tf.status_code != 200:
|
||||||
print(re_tf.status_code, "tf-idf")
|
print(re_tf.status_code, "tf-idf")
|
||||||
continue
|
continue
|
||||||
|
if re_nearest.status_code != 200:
|
||||||
|
print(re_nearest.status_code, "nearest")
|
||||||
|
continue
|
||||||
|
|
||||||
text.refresh_from_db()
|
text.refresh_from_db()
|
||||||
text.score = {
|
text.score = {
|
||||||
"bert": re_bert.json(),
|
"bert": re_bert.json(),
|
||||||
"f": re_tf.json(),
|
"f": re_tf.json(),
|
||||||
|
"nearest": re_nearest.json(),
|
||||||
}
|
}
|
||||||
text.save(update_fields=["score"])
|
text.save(update_fields=["score"])
|
||||||
return pk
|
return pk
|
||||||
|
@ -78,7 +88,28 @@ def load_text_sum(pk: int):
|
||||||
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Build the highlighted document for one description variant of a text.

    Loads the ``Text`` with primary key ``pk``. If ``description[var]``
    already has a ``"file"`` entry the task returns ``None`` without doing
    anything. Otherwise it calls ``create_highlighted_document``, stores the
    returned path under ``description[var]["file"]``, schedules
    ``convert_to_pdf`` for the same ``pk``/``var`` and returns ``pk``.

    Args:
        pk: Primary key of the ``Text`` row.
        var: Key inside ``Text.description`` selecting the variant.

    Raises:
        Text.DoesNotExist: if no ``Text`` with this ``pk`` exists.
        KeyError: if ``var`` is not a key of ``text.description``.
    """
    text = Text.objects.get(pk=pk)
    if "file" in text.description[var]:
        # Document was already generated for this variant.
        return
    file_path = create_highlighted_document(pk, var)
    text.description[var]["file"] = file_path
    # Write only the column this task changed: a full-row save would push
    # back whatever stale values this instance holds for other fields
    # (e.g. ``score``) that may have been updated while the document was
    # being generated.
    text.save(update_fields=["description"])
    # Short countdown before the PDF conversion task is picked up.
    convert_to_pdf.apply_async(kwargs={"pk": pk, "var": var}, countdown=1)
    return pk
|
||||||
|
|
||||||
|
|
||||||
|
@shared_task
def convert_to_pdf(pk: int, var: str):
    """Convert the highlighted document of a text variant to PDF.

    Reads ``description[var]["file"]`` of the ``Text`` with primary key
    ``pk``, runs headless LibreOffice to write a PDF into
    ``<MEDIA_ROOT>/pdf`` and stores the resulting path under
    ``description[var]["pdf"]``. Returns early, doing nothing, when a
    ``"pdf"`` entry is already present.

    Args:
        pk: Primary key of the ``Text`` row.
        var: Key inside ``Text.description`` selecting the variant.

    Raises:
        Text.DoesNotExist: if no ``Text`` with this ``pk`` exists.
        KeyError: if ``var`` or its ``"file"`` entry is missing.
        subprocess.CalledProcessError: if LibreOffice exits with a non-zero
            status; no ``"pdf"`` entry is stored in that case.
    """
    text = Text.objects.get(pk=pk)
    if "pdf" in text.description[var]:
        # Already converted for this variant.
        return
    file_path = text.description[var]["file"]

    output_dir = settings.MEDIA_ROOT + "/pdf"
    # NOTE(review): presumably file_path is a "/media/..."-style path, so
    # removing "/media" from MEDIA_ROOT yields the directory it is relative
    # to. ``replace`` removes every occurrence, not only a trailing one;
    # confirm against the MEDIA_ROOT setting.
    source_path = settings.MEDIA_ROOT.replace("/media", "") + file_path

    # check=True: do not record a PDF path when the conversion failed.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            output_dir,
            source_path,
        ],
        check=True,
    )

    # Swap only the extension: a blanket replace("docx", "pdf") would also
    # rewrite "docx" occurring inside the file name itself and point at a
    # file that LibreOffice never wrote.
    base_name = file_path.split("/")[-1]
    stem = base_name.rsplit(".", 1)[0]
    text.description[var]["pdf"] = "/media/pdf/" + stem + ".pdf"
    # Persist only the column changed here so other fields are not
    # overwritten with stale values from this instance.
    text.save(update_fields=["description"])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user