2023-09-08 23:49:29 +03:00
|
|
|
from time import sleep
|
|
|
|
|
2023-09-08 19:06:21 +03:00
|
|
|
import requests
|
|
|
|
import textract
|
|
|
|
from celery import shared_task
|
|
|
|
|
2023-09-09 10:07:17 +03:00
|
|
|
from press_release_nl.processor.models import Entry, Text
|
2023-09-09 13:39:49 +03:00
|
|
|
from press_release_nl.processor.services import create_highlighted_document
|
2023-09-09 14:13:08 +03:00
|
|
|
from press_release_nl.utils.celery import get_scheduled_tasks_name
|
2023-09-08 19:06:21 +03:00
|
|
|
|
2023-09-09 10:07:17 +03:00
|
|
|
# Base URL of the ML scoring service; run_ml appends "bert/process" and
# "tfidf/process" to it.
ML_HOST = "http://192.168.107.95:8000/"
|
2023-09-09 13:39:49 +03:00
|
|
|
# ML_HOST = "https://dev2.akarpov.ru/"
|
2023-09-08 23:49:29 +03:00
|
|
|
# URL that load_text_sum posts the text to in order to obtain a summary.
ML_SUM_HOST = "https://dev.akarpov.ru/"
|
2023-09-08 19:06:21 +03:00
|
|
|
|
|
|
|
|
|
|
|
@shared_task
def load_text(pk: int):
    """Extract the content of an uploaded file into its ``Text`` row.

    When the row has no text yet, the file at ``text.file.path`` is run
    through ``textract`` and the result is saved. A row that still has no
    text afterwards is deleted.

    Args:
        pk: Primary key of the ``Text`` row to process.

    Returns:
        None in every case.
    """
    try:
        text = Text.objects.get(pk=pk)
    except Text.DoesNotExist:
        # Same policy as the sibling tasks: a row that disappeared before the
        # task ran is not an error, there is simply nothing to load.
        return
    if not text.text:
        # NOTE(review): the output is requested in "unicode_escape" and then
        # decoded with the default codec, which presumably leaves non-ASCII
        # characters as literal \uXXXX escapes in the stored text — confirm
        # this is intended before changing it.
        text.text = textract.process(
            text.file.path, encoding="unicode_escape", language="rus"
        ).decode()
        text.save()
    if not text.text:
        # Nothing could be extracted, so the row is dropped.
        text.delete()
    return
|
|
|
|
|
|
|
|
|
|
|
|
@shared_task
def run_ml(pk: int, f=True):
    """Score every text of an entry with the BERT and TF-IDF endpoints.

    The task re-queues itself with a 10 second countdown when
    ``get_scheduled_tasks_name()`` lists this task at least twice, or when
    any text of the entry still has a NULL ``text``. Otherwise each text is
    posted to ``bert/process`` and ``tfidf/process`` under ``ML_HOST`` and
    the two JSON responses are stored in ``Text.score`` under the keys
    ``"bert"`` and ``"f"``. A text for which either endpoint answers with a
    non-200 status is skipped after printing the status code.

    Args:
        pk: Primary key of the ``Entry`` whose texts are scored.
        f: Unused by this task; kept so existing callers keep working.

    Returns:
        ``pk`` after the texts were iterated, or None when the task was
        re-queued or the entry does not exist.
    """
    task_name = "press_release_nl.processor.tasks.run_ml"
    if get_scheduled_tasks_name().count(task_name) >= 2:
        run_ml.apply_async(kwargs={"pk": pk}, countdown=10)
        return
    try:
        entry = Entry.objects.get(pk=pk)
    except Entry.DoesNotExist:
        return
    if entry.texts.filter(text__isnull=True).exists():
        # Some texts have no content yet; try again later.
        run_ml.apply_async(kwargs={"pk": pk}, countdown=10)
        return
    for text in entry.texts.all():
        payload = {"data": text.text}
        re_bert = requests.post(ML_HOST + "bert/process", json=payload)
        if re_bert.status_code != 200:
            print(re_bert.status_code, "bert")
            # Skip before calling TF-IDF: its result would be discarded.
            continue
        re_tf = requests.post(ML_HOST + "tfidf/process", json=payload)
        if re_tf.status_code != 200:
            print(re_tf.status_code, "tf-idf")
            continue
        # Reload so that only the score is changed on top of the current row.
        text.refresh_from_db()
        text.score = {
            "bert": re_bert.json(),
            "f": re_tf.json(),
        }
        text.save(update_fields=["score"])
    return pk
|
2023-09-08 23:49:29 +03:00
|
|
|
|
|
|
|
|
|
|
|
@shared_task
def load_text_sum(pk: int):
    """Request a summary for a text and store it in ``Text.summery``.

    If the row has no text yet, the task waits three seconds and reloads it
    once before posting ``text.text`` to ``ML_SUM_HOST``. The JSON response
    is stored as its ``str()`` representation.

    Args:
        pk: Primary key of the ``Text`` row to summarise.

    Returns:
        ``pk`` when the summary was saved, or None when the row does not
        exist (initially, or by the time it is reloaded).

    Raises:
        ValueError: The summary service answered with a non-200 status; the
            status code is the exception argument.
    """
    try:
        text = Text.objects.get(pk=pk)
    except Text.DoesNotExist:
        return
    if not text.text:
        # Give the text a moment to be filled in, then look again.
        sleep(3)
        try:
            text.refresh_from_db()
        except Text.DoesNotExist:
            # load_text deletes rows it cannot extract any text for.
            return
    response = requests.post(ML_SUM_HOST, json={"body": text.text})
    if response.status_code != 200:
        raise ValueError(response.status_code)
    data = response.json()
    try:
        # Reload so that only the summary is changed on top of the current row.
        text.refresh_from_db()
    except Text.DoesNotExist:
        # The row was removed while the request was in flight.
        return
    text.summery = str(data)
    text.save(update_fields=["summery"])
    return pk
|
2023-09-09 13:39:49 +03:00
|
|
|
|
|
|
|
|
|
|
|
@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Build a highlighted document for a text and record its path.

    Calls ``create_highlighted_document(pk, var)`` and stores the returned
    value under ``text.description[var]["file"]``.

    Args:
        pk: Primary key of the ``Text`` row.
        var: Key inside ``text.description`` whose ``"file"`` entry is set.

    Returns:
        ``pk`` once the path has been saved.

    Raises:
        Text.DoesNotExist: No ``Text`` row with this primary key exists.
    """
    # Fail early on a missing row, before the document is generated.
    text = Text.objects.get(pk=pk)
    file_path = create_highlighted_document(pk, var)
    # Reload and write only ``description`` so that values stored by other
    # tasks while the document was being generated (score, summery) are not
    # overwritten with the stale copy loaded above.
    # NOTE(review): assumes ``description`` is a concrete model field — confirm.
    text.refresh_from_db()
    text.description[var]["file"] = file_path
    text.save(update_fields=["description"])
    return pk
|