backend/press_release_nl/processor/tasks.py

from time import sleep

import requests
import textract
from celery import shared_task

from press_release_nl.processor.models import Entry, Text
from press_release_nl.processor.services import create_highlighted_document
from press_release_nl.utils.celery import get_scheduled_tasks_name

# Base URL of the ML service; run_ml appends "bert/process" and
# "tfidf/process" to it when scoring texts.
# NOTE(review): hard-coded private-network address — presumably specific to one
# deployment; consider moving it to settings/environment.
ML_HOST = "http://192.168.107.95:8000/"
# ML_HOST = "https://dev2.akarpov.ru/"
# URL that load_text_sum posts the text body to; the response is stored in
# Text.summery.
ML_SUM_HOST = "https://dev.akarpov.ru/"


@shared_task
def load_text(pk: int):
    """Fill ``Text.text`` from the attached file, deleting rows with no text.

    If the row has no text yet, its file is passed through ``textract`` and
    the extracted text is stored. A row whose text is still empty afterwards
    is deleted.

    Args:
        pk: Primary key of the ``Text`` row to process.

    Returns:
        None in every case.
    """
    try:
        text = Text.objects.get(pk=pk)
    except Text.DoesNotExist:
        # The row disappeared before the worker picked the task up; treat it
        # the same way the other tasks in this module do and stop quietly.
        return
    if not text.text:
        # NOTE(review): the output is requested in "unicode_escape" but then
        # decoded with the default codec — confirm this pairing is intended.
        text.text = textract.process(
            text.file.path, encoding="unicode_escape", language="rus"
        ).decode()
        # Persist only the column changed here so values written by other
        # tasks while extraction was running are not overwritten.
        text.save(update_fields=["text"])
    if not text.text:
        # Nothing could be extracted: drop the row.
        text.delete()


@shared_task
def run_ml(pk: int, f=True):
    """Score every text of an entry with the BERT and tf-idf ML endpoints.

    The task re-queues itself (10 s countdown) when at least two ``run_ml``
    tasks are already scheduled, or when some text of the entry has no
    extracted text yet. For each text both endpoints must answer with HTTP 200
    for the score to be stored; otherwise that text is skipped.

    Args:
        pk: Primary key of the ``Entry`` whose texts are scored.
        f: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``pk`` once all texts were processed, or None when the task was
        re-queued or the entry does not exist.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Throttle: do not run more than a couple of ML jobs at the same time.
    if get_scheduled_tasks_name().count("press_release_nl.processor.tasks.run_ml") >= 2:
        run_ml.apply_async(kwargs={"pk": pk}, countdown=10)
        return
    try:
        entry = Entry.objects.get(pk=pk)
    except Entry.DoesNotExist:
        return
    # Wait until text extraction has finished for every text of the entry.
    if entry.texts.filter(text__isnull=True).exists():
        run_ml.apply_async(kwargs={"pk": pk}, countdown=10)
        return

    # (connect, read) timeout in seconds: without one, a stalled ML host would
    # block this worker forever.
    timeout = (10, 300)
    for text in entry.texts.all():
        payload = {"data": text.text}
        re_bert = requests.post(
            ML_HOST + "bert/process", json=payload, timeout=timeout
        )
        if re_bert.status_code != 200:
            print(re_bert.status_code, "bert")
            # No point in asking the second model: both results are required.
            continue
        re_tf = requests.post(
            ML_HOST + "tfidf/process", json=payload, timeout=timeout
        )
        if re_tf.status_code != 200:
            print(re_tf.status_code, "tf-idf")
            continue
        # Reload before writing: the row may have changed during the requests.
        text.refresh_from_db()
        text.score = {
            "bert": re_bert.json(),
            "f": re_tf.json(),
        }
        text.save(update_fields=["score"])
    return pk


@shared_task
def load_text_sum(pk: int):
    """Request a summary for a text and store it in ``Text.summery``.

    If the row has no text yet, the task waits three seconds and reloads it
    before posting the text to ``ML_SUM_HOST``. Whatever text is stored at
    that point is sent, even if it is still empty.

    Args:
        pk: Primary key of the ``Text`` row to summarise.

    Returns:
        ``pk`` on success, or None when the row does not exist (any more).

    Raises:
        ValueError: If the service answers with a status other than 200; the
            status code is the exception argument.
        requests.RequestException: On connection failure or timeout.
    """
    try:
        text = Text.objects.get(pk=pk)
        if not text.text:
            # Give text extraction a moment to finish, then reload. The row
            # may have been deleted meanwhile (load_text removes rows with no
            # text), which refresh_from_db reports as DoesNotExist.
            sleep(3)
            text.refresh_from_db()
    except Text.DoesNotExist:
        return

    # (connect, read) timeout in seconds so a stalled host cannot block the
    # worker indefinitely.
    response = requests.post(
        ML_SUM_HOST, json={"body": text.text}, timeout=(10, 300)
    )
    if response.status_code != 200:
        raise ValueError(response.status_code)
    summary = str(response.json())

    try:
        # Reload before writing: the row may have changed or vanished while
        # the request was in flight.
        text.refresh_from_db()
    except Text.DoesNotExist:
        return
    text.summery = summary
    text.save(update_fields=["summery"])
    return pk


@shared_task
def run_create_highlighted_document(pk: int, var: str):
    """Generate a highlighted document and record its path on the text.

    The path returned by ``create_highlighted_document`` is stored under
    ``Text.description[var]["file"]``.

    Args:
        pk: Primary key of the ``Text`` row.
        var: Key inside ``Text.description`` the document belongs to.

    Returns:
        ``pk`` on success, or None when the row does not exist (any more).

    Raises:
        KeyError: If ``var`` is not present in ``Text.description``.
    """
    try:
        text = Text.objects.get(pk=pk)
    except Text.DoesNotExist:
        # Same policy as the other tasks in this module: nothing to do.
        return
    file_path = create_highlighted_document(pk, var)
    try:
        # Document generation may take a while; reload so that changes made
        # meanwhile (e.g. another variant's file path) are not overwritten
        # with the stale copy loaded above.
        text.refresh_from_db()
    except Text.DoesNotExist:
        return
    text.description[var]["file"] = file_path
    # Only the description column is touched by this task.
    text.save(update_fields=["description"])
    return pk
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`from time import sleep`

added text process, ml process 2023-09-08 19:06:21 +03:00			`import requests`
			`import textract`
			`from celery import shared_task`

updated score calc 2023-09-09 10:07:17 +03:00			`from press_release_nl.processor.models import Entry, Text`
added description retrieve, word highlight 2023-09-09 13:39:49 +03:00			`from press_release_nl.processor.services import create_highlighted_document`
added ml tasks throttling 2023-09-09 14:13:08 +03:00			`from press_release_nl.utils.celery import get_scheduled_tasks_name`
added text process, ml process 2023-09-08 19:06:21 +03:00
updated score calc 2023-09-09 10:07:17 +03:00			`ML_HOST = "http://192.168.107.95:8000/"`
added description retrieve, word highlight 2023-09-09 13:39:49 +03:00			`# ML_HOST = "https://dev2.akarpov.ru/"`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`ML_SUM_HOST = "https://dev.akarpov.ru/"`
added text process, ml process 2023-09-08 19:06:21 +03:00

			`@shared_task`
			`def load_text(pk: int):`
			`text = Text.objects.get(pk=pk)`
			`if not text.text:`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`text.text = textract.process(`
			`text.file.path, encoding="unicode_escape", language="rus"`
			`).decode()`
added text process, ml process 2023-09-08 19:06:21 +03:00			`text.save()`
updated score calc 2023-09-09 10:07:17 +03:00			`if not text.text:`
			`text.delete()`
			`return`


			`@shared_task`
			`def run_ml(pk: int, f=True):`
added ml tasks throttling 2023-09-09 14:13:08 +03:00			`if get_scheduled_tasks_name().count("press_release_nl.processor.tasks.run_ml") >= 2:`
			`run_ml.apply_async(kwargs={"pk": pk}, countdown=10)`
			`return`
updated score calc 2023-09-09 10:07:17 +03:00			`try:`
			`entry = Entry.objects.get(pk=pk)`
			`except Entry.DoesNotExist:`
			`return`
			`if entry.texts.filter(text__isnull=True).exists():`
added ml tasks throttling 2023-09-09 14:13:08 +03:00			`run_ml.apply_async(kwargs={"pk": pk}, countdown=10)`
			`return`
updated score calc 2023-09-09 10:07:17 +03:00			`for text in entry.texts.all():`
			`re_bert = requests.post(ML_HOST + "bert/process", json={"data": text.text})`
			`re_tf = requests.post(ML_HOST + "tfidf/process", json={"data": text.text})`
			`if re_bert.status_code != 200:`
			`print(re_bert.status_code, "bert")`
			`continue`
			`if re_tf.status_code != 200:`
			`print(re_tf.status_code, "tf-idf")`
			`continue`
			`text.refresh_from_db()`
			`text.score = {`
			`"bert": re_bert.json(),`
			`"f": re_tf.json(),`
			`}`
			`text.save(update_fields=["score"])`
added text process, ml process 2023-09-08 19:06:21 +03:00			`return pk`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00

			`@shared_task`
			`def load_text_sum(pk: int):`
updated score calc 2023-09-09 10:07:17 +03:00			`try:`
			`text = Text.objects.get(pk=pk)`
			`except Text.DoesNotExist:`
			`return`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`if not text.text:`
			`sleep(3)`
			`text.refresh_from_db()`
			`re = requests.post(ML_SUM_HOST, json={"body": text.text})`
			`if re.status_code != 200:`
updated score calc 2023-09-09 10:07:17 +03:00			`raise ValueError(re.status_code)`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`data = re.json()`
updated score calc 2023-09-09 10:07:17 +03:00			`text.refresh_from_db()`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`text.summery = str(data)`
updated score calc 2023-09-09 10:07:17 +03:00			`text.save(update_fields=["summery"])`
added xlsx parse, ip lookup 2023-09-08 23:49:29 +03:00			`return pk`
added description retrieve, word highlight 2023-09-09 13:39:49 +03:00

			`@shared_task`
			`def run_create_highlighted_document(pk: int, var: str):`
			`text = Text.objects.get(pk=pk)`
			`file_path = create_highlighted_document(pk, var)`
			`text.description[var]["file"] = file_path`
			`text.save()`
			`return pk`