From 45cd860803ba3b748021f4d9bcd488190d424120 Mon Sep 17 00:00:00 2001 From: Alexander-D-Karpov Date: Tue, 26 Sep 2023 12:23:00 +0300 Subject: [PATCH] added ml better support, better site notifications --- akarpov/common/ml/__init__.py | 0 akarpov/common/ml/text.py | 51 +++ .../files/migrations/0025_create_vector_ps.py | 10 + .../files/migrations/0026_file_embeddings.py | 18 + .../migrations/0027_merge_20230925_2023.py | 12 + .../migrations/0028_file_content_file_lang.py | 26 ++ akarpov/files/models.py | 4 + akarpov/files/services/text.py | 7 + akarpov/templates/base.html | 59 ++- compose/local/django/Dockerfile | 4 +- compose/production/postgres/Dockerfile | 5 + config/settings/local.py | 4 +- poetry.lock | 383 +++--------------- pyproject.toml | 2 + 14 files changed, 243 insertions(+), 342 deletions(-) create mode 100644 akarpov/common/ml/__init__.py create mode 100644 akarpov/common/ml/text.py create mode 100644 akarpov/files/migrations/0025_create_vector_ps.py create mode 100644 akarpov/files/migrations/0026_file_embeddings.py create mode 100644 akarpov/files/migrations/0027_merge_20230925_2023.py create mode 100644 akarpov/files/migrations/0028_file_content_file_lang.py create mode 100644 akarpov/files/services/text.py diff --git a/akarpov/common/ml/__init__.py b/akarpov/common/ml/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/akarpov/common/ml/text.py b/akarpov/common/ml/text.py new file mode 100644 index 0000000..afe1b53 --- /dev/null +++ b/akarpov/common/ml/text.py @@ -0,0 +1,51 @@ +import pycld2 as cld2 +import spacy +import torch +from transformers import AutoModel, AutoTokenizer + +# load ml classes and models on first request +# TODO: move to outer server/service +nlp = None +ru_nlp = None + +ru_model = None +ru_tokenizer = None +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def get_text_embedding(text: str): + global nlp, ru_nlp, ru_model, ru_tokenizer + + is_reliable, text_bytes_found, details = cld2.detect(text) + if is_reliable: + lang = details[0] + if lang[1] in ["ru", "en"]: + lang = lang[1] + else: + return None + else: + return None + + if lang == "ru": + if not ru_nlp: + ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"]) + lema = " ".join([token.lemma_ for token in ru_nlp(text)]) + if not ru_model: + ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased") + if not ru_tokenizer: + ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased") + encodings = ru_tokenizer( + lema, # the texts to be tokenized + padding=True, # pad the texts to the maximum length (so that all outputs have the same length) + return_tensors="pt", # return the tensors (not lists) + ) + with torch.no_grad(): + # get the model embeddings + embeds = ru_model(**encodings) + embeds = embeds[0] + elif lang == "en": + embeds = None + else: + embeds = None + + return embeds diff --git a/akarpov/files/migrations/0025_create_vector_ps.py b/akarpov/files/migrations/0025_create_vector_ps.py new file mode 100644 index 0000000..9b03df1 --- /dev/null +++ b/akarpov/files/migrations/0025_create_vector_ps.py @@ -0,0 +1,10 @@ +from django.db import migrations +from pgvector.django import VectorExtension + + +class Migration(migrations.Migration): + dependencies = [ + ("files", "0024_alter_file_options_alter_filereport_options_and_more"), + ] + + operations = [VectorExtension()] diff --git a/akarpov/files/migrations/0026_file_embeddings.py b/akarpov/files/migrations/0026_file_embeddings.py new file mode 100644 index 0000000..9efc1c8 --- /dev/null +++ b/akarpov/files/migrations/0026_file_embeddings.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.5 on 2023-09-16 18:33 + +from django.db import migrations +import pgvector.django + + +class Migration(migrations.Migration): + dependencies = [ + ("files", "0025_create_vector_ps"), + ] + + operations = [ + migrations.AddField( + model_name="file", + name="embeddings", + field=pgvector.django.VectorField(dimensions=768, null=True), + ), + ] diff --git a/akarpov/files/migrations/0027_merge_20230925_2023.py b/akarpov/files/migrations/0027_merge_20230925_2023.py new file mode 100644 index 0000000..c3fb947 --- /dev/null +++ b/akarpov/files/migrations/0027_merge_20230925_2023.py @@ -0,0 +1,12 @@ +# Generated by Django 4.2.5 on 2023-09-25 17:23 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("files", "0025_file_notify_user_on_view_alter_basefileitem_parent"), + ("files", "0026_file_embeddings"), + ] + + operations = [] diff --git a/akarpov/files/migrations/0028_file_content_file_lang.py b/akarpov/files/migrations/0028_file_content_file_lang.py new file mode 100644 index 0000000..6c5c0ac --- /dev/null +++ b/akarpov/files/migrations/0028_file_content_file_lang.py @@ -0,0 +1,26 @@ +# Generated by Django 4.2.5 on 2023-09-26 09:04 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("files", "0027_merge_20230925_2023"), + ] + + operations = [ + migrations.AddField( + model_name="file", + name="content", + field=models.TextField(default="", max_length=10000), + preserve_default=False, + ), + migrations.AddField( + model_name="file", + name="lang", + field=models.CharField( + choices=[("ru", "ru"), ("en", "en")], default="en", max_length=2 + ), + preserve_default=False, + ), + ] diff --git a/akarpov/files/models.py b/akarpov/files/models.py index affa15f..15081b5 100644 --- a/akarpov/files/models.py +++ b/akarpov/files/models.py @@ -17,6 +17,7 @@ from django.urls import reverse from model_utils.fields import AutoCreatedField, AutoLastModifiedField from model_utils.models import TimeStampedModel +from pgvector.django import VectorField from polymorphic.models import PolymorphicModel from akarpov.files.services.files import trash_file_upload, user_unique_file_upload @@ -69,6 +70,9 @@ class File(BaseFileItem, TimeStampedModel, ShortLinkModel, UserHistoryModel): preview = FileField(blank=True, upload_to="file/previews/") file_obj = FileField(blank=False, upload_to=user_unique_file_upload) + embeddings = VectorField(dimensions=768, null=True) + content = TextField(max_length=10000) + lang = CharField(max_length=2, choices=[("ru", "ru"), ("en", "en")]) # meta name = CharField(max_length=255, null=True, blank=True) diff --git a/akarpov/files/services/text.py b/akarpov/files/services/text.py new file mode 100644 index 0000000..ca65517 --- /dev/null +++ b/akarpov/files/services/text.py @@ -0,0 +1,7 @@ +import textract + + +def extract_file_text(file: str) -> str: + text = textract.process(file) + + return text diff --git a/akarpov/templates/base.html b/akarpov/templates/base.html index e352fe2..0f3a33d 100644 --- a/akarpov/templates/base.html +++ b/akarpov/templates/base.html @@ -132,6 +132,10 @@ +
+ +
+ @@ -142,18 +146,63 @@ {% endblock inline_javascript %} {% if request.user.is_authenticated %}