added better ml support, better site notifications

This commit is contained in:
Alexander Karpov 2023-09-26 12:23:00 +03:00
parent 513de19a16
commit 45cd860803
14 changed files with 243 additions and 342 deletions

View File

51
akarpov/common/ml/text.py Normal file
View File

@ -0,0 +1,51 @@
import pycld2 as cld2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer
# load ml classes and models on first request
# Every resource below starts as None, so importing this module loads no model.
# TODO: move to outer server/service
# NOTE(review): `nlp` is never assigned in the code visible here - presumably
# reserved for an English pipeline; confirm or remove.
nlp = None
# spaCy pipeline for Russian text; filled in by get_text_embedding.
ru_nlp = None
# Transformer model for Russian text; filled in by get_text_embedding.
ru_model = None
# Tokenizer paired with ru_model; filled in by get_text_embedding.
ru_tokenizer = None
# Preferred torch device: CUDA when torch reports it available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_text_embedding(text: str):
    """Return transformer embeddings for Russian *text*, or ``None``.

    The language is detected with CLD2. Embeddings are only produced when the
    detection is reliable and the language is Russian; English is recognised
    but has no model wired up yet, so it yields ``None`` like every other
    language. The spaCy pipeline, the model and its tokenizer are loaded on
    first use and cached in module-level globals.

    Args:
        text: Raw text to embed.

    Returns:
        The first element of the model output for the lemmatised text
        (presumably the last hidden state, one vector per token - TODO
        confirm against the caller), or ``None`` when no embedding can be
        produced.
    """
    global nlp, ru_nlp, ru_model, ru_tokenizer

    try:
        is_reliable, _, details = cld2.detect(text)
    except cld2.error:
        # CLD2 refuses input it considers invalid UTF-8 (e.g. stray control
        # characters from extracted documents); treat it as undetectable.
        return None
    if not is_reliable:
        return None

    lang = details[0][1]
    if lang != "ru":
        # "en" is a known language but has no embedding model yet; anything
        # else is unsupported.
        return None

    # Compare against None explicitly: the loaded objects' truthiness is an
    # implementation detail of their libraries.
    if ru_nlp is None:
        ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
    if ru_model is None:
        ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    if ru_tokenizer is None:
        ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

    lemma_text = " ".join(token.lemma_ for token in ru_nlp(text))
    encodings = ru_tokenizer(
        lemma_text,  # the text to be tokenized
        padding=True,  # pad to the longest sequence in the batch
        # Without truncation, input longer than the model's position
        # embeddings makes the forward pass fail.
        truncation=True,
        max_length=ru_model.config.max_position_embeddings,
        return_tensors="pt",  # return tensors (not lists)
    )
    with torch.no_grad():
        # inference only: no gradients needed
        outputs = ru_model(**encodings)
    return outputs[0]

View File

@ -0,0 +1,10 @@
from django.db import migrations
from pgvector.django import VectorExtension
class Migration(migrations.Migration):
    """Run pgvector's ``VectorExtension`` operation for the ``files`` app."""

    dependencies = [
        (
            "files",
            "0024_alter_file_options_alter_filereport_options_and_more",
        ),
    ]

    operations = [
        VectorExtension(),
    ]

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.5 on 2023-09-16 18:33
from django.db import migrations
import pgvector.django
class Migration(migrations.Migration):
    """Add the nullable 768-dimension ``embeddings`` vector field to ``file``."""

    dependencies = [("files", "0025_create_vector_ps")]

    operations = [
        migrations.AddField(
            model_name="file",
            name="embeddings",
            field=pgvector.django.VectorField(
                null=True,
                dimensions=768,
            ),
        ),
    ]

View File

@ -0,0 +1,12 @@
# Generated by Django 4.2.5 on 2023-09-25 17:23
from django.db import migrations
class Migration(migrations.Migration):
    """Merge two ``files`` migration branches; applies no operations."""

    # Both branch heads must be applied before this merge point.
    dependencies = [
        (
            "files",
            "0025_file_notify_user_on_view_alter_basefileitem_parent",
        ),
        (
            "files",
            "0026_file_embeddings",
        ),
    ]

    operations = []

View File

@ -0,0 +1,26 @@
# Generated by Django 4.2.5 on 2023-09-26 09:04
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``content`` and ``lang`` fields to the ``file`` model.

    Both fields are added with a one-off default (empty string for
    ``content``, ``"en"`` for ``lang``) that is not kept on the field
    afterwards (``preserve_default=False``).
    """

    dependencies = [("files", "0027_merge_20230925_2023")]

    operations = [
        migrations.AddField(
            model_name="file",
            name="content",
            preserve_default=False,
            field=models.TextField(max_length=10000, default=""),
        ),
        migrations.AddField(
            model_name="file",
            name="lang",
            preserve_default=False,
            field=models.CharField(
                max_length=2,
                default="en",
                choices=[("ru", "ru"), ("en", "en")],
            ),
        ),
    ]

View File

@ -17,6 +17,7 @@
from django.urls import reverse
from model_utils.fields import AutoCreatedField, AutoLastModifiedField
from model_utils.models import TimeStampedModel
from pgvector.django import VectorField
from polymorphic.models import PolymorphicModel
from akarpov.files.services.files import trash_file_upload, user_unique_file_upload
@ -69,6 +70,9 @@ class File(BaseFileItem, TimeStampedModel, ShortLinkModel, UserHistoryModel):
preview = FileField(blank=True, upload_to="file/previews/")
file_obj = FileField(blank=False, upload_to=user_unique_file_upload)
embeddings = VectorField(dimensions=768, null=True)
content = TextField(max_length=10000)
lang = CharField(max_length=2, choices=[("ru", "ru"), ("en", "en")])
# meta
name = CharField(max_length=255, null=True, blank=True)

View File

@ -0,0 +1,7 @@
import textract
def extract_file_text(file: str) -> str:
    """Extract the text of the document at path *file* using textract.

    Args:
        file: Path of the document to read.

    Returns:
        The extracted text as ``str``.
    """
    # textract.process returns a byte string (UTF-8 by default); decode it so
    # the declared ``str`` return type actually holds for callers.
    raw = textract.process(file)
    return raw.decode("utf-8")

View File

@ -132,6 +132,10 @@
<footer class="row bg-light py-1 mt-auto text-center">
<div class="col"> Written by <a href="/about">sanspie</a>, find source code <a href="https://github.com/Alexander-D-Karpov/akarpov">here</a> </div>
</footer>
<div id="toastContainer" class="toast-container position-fixed bottom-0 end-0 p-3">
</div>
</div>
</div>
</div>
@ -142,18 +146,63 @@
{% endblock inline_javascript %}
{% if request.user.is_authenticated %}
<script>
// Build the notifications endpoint from the page's own location, so the
// scheme (ws/wss) and host always match how the browser reached the site,
// including when TLS is terminated by a proxy in front of the app.
// `socket` is declared exactly once; a second `let socket` in this scope
// would be a SyntaxError.
const socketScheme = window.location.protocol === "https:" ? "wss" : "ws";
let socket = new WebSocket(`${socketScheme}://${window.location.host}/ws/notifications/`);
// Return a promise that resolves (with no value) after `ms` milliseconds.
function sleep(ms) {
    return new Promise(function (done) {
        setTimeout(done, ms);
    });
}
// Describe the time elapsed since `date` (a Date or a millisecond timestamp)
// using the largest whole unit, e.g. "3 days", "1 hour", "42 seconds".
// Invalid or future dates are reported as "0 seconds".
function timeSince(date) {
    // Unit sizes in seconds, largest first (year = 365 days, month = 30 days).
    const units = [
        [31536000, "year"],
        [2592000, "month"],
        [86400, "day"],
        [3600, "hour"],
        [60, "minute"],
    ];
    const plural = (count, unit) => `${count} ${unit}${count === 1 ? "" : "s"}`;

    let seconds = Math.floor((new Date() - date) / 1000);
    if (!Number.isFinite(seconds) || seconds < 0) {
        // Date.parse() yields NaN for unparsable input; clock skew can make
        // the difference negative. Neither should leak into the UI.
        seconds = 0;
    }

    for (const [size, unit] of units) {
        // `>=` so that exactly one unit (e.g. 60 s) is shown as "1 minute".
        if (seconds >= size) {
            return plural(Math.floor(seconds / size), unit);
        }
    }
    return plural(seconds, "second");
}
const toastContainer = document.getElementById('toastContainer')
// WebSocket message handler: parse the notification JSON (fields used:
// title, body, created) and show it as a Bootstrap toast in toastContainer.
let fn = async function(event) {
    const data = JSON.parse(event.data)
    console.log(data)

    const toast = document.createElement("div")
    toast.id = "liveToast"
    toast.className = "toast mb-4 ml-2"
    toast.setAttribute("role", "alert")
    toast.setAttribute("aria-live", "assertive")
    toast.setAttribute("aria-atomic", "true")
    // Only static markup goes through innerHTML; the notification's own
    // fields are filled in below via textContent so that any markup inside
    // them is displayed as text rather than interpreted as HTML.
    toast.innerHTML = `<div class="toast-header">
<strong class="me-auto"></strong>
<small></small>
<button type="button" class="btn-close" data-bs-dismiss="toast" aria-label="Close"></button>
</div>
<div class="toast-body"></div>`
    toast.querySelector("strong").textContent = data.title
    toast.querySelector("small").textContent = `${timeSince(Date.parse(data.created))} ago`
    toast.querySelector(".toast-body").textContent = data.body

    toastContainer.appendChild(toast)
    const toastBootstrap = bootstrap.Toast.getOrCreateInstance(toast)
    toastBootstrap.show()
}
socket.onmessage = fn

View File

@ -28,6 +28,8 @@ RUN apt-get update && \
apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev && \
# Dependencies for file preview generation
apt-get install -y webp libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \
# ML dependencies \
# none for now
apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
rm -rf /var/lib/apt/lists/*

View File

@ -1,5 +1,10 @@
FROM postgres:14
# Install the pgvector package for PostgreSQL 14, then drop packages that are
# no longer needed and the apt package lists to keep the layer small.
# apt configuration options are scoped with "::"; with single colons the
# option name does not match and the setting is never applied.
RUN apt-get update && \
    apt-get install -y postgresql-14-pgvector && \
    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
    rm -rf /var/lib/apt/lists/*
COPY ./compose/production/postgres/maintenance /usr/local/bin/maintenance
RUN chmod +x /usr/local/bin/maintenance/*
RUN mv /usr/local/bin/maintenance/* /usr/local/bin \

View File

@ -61,5 +61,5 @@
# SHORTENER
# ------------------------------------------------------------------------------
# NOTE(review): each setting is assigned twice in this view; as written, the
# later https://*.akarpov.ru values override the 127.0.0.1 ones. The first
# pair looks like leftover local-development values - confirm and drop them.
SHORTENER_REDIRECT_TO = "http://127.0.0.1:8000"
SHORTENER_HOST = "http://127.0.0.1:3000"
SHORTENER_REDIRECT_TO = "https://dev2.akarpov.ru"
SHORTENER_HOST = "https://dev.akarpov.ru"

383
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -107,6 +107,8 @@ pytest-xdist = "^3.3.1"
pytest-mock = "^3.11.1"
pytest-asyncio = "^0.21.1"
pytest-lambda = "^2.2.0"
pgvector = "^0.2.2"
pycld2 = "^0.41"
[build-system]