mirror of
https://github.com/Alexander-D-Karpov/akarpov
synced 2024-11-27 17:03:44 +03:00
Compare commits
2 Commits
8c7d2cded6
...
db88ec65e1
Author | SHA1 | Date | |
---|---|---|---|
|
db88ec65e1 | ||
45cd860803 |
0
akarpov/common/ml/__init__.py
Normal file
0
akarpov/common/ml/__init__.py
Normal file
51
akarpov/common/ml/text.py
Normal file
51
akarpov/common/ml/text.py
Normal file
|
@ -0,0 +1,51 @@
|
|||
import pycld2 as cld2
|
||||
import spacy
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
# load ml classes and models on first request
|
||||
# TODO: move to outer server/service
|
||||
nlp = None
|
||||
ru_nlp = None
|
||||
|
||||
ru_model = None
|
||||
ru_tokenizer = None
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
|
||||
def get_text_embedding(text: str):
    """Return RuBERT output embeddings for Russian *text*, or ``None``.

    The language is detected with ``cld2``. Embeddings are produced only when
    the detection is reliable and the top guess is Russian. English is
    recognised but has no model wired up yet, so it — like every other
    language and every unreliable detection — yields ``None``.

    The spaCy pipeline, the RuBERT model and its tokenizer are loaded lazily
    on the first Russian request and cached in module-level globals.

    Args:
        text: Plain text to embed.

    Returns:
        The first element of the model output for the lemmatised text, or
        ``None`` when no embedding can be produced.
        NOTE(review): for a BERT ``AutoModel`` this is presumably the
        per-token last hidden state rather than one pooled vector — confirm
        before storing it in a fixed-size vector column.
    """
    global ru_nlp, ru_model, ru_tokenizer

    try:
        is_reliable, _bytes_found, details = cld2.detect(text)
    except cld2.error:
        # cld2 refuses input it cannot decode (e.g. stray bytes in text
        # extracted from files); treat that the same as "unknown language".
        return None
    if not is_reliable:
        return None

    # details[0] is the best guess; its second item is the language code.
    lang = details[0][1]
    if lang != "ru":
        # "en" is detected but there is no English embedding model yet.
        return None

    if ru_nlp is None:
        ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
    if ru_model is None:
        ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    if ru_tokenizer is None:
        ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

    lemmas = " ".join(token.lemma_ for token in ru_nlp(text))
    encodings = ru_tokenizer(
        lemmas,
        padding=True,
        # Without truncation, inputs longer than the model's position limit
        # make the forward pass fail; cap at what the model supports.
        truncation=True,
        max_length=ru_model.config.max_position_embeddings,
        return_tensors="pt",
    )
    with torch.no_grad():  # inference only, no gradients needed
        outputs = ru_model(**encodings)
    return outputs[0]
|
10
akarpov/files/migrations/0025_create_vector_ps.py
Normal file
10
akarpov/files/migrations/0025_create_vector_ps.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
from django.db import migrations
|
||||
from pgvector.django import VectorExtension
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Run pgvector's ``VectorExtension`` operation for the ``files`` app.

    Must be applied before any migration that adds a vector column.
    """

    dependencies = [
        ("files", "0024_alter_file_options_alter_filereport_options_and_more"),
    ]

    operations = [
        VectorExtension(),
    ]
|
18
akarpov/files/migrations/0026_file_embeddings.py
Normal file
18
akarpov/files/migrations/0026_file_embeddings.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Generated by Django 4.2.5 on 2023-09-16 18:33
|
||||
|
||||
from django.db import migrations
|
||||
import pgvector.django
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the nullable 768-dimensional ``embeddings`` vector to ``file``."""

    dependencies = [("files", "0025_create_vector_ps")]

    operations = [
        migrations.AddField(
            model_name="file",
            name="embeddings",
            # Nullable so existing rows need no backfill.
            field=pgvector.django.VectorField(dimensions=768, null=True),
        ),
    ]
|
12
akarpov/files/migrations/0027_merge_20230925_2023.py
Normal file
12
akarpov/files/migrations/0027_merge_20230925_2023.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
# Generated by Django 4.2.5 on 2023-09-25 17:23
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Merge the two migration branches of the ``files`` app.

    Joins the ``0025_file_notify_user_on_view...`` branch with the
    ``0026_file_embeddings`` branch; it performs no schema changes itself.
    """

    dependencies = [
        ("files", "0025_file_notify_user_on_view_alter_basefileitem_parent"),
        ("files", "0026_file_embeddings"),
    ]

    operations = []
|
26
akarpov/files/migrations/0028_file_content_file_lang.py
Normal file
26
akarpov/files/migrations/0028_file_content_file_lang.py
Normal file
|
@ -0,0 +1,26 @@
|
|||
# Generated by Django 4.2.5 on 2023-09-26 09:04
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the ``content`` and ``lang`` columns to ``file``.

    Both fields use a one-off default to fill existing rows
    (``preserve_default=False``), so the default is not kept on the model.
    """

    dependencies = [("files", "0027_merge_20230925_2023")]

    operations = [
        # Text content of the file; existing rows start empty.
        migrations.AddField(
            model_name="file",
            name="content",
            field=models.TextField(default="", max_length=10000),
            preserve_default=False,
        ),
        # Language code of the content; existing rows are marked "en".
        migrations.AddField(
            model_name="file",
            name="lang",
            field=models.CharField(
                choices=[("ru", "ru"), ("en", "en")],
                default="en",
                max_length=2,
            ),
            preserve_default=False,
        ),
    ]
|
|
@ -17,6 +17,7 @@
|
|||
from django.urls import reverse
|
||||
from model_utils.fields import AutoCreatedField, AutoLastModifiedField
|
||||
from model_utils.models import TimeStampedModel
|
||||
from pgvector.django import VectorField
|
||||
from polymorphic.models import PolymorphicModel
|
||||
|
||||
from akarpov.files.services.files import trash_file_upload, user_unique_file_upload
|
||||
|
@ -69,6 +70,9 @@ class File(BaseFileItem, TimeStampedModel, ShortLinkModel, UserHistoryModel):
|
|||
|
||||
preview = FileField(blank=True, upload_to="file/previews/")
|
||||
file_obj = FileField(blank=False, upload_to=user_unique_file_upload)
|
||||
embeddings = VectorField(dimensions=768, null=True)
|
||||
content = TextField(max_length=10000)
|
||||
lang = CharField(max_length=2, choices=[("ru", "ru"), ("en", "en")])
|
||||
|
||||
# meta
|
||||
name = CharField(max_length=255, null=True, blank=True)
|
||||
|
|
7
akarpov/files/services/text.py
Normal file
7
akarpov/files/services/text.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
import textract
|
||||
|
||||
|
||||
def extract_file_text(file: str) -> str:
    """Extract the plain-text content of the document at path *file*.

    Args:
        file: Filesystem path of the document to read.

    Returns:
        The extracted text as ``str``.
    """
    raw = textract.process(file)
    # textract hands back encoded bytes; decode so the result matches the
    # declared ``str`` return type. Undecodable bytes are replaced rather
    # than aborting the whole extraction.
    if isinstance(raw, bytes):
        return raw.decode("utf-8", errors="replace")
    return raw
|
|
@ -132,6 +132,10 @@
|
|||
<footer class="row bg-light py-1 mt-auto text-center">
|
||||
<div class="col"> Written by <a href="/about">sanspie</a>, find source code <a href="https://github.com/Alexander-D-Karpov/akarpov">here</a> </div>
|
||||
</footer>
|
||||
<div id="toastContainer" class="toast-container position-fixed bottom-0 end-0 p-3">
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
@ -142,18 +146,63 @@
|
|||
{% endblock inline_javascript %}
|
||||
{% if request.user.is_authenticated %}
|
||||
<script>
|
||||
{# Open the notification socket on the host that served this page, using wss:// when the page itself was served over HTTPS. #}
{% if request.is_secure %}
let socket = new WebSocket(`wss://{{ request.get_host }}/ws/notifications/`);
{% else %}
let socket = new WebSocket(`ws://{{ request.get_host }}/ws/notifications/`);
{% endif %}
|
||||
|
||||
// Return a promise that resolves once `ms` milliseconds have elapsed,
// so callers can write `await sleep(ms)`.
function sleep(ms) {
    return new Promise(function (done) {
        setTimeout(done, ms);
    });
}
|
||||
|
||||
// Describe how long ago `date` was, using the largest unit that fits:
// "3 hours", "1 minute", "12 seconds".
//
// `date` may be a Date or a millisecond timestamp (e.g. from Date.parse).
// Units are singular when the count is 1, and a timestamp slightly in the
// future (clock skew between server and browser) is reported as "0 seconds"
// instead of a negative duration.
function timeSince(date) {
    const UNITS = [
        ["year", 31536000],
        ["month", 2592000],
        ["day", 86400],
        ["hour", 3600],
        ["minute", 60],
    ];
    const label = (count, unit) => `${count} ${unit}${count === 1 ? "" : "s"}`;

    const seconds = Math.floor((Date.now() - date) / 1000);
    for (const [unit, size] of UNITS) {
        const count = Math.floor(seconds / size);
        if (count >= 1) {
            return label(count, unit);
        }
    }
    return label(Math.max(seconds, 0), "second");
}
|
||||
|
||||
const toastContainer = document.getElementById('toastContainer')
|
||||
|
||||
|
||||
// Handle one notification message from the socket: parse the JSON payload
// (expects `title`, `body` and `created` fields) and show it as a Bootstrap
// toast inside `toastContainer`.
let fn = async function(event) {
    let data = JSON.parse(event.data)
    console.log(data)

    const toast = document.createElement("div")
    toast.id = "liveToast"
    toast.className = "toast mb-4 ml-2"
    toast.setAttribute("role", "alert")
    toast.setAttribute("aria-live", "assertive")
    toast.setAttribute("aria-atomic", "true")
    // Build the static skeleton only; payload values are filled in below
    // via textContent so notification text can never be parsed as markup.
    toast.innerHTML = `<div class="toast-header">
        <strong class="me-auto"></strong>
        <small></small>
        <button type="button" class="btn-close" data-bs-dismiss="toast" aria-label="Close"></button>
    </div>
    <div class="toast-body"></div>`
    toast.querySelector("strong").textContent = data.title
    toast.querySelector("small").textContent = `${timeSince(Date.parse(data.created))} ago`
    toast.querySelector(".toast-body").textContent = data.body

    // Drop the element once Bootstrap has hidden it, so dismissed toasts
    // do not pile up in the container.
    toast.addEventListener("hidden.bs.toast", () => toast.remove())

    toastContainer.appendChild(toast)
    const toastBootstrap = bootstrap.Toast.getOrCreateInstance(toast)
    toastBootstrap.show()
}
|
||||
|
||||
socket.onmessage = fn
|
||||
|
|
|
@ -25,9 +25,11 @@ WORKDIR ${APP_HOME}
|
|||
|
||||
# Install required system dependencies
|
||||
RUN apt-get update && \
    apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev && \
    # Dependencies for file preview generation
    apt-get install -y webp libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \
    # ML dependencies: none for now
    # Remove packages that are no longer needed and the apt lists to keep the layer small
    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
    rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
|
|
@ -1,5 +1,10 @@
|
|||
FROM postgres:16
|
||||
|
||||
# pgvector is packaged per PostgreSQL major version and must match the server
# in the base image; PG_MAJOR is set by the official postgres image.
RUN apt-get update && \
    apt-get install -y postgresql-${PG_MAJOR}-pgvector && \
    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
    rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY ./compose/production/postgres/maintenance /usr/local/bin/maintenance
|
||||
RUN chmod +x /usr/local/bin/maintenance/*
|
||||
RUN mv /usr/local/bin/maintenance/* /usr/local/bin \
|
||||
|
|
|
@ -61,5 +61,5 @@
|
|||
|
||||
# SHORTENER
|
||||
# ------------------------------------------------------------------------------
|
||||
SHORTENER_REDIRECT_TO = "http://127.0.0.1:8000"
|
||||
SHORTENER_HOST = "http://127.0.0.1:3000"
|
||||
SHORTENER_REDIRECT_TO = "https://dev2.akarpov.ru"
|
||||
SHORTENER_HOST = "https://dev.akarpov.ru"
|
||||
|
|
383
poetry.lock
generated
383
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
|
@ -107,6 +107,8 @@ pytest-xdist = "^3.3.1"
|
|||
pytest-mock = "^3.11.1"
|
||||
pytest-asyncio = "^0.21.1"
|
||||
pytest-lambda = "^2.2.0"
|
||||
pgvector = "^0.2.2"
|
||||
pycld2 = "^0.41"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
|
Loading…
Reference in New Issue
Block a user