Removed unused dependencies from the project; moved file processing to an external service

This commit is contained in:
Alexander Karpov 2024-10-14 21:34:16 +03:00
parent 03c7c5309c
commit f5835d2821
27 changed files with 2761 additions and 5569 deletions

View File

@ -15,3 +15,5 @@ LAST_FM_SECRET=
SPOTIFY_ID=
SPOTIFY_SECRET=
YANDEX_TOKEN=
PREVIEW_SERVICE_API_KEY=
PREVIEW_SERVICE_URL=

View File

@ -1,51 +0,0 @@
import pycld2 as cld2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer
# load ml classes and models on first request
# TODO: move to outer server/service
nlp = None
ru_nlp = None
ru_model = None
ru_tokenizer = None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_text_embedding(text: str):
    """Return RuBERT embeddings for text that is reliably detected as Russian.

    The language is detected with cld2. Only a reliable "ru" detection
    produces embeddings; unreliable detections, English text and every other
    language yield ``None``.

    The spaCy pipeline, the model and the tokenizer are loaded on first use
    and cached in module-level globals.

    Args:
        text: Raw text to embed.

    Returns:
        The first element of the model output for the lemmatized text, or
        ``None`` when the text is not reliably Russian.
    """
    global ru_nlp, ru_model, ru_tokenizer

    is_reliable, _bytes_found, details = cld2.detect(text)
    if not is_reliable:
        return None
    # details[0] is the top guess; its second field is the language code.
    if details[0][1] != "ru":
        return None

    if not ru_nlp:
        ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
    lemmas = " ".join(token.lemma_ for token in ru_nlp(text))

    if not ru_model:
        ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    if not ru_tokenizer:
        ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

    encodings = ru_tokenizer(
        lemmas,
        padding=True,
        # Without truncation, text longer than the encoder's position limit
        # makes the forward pass fail.
        truncation=True,
        max_length=ru_model.config.max_position_embeddings,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = ru_model(**encodings)
    return outputs[0]

View File

@ -1,16 +1,10 @@
import textract
from akarpov.files.models import File
def view(file: File):
    """Render the stored text content of a file as HTML paragraphs.

    Tabs are replaced with spaces and every line of ``file.content`` becomes
    one ``<p class='mt-1'>`` element. Each line is HTML-escaped because the
    text comes from an uploaded document and is placed directly into markup.

    Args:
        file: Object whose ``content`` attribute holds the extracted text;
            ``None`` is treated as empty text.

    Returns:
        A ``(static, content)`` tuple: ``static`` is always ``""`` and
        ``content`` is the generated HTML.
    """
    from html import escape

    static = ""
    text = (file.content or "").replace("\t", " ")
    content = "".join(
        f"<p class='mt-1'>{escape(line)}</p>" for line in text.split("\n")
    )
    return static, content

View File

@ -1,16 +1,10 @@
import textract
from akarpov.files.models import File
def view(file: File):
    """Render the stored text content of a file as HTML paragraphs.

    Tabs are replaced with spaces and every line of ``file.content`` becomes
    one ``<p class='mt-1'>`` element. Each line is HTML-escaped because the
    text comes from an uploaded document and is placed directly into markup.

    Args:
        file: Object whose ``content`` attribute holds the extracted text;
            ``None`` is treated as empty text.

    Returns:
        A ``(static, content)`` tuple: ``static`` is always ``""`` and
        ``content`` is the generated HTML.
    """
    from html import escape

    static = ""
    text = (file.content or "").replace("\t", " ")
    content = "".join(
        f"<p class='mt-1'>{escape(line)}</p>" for line in text.split("\n")
    )
    return static, content

View File

@ -1,16 +1,10 @@
import textract
from akarpov.files.models import File
def view(file: File):
    """Render the stored text content of a file as HTML paragraphs.

    Tabs are replaced with spaces and every line of ``file.content`` becomes
    one ``<p class='mt-1'>`` element. Each line is HTML-escaped because the
    text comes from an uploaded document and is placed directly into markup.

    Args:
        file: Object whose ``content`` attribute holds the extracted text;
            ``None`` is treated as empty text.

    Returns:
        A ``(static, content)`` tuple: ``static`` is always ``""`` and
        ``content`` is the generated HTML.
    """
    from html import escape

    static = ""
    text = (file.content or "").replace("\t", " ")
    content = "".join(
        f"<p class='mt-1'>{escape(line)}</p>" for line in text.split("\n")
    )
    return static, content

View File

@ -1,5 +1,3 @@
import textract
from akarpov.files.models import File
@ -7,11 +5,7 @@ def view(file: File) -> (str, str):
static = f"""
<meta property="og:title" content="{file.name}" />
"""
text = (
textract.process(file.file.path, extension="ogg", output_encoding="utf8")
.decode("utf8")
.replace("\t", " ")
)
text = file.content.replace("\t", " ")
content = (
"""
<div id="waveform">

View File

@ -1,42 +0,0 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
# Set up stop words
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))
# Set up lemmatizers
english_lemmatizer = None
russian_lemmatizer = None
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English and Russian stop words.

    Args:
        text: Text to normalize.
        language: ``"russian"`` selects the pymorphy analyzer; any other
            value falls back to the WordNet lemmatizer.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    global english_lemmatizer, russian_lemmatizer

    use_russian = language == "russian"
    lemmas = []
    for word in word_tokenize(text):
        if use_russian:
            # Lemmatizers are created lazily, on the first token that needs one.
            if not russian_lemmatizer:
                russian_lemmatizer = MorphAnalyzer()
            lemmas.append(russian_lemmatizer.parse(word)[0].normal_form)
            continue
        if not english_lemmatizer:
            english_lemmatizer = WordNetLemmatizer()
        lemmas.append(english_lemmatizer.lemmatize(word))

    # A lemma is dropped when it appears in either stop-word set.
    return " ".join(
        lemma
        for lemma in lemmas
        if not (lemma in english_stopwords or lemma in russian_stopwords)
    )

View File

@ -1,8 +1,4 @@
from math import ceil
import magic
from PIL import Image, ImageDraw, ImageFont
from preview_generator.manager import PreviewManager
from akarpov.files.models import File
@ -19,90 +15,11 @@
manager = None
def textfile_to_image(textfile_path) -> Image:
    """Render the contents of a text file as a grayscale image.

    The first font from ``COMMON_MONO_FONT_FILENAMES`` that can be loaded is
    used; PIL's default font is the fallback. An empty file produces an image
    that consists of the margins only.

    Args:
        textfile_path: Path of the text file whose content is rendered.

    Returns:
        A ``PIL_GRAYSCALE`` image with the text drawn on a white background.
    """
    # Read the lines, dropping whitespace on the right side.
    with open(textfile_path) as f:
        lines = tuple(line.rstrip() for line in f)

    font = None
    large_font = 20  # get better resolution with larger size
    for font_filename in COMMON_MONO_FONT_FILENAMES:
        try:
            font = ImageFont.truetype(font_filename, size=large_font)
            print(f'Using font "{font_filename}".')
            break
        except OSError:
            print(f'Could not load font "{font_filename}".')
    if font is None:
        font = ImageFont.load_default()
        print("Using default font.")

    def _font_points_to_pixels(pt):
        return round(pt * 96.0 / 72)

    margin_pixels = 20

    # NOTE(review): font.getsize() is unavailable in recent Pillow releases -
    # confirm the pinned Pillow version before reusing this code.
    # default="" keeps max() from raising ValueError on an empty file.
    tallest_line = max(
        lines, key=lambda line: font.getsize(line)[PIL_HEIGHT_INDEX], default=""
    )
    max_line_height = _font_points_to_pixels(
        font.getsize(tallest_line)[PIL_HEIGHT_INDEX]
    )
    # Lines are packed slightly tighter than the measured height.
    realistic_line_height = max_line_height * 0.8
    image_height = int(ceil(realistic_line_height * len(lines) + 2 * margin_pixels))

    widest_line = max(
        lines, key=lambda s: font.getsize(s)[PIL_WIDTH_INDEX], default=""
    )
    max_line_width = _font_points_to_pixels(font.getsize(widest_line)[PIL_WIDTH_INDEX])
    image_width = int(ceil(max_line_width + (2 * margin_pixels)))

    # Draw the background.
    background_color = 255  # white
    image = Image.new(
        PIL_GRAYSCALE, (image_width, image_height), color=background_color
    )
    draw = ImageDraw.Draw(image)

    # Draw each line of text.
    font_color = 0
    horizontal_position = margin_pixels
    for i, line in enumerate(lines):
        vertical_position = int(round(margin_pixels + (i * realistic_line_height)))
        draw.text(
            (horizontal_position, vertical_position), line, fill=font_color, font=font
        )
    return image
def create_preview(file_path: str) -> str:
    """Return a JPEG preview for *file_path*, or "" when none can be made.

    The shared ``PreviewManager`` is created on the first call and reused.
    """
    global manager
    # TODO: add text image generation/code image
    if not manager:
        manager = PreviewManager(cache_path, create_folder=True)

    if not manager.has_jpeg_preview(file_path):
        return ""
    return manager.get_jpeg_preview(file_path, height=500)
def get_file_mimetype(file_path: str) -> str:
    """Return the MIME type that libmagic reports for the file at *file_path*."""
    detector = magic.Magic(mime=True)
    mimetype = detector.from_file(file_path)
    return mimetype
def get_description(file_path: str) -> str:
    """Return the text preview for *file_path*, or "" when none is available.

    The shared ``PreviewManager`` is created on the first call and reused.
    """
    global manager
    if not manager:
        manager = PreviewManager(cache_path, create_folder=True)

    has_text = manager.has_text_preview(file_path)
    return manager.get_text_preview(file_path) if has_text else ""
def get_base_meta(file: File):
preview = file.preview.url if file.preview else ""
description = file.description if file.description else ""

View File

@ -11,12 +11,6 @@
from akarpov.files.models import File
from ..documents import FileDocument
from .lema import lemmatize_and_remove_stopwords
"""
Calculus on types of searches:
https://new.akarpov.ru/files/FZUTFBIyfbdlDHVzxUNU
"""
class BaseSearch:
@ -140,23 +134,20 @@ class SimilaritySearch(BaseSearch):
def search(self, query: str) -> QuerySet[File]:
if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search")
language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
queryset = (
self.queryset.annotate(
name_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("name"), filtered_query),
TrigramSimilarity(UnaccentLower("name"), query),
Value(0),
output_field=FloatField(),
),
description_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("description"), filtered_query),
TrigramSimilarity(UnaccentLower("description"), query),
Value(0),
output_field=FloatField(),
),
content_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("content"), filtered_query),
TrigramSimilarity(UnaccentLower("content"), query),
Value(0),
output_field=FloatField(),
),

View File

@ -1,18 +0,0 @@
import chardet
import textract
from textract.exceptions import ExtensionNotSupported
def extract_file_text(file: str) -> str:
    """Extract the textual content of the file at path *file*.

    textract is tried first. When it does not support the file's extension,
    the file is read as plain text using the encoding guessed by chardet.

    Args:
        file: Path of the file to read.

    Returns:
        The extracted text, or ``""`` when the plain-text fallback fails.
    """
    try:
        text = textract.process(file)
    except ExtensionNotSupported:
        try:
            # Context managers make sure both handles are closed.
            with open(file, "rb") as raw:
                rawdata = raw.read()
            enc = chardet.detect(rawdata)
            with open(file, encoding=enc["encoding"]) as f:
                return f.read()
        except Exception:
            # Best effort: an unreadable or undecodable file has no text.
            return ""
    # textract hands back encoded bytes; decode so the annotated ``str``
    # return type holds (presumably UTF-8, textract's default - confirm).
    if isinstance(text, bytes):
        text = text.decode("utf8", errors="replace")
    return text

View File

@ -1,40 +1,69 @@
import os
import base64
import time
from urllib.parse import urljoin
import requests
import structlog
from celery import shared_task
from django.conf import settings
from django.core import management
from django.core.files import File
from django.core.files.base import ContentFile
from akarpov.files.models import File as FileModel
from akarpov.files.services.preview import create_preview, get_file_mimetype
from akarpov.files.services.text import extract_file_text
logger = structlog.get_logger(__name__)
def sanitize_content(content):
    """Strip NUL (0x00) characters from text or bytes.

    Values that are neither ``str`` nor ``bytes`` are returned unchanged.
    """
    if isinstance(content, bytes):
        return content.replace(b"\x00", b"")
    if isinstance(content, str):
        return content.replace("\x00", "")
    return content
@shared_task()
def process_file(pk: int):
    """Send a stored file to the external preview service and save the result.

    The file is uploaded to ``<PREVIEW_SERVICE_URL>/process_file/``. On a 200
    response the returned ``file_type`` and ``content`` (with NUL characters
    removed) are stored on the model and, when the response carries a
    base64-encoded ``preview``, it is saved as ``<name>_preview.jpg``.

    Errors are logged rather than raised, so a failing service does not fail
    the task.

    Args:
        pk: Primary key of the ``File`` model instance to process.

    Returns:
        ``pk``, or ``None`` when the service answers with a non-200 status.
    """
    file = FileModel.objects.get(pk=pk)
    if not file.name:
        file.name = file.file.name.split("/")[-1]

    # The setting defaults to None; fail with a clear message instead of an
    # invalid-URL error from the HTTP client.
    if not settings.PREVIEW_SERVICE_URL:
        logger.error(f"Error processing file {pk}: PREVIEW_SERVICE_URL is not set")
        return pk

    # (connect, read) timeouts in seconds. Without a timeout a stalled
    # service would block the worker forever.
    timeout = getattr(settings, "PREVIEW_SERVICE_TIMEOUT", (10, 300))

    try:
        api_url = urljoin(settings.PREVIEW_SERVICE_URL, "/process_file/")
        files = {"file": (file.name, file.file.open("rb"))}
        headers = {
            "X-API-Key": settings.PREVIEW_SERVICE_API_KEY,
            "Accept": "application/json",
        }
        response = requests.post(
            api_url, files=files, headers=headers, timeout=timeout
        )
        if response.status_code != 200:
            logger.error(f"Failed to process file {pk}: {response.text}")
            # Rejected by the service: nothing is saved and the task
            # yields None.
            return None

        result = response.json()
        file.file_type = result["file_type"]
        file.content = sanitize_content(result["content"])

        # The preview is optional; tolerate a response without the key.
        preview = result.get("preview")
        if preview:
            image_data = base64.b64decode(preview)
            file.preview.save(
                f"{file.name}_preview.jpg", ContentFile(image_data), save=False
            )

        file.save()
        logger.info(f"File {pk} processed successfully")
    except Exception as e:
        logger.error(f"Error processing file {pk}: {str(e)}")
    finally:
        file.file.close()
    return pk

View File

@ -6,6 +6,7 @@
from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly
from akarpov.music.api.serializers import (
AddSongToPlaylistSerializer,
AllSearchSerializer,
AnonMusicUserSerializer,
FullAlbumSerializer,
FullAuthorSerializer,
@ -19,7 +20,6 @@
ListSongSlugsSerializer,
PlaylistSerializer,
SongSerializer,
AllSearchSerializer,
)
from akarpov.music.models import (
Album,
@ -29,7 +29,7 @@
SongUserRating,
UserListenHistory,
)
from akarpov.music.services.search import search_song, search_album, search_author
from akarpov.music.services.search import search_album, search_author, search_song
from akarpov.music.tasks import listen_to_song
from akarpov.users.models import User

View File

@ -1,7 +1,7 @@
from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry
from akarpov.music.models import Song, Album, Author
from akarpov.music.models import Album, Author, Song
@registry.register_document

View File

@ -3,8 +3,8 @@
from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import Q as ES_Q
from akarpov.music.documents import SongDocument, AlbumDocument, AuthorDocument
from akarpov.music.models import Song, Author, Album
from akarpov.music.documents import AlbumDocument, AuthorDocument, SongDocument
from akarpov.music.models import Album, Author, Song
def search_song(query):

View File

@ -35,7 +35,8 @@ def album_create(sender, instance, created, **kwargs):
@receiver(post_save)
def send_que_status(sender, instance, created, **kwargs): ...
def send_que_status(sender, instance, created, **kwargs):
...
@receiver(pre_save, sender=SongUserRating)

View File

@ -4,7 +4,6 @@
import pylast
import spotipy
import structlog
import ytmusicapi
from asgiref.sync import async_to_sync
from celery import shared_task
from channels.layers import get_channel_layer
@ -12,6 +11,7 @@
from django.utils import timezone
from django.utils.timezone import now
from spotipy import SpotifyClientCredentials
from ytmusicapi import YTMusic
from akarpov.music.api.serializers import SongSerializer
from akarpov.music.models import (

View File

@ -3,6 +3,4 @@
set -o errexit
set -o nounset
/install_preview_dependencies
celery -A config.celery_app worker --autoscale 20 -l INFO

View File

@ -1,14 +0,0 @@
#!/bin/bash
# Install the system packages needed for preview generation and text
# extraction, then verify them with `preview --check-dependencies`.

apt-get update
# -y on every install: without it apt-get waits for confirmation and aborts
# when run non-interactively.
apt-get install -y wget libnotify4 scribus libappindicator3-1 libayatana-indicator3-7 libdbusmenu-glib4 libdbusmenu-gtk3-4
apt-get install -y poppler-utils libfile-mimeinfo-perl ghostscript libsecret-1-0 zlib1g-dev libjpeg-dev imagemagick libmagic1 libreoffice inkscape xvfb
apt-get install -y libxml2-dev libxslt1-dev antiword unrtf tesseract-ocr flac lame libmad0 libsox-fmt-mp3 sox swig
apt-get install -y python-dev-is-python3 libxml2-dev libxslt1-dev antiword unrtf poppler-utils tesseract-ocr \
  flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig

# draw.io is not packaged in the distribution; install the release .deb.
wget https://github.com/jgraph/drawio-desktop/releases/download/v13.0.3/draw.io-amd64-13.0.3.deb
dpkg -i draw.io-amd64-13.0.3.deb
rm draw.io-amd64-13.0.3.deb

# APT configuration keys are separated by "::".
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
  rm -rf /var/lib/apt/lists/*

preview --check-dependencies

View File

@ -758,3 +758,9 @@
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
USE_X_FORWARDED_HOST = True
USE_X_FORWARDED_PORT = True
# PREVIEW
# ------------------------------------------------------------------------------
PREVIEW_SERVICE_URL = env("PREVIEW_SERVICE_URL", default=None)
PREVIEW_SERVICE_API_KEY = env("PREVIEW_SERVICE_API_KEY", default=None)

5764
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -69,7 +69,6 @@ channels = {extras = ["daphne"], version = "^4.0.0"}
django-upload-validator = "^1.1.6"
markdown = "^3.4.4"
pydotplus = "^2.0.2"
preview-generator = "^0.29"
uuid = "^1.30"
mutagen = "^1.46.0"
pydub = "^0.25.1"
@ -100,11 +99,8 @@ pytest-mock = "^3.11.1"
pytest-asyncio = "^0.21.1"
pytest-lambda = "^2.2.0"
pgvector = "^0.2.2"
pycld2 = "^0.41"
uuid6 = "^2023.5.2"
uvicorn = "0.23.2"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
fastapi = "0.103.0"
pydantic-settings = "^2.0.3"
@ -118,9 +114,9 @@ spotdl = "^4.2.4"
fuzzywuzzy = "^0.18.0"
python-levenshtein = "^0.23.0"
pylast = "^5.2.0"
textract = {git = "https://github.com/Alexander-D-Karpov/textract.git", branch = "master"}
librosa = "^0.10.1"
django-ckeditor-5 = "^0.2.12"
chardet = "^5.2.0"
[build-system]

View File

View File

@ -1,6 +0,0 @@
from haystack import Document
from milvus_haystack import MilvusDocumentStore
ds = MilvusDocumentStore()
ds.write_documents([Document("Some Content")])
ds.get_all_documents()

2185
search/poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,18 +0,0 @@
[tool.poetry]
name = "search"
version = "0.1.0"
description = ""
authors = ["Alexander-D-Karpov <alexandr.d.karpov@gmail.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
fastapi = "0.99.1"
pydantic = "1.10.13"
transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@ -1,4 +0,0 @@
#!/bin/bash
python -m spacy download en_core_web_lg
python -m spacy download xx_sent_ud_sm
python -m spacy download ru_core_news_lg