removed unused dependencies from the project, moved file processing to an external service

Alexander Karpov 2024-10-14 21:34:16 +03:00
parent 03c7c5309c
commit f5835d2821
27 changed files with 2761 additions and 5569 deletions

View File

@@ -15,3 +15,5 @@ LAST_FM_SECRET=
 SPOTIFY_ID=
 SPOTIFY_SECRET=
 YANDEX_TOKEN=
+PREVIEW_SERVICE_API_KEY=
+PREVIEW_SERVICE_URL=

View File

@@ -1,51 +0,0 @@
import pycld2 as cld2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer

# load ml classes and models on first request
# TODO: move to outer server/service
nlp = None
ru_nlp = None
ru_model = None
ru_tokenizer = None

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_text_embedding(text: str):
    global nlp, ru_nlp, ru_model, ru_tokenizer

    is_reliable, text_bytes_found, details = cld2.detect(text)
    if is_reliable:
        lang = details[0]
        if lang[1] in ["ru", "en"]:
            lang = lang[1]
        else:
            return None
    else:
        return None

    if lang == "ru":
        if not ru_nlp:
            ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
        lema = " ".join([token.lemma_ for token in ru_nlp(text)])
        if not ru_model:
            ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
        if not ru_tokenizer:
            ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
        encodings = ru_tokenizer(
            lema,  # the texts to be tokenized
            padding=True,  # pad the texts to the maximum length (so that all outputs have the same length)
            return_tensors="pt",  # return the tensors (not lists)
        )
        with torch.no_grad():
            # get the model embeddings
            embeds = ru_model(**encodings)
        embeds = embeds[0]
    elif lang == "en":
        embeds = None
    else:
        embeds = None
    return embeds

View File

@@ -1,16 +1,10 @@
-import textract
-
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = (
-        textract.process(file.file.path, extension="doc", output_encoding="utf8")
-        .decode("utf8")
-        .replace("\t", " ")
-    )
+    text = file.content.replace("\t", " ")
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content

View File

@@ -1,16 +1,10 @@
-import textract
-
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = (
-        textract.process(file.file.path, extension="docx", output_encoding="utf8")
-        .decode("utf8")
-        .replace("\t", " ")
-    )
+    text = file.content.replace("\t", " ")
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content

View File

@@ -1,16 +1,10 @@
-import textract
-
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = (
-        textract.process(file.file.path, extension="odt", output_encoding="utf8")
-        .decode("utf8")
-        .replace("\t", " ")
-    )
+    text = file.content.replace("\t", " ")
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content

View File

@@ -1,5 +1,3 @@
-import textract
-
 from akarpov.files.models import File

@@ -7,11 +5,7 @@ def view(file: File) -> (str, str):
     static = f"""
        <meta property="og:title" content="{file.name}" />
    """
-    text = (
-        textract.process(file.file.path, extension="ogg", output_encoding="utf8")
-        .decode("utf8")
-        .replace("\t", " ")
-    )
+    text = file.content.replace("\t", " ")
     content = (
         """
    <div id="waveform">

View File

@@ -1,42 +0,0 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer

# Set up stop words
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))

# Set up lemmatizers
english_lemmatizer = None
russian_lemmatizer = None


def lemmatize_and_remove_stopwords(text, language="english"):
    # Tokenize the text
    global english_lemmatizer, russian_lemmatizer
    tokens = word_tokenize(text)

    # Lemmatize each token based on the specified language
    lemmatized_tokens = []
    for token in tokens:
        if language == "russian":
            if not russian_lemmatizer:
                russian_lemmatizer = MorphAnalyzer()
            lemmatized_token = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            if not english_lemmatizer:
                english_lemmatizer = WordNetLemmatizer()
            lemmatized_token = english_lemmatizer.lemmatize(token)
        lemmatized_tokens.append(lemmatized_token)

    # Remove stop words
    filtered_tokens = [
        token
        for token in lemmatized_tokens
        if token not in english_stopwords and token not in russian_stopwords
    ]

    # Reconstruct the text
    filtered_text = " ".join(filtered_tokens)
    return filtered_text

View File

@@ -1,8 +1,4 @@
-from math import ceil
-
 import magic
-from PIL import Image, ImageDraw, ImageFont
-from preview_generator.manager import PreviewManager

 from akarpov.files.models import File

@@ -19,90 +15,11 @@
 manager = None


-def textfile_to_image(textfile_path) -> Image:
-    """Convert text file to a grayscale image.
-
-    arguments:
-    textfile_path - the content of this file will be converted to an image
-    font_path - path to a font file (for example impact.ttf)
-    """
-    # parse the file into lines stripped of whitespace on the right side
-    with open(textfile_path) as f:
-        lines = tuple(line.rstrip() for line in f.readlines())
-
-    font: ImageFont = None
-    large_font = 20  # get better resolution with larger size
-    for font_filename in COMMON_MONO_FONT_FILENAMES:
-        try:
-            font = ImageFont.truetype(font_filename, size=large_font)
-            print(f'Using font "{font_filename}".')
-            break
-        except OSError:
-            print(f'Could not load font "{font_filename}".')
-    if font is None:
-        font = ImageFont.load_default()
-        print("Using default font.")
-
-    def _font_points_to_pixels(pt):
-        return round(pt * 96.0 / 72)
-
-    margin_pixels = 20
-
-    # height of the background image
-    tallest_line = max(lines, key=lambda line: font.getsize(line)[PIL_HEIGHT_INDEX])
-    max_line_height = _font_points_to_pixels(
-        font.getsize(tallest_line)[PIL_HEIGHT_INDEX]
-    )
-    realistic_line_height = max_line_height * 0.8
-    image_height = int(ceil(realistic_line_height * len(lines) + 2 * margin_pixels))
-
-    widest_line = max(lines, key=lambda s: font.getsize(s)[PIL_WIDTH_INDEX])
-    max_line_width = _font_points_to_pixels(font.getsize(widest_line)[PIL_WIDTH_INDEX])
-    image_width = int(ceil(max_line_width + (2 * margin_pixels)))
-
-    # draw the background
-    background_color = 255  # white
-    image = Image.new(
-        PIL_GRAYSCALE, (image_width, image_height), color=background_color
-    )
-    draw = ImageDraw.Draw(image)
-    font_color = 0
-    horizontal_position = margin_pixels
-    for i, line in enumerate(lines):
-        vertical_position = int(round(margin_pixels + (i * realistic_line_height)))
-        draw.text(
-            (horizontal_position, vertical_position), line, fill=font_color, font=font
-        )
-
-    return image
-
-
-def create_preview(file_path: str) -> str:
-    global manager
-    # TODO: add text image generation/code image
-    if not manager:
-        manager = PreviewManager(cache_path, create_folder=True)
-    if manager.has_jpeg_preview(file_path):
-        return manager.get_jpeg_preview(file_path, height=500)
-    return ""
-
-
 def get_file_mimetype(file_path: str) -> str:
     mime = magic.Magic(mime=True)
     return mime.from_file(file_path)


-def get_description(file_path: str) -> str:
-    global manager
-    if not manager:
-        manager = PreviewManager(cache_path, create_folder=True)
-    if manager.has_text_preview(file_path):
-        return manager.get_text_preview(file_path)
-    return ""
-
-
 def get_base_meta(file: File):
     preview = file.preview.url if file.preview else ""
     description = file.description if file.description else ""

View File

@@ -11,12 +11,6 @@
 from akarpov.files.models import File

 from ..documents import FileDocument
-from .lema import lemmatize_and_remove_stopwords
-
-"""
-Calculus on types of searches:
-https://new.akarpov.ru/files/FZUTFBIyfbdlDHVzxUNU
-"""


 class BaseSearch:
@@ -140,23 +134,20 @@ class SimilaritySearch(BaseSearch):
     def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")
-        language = "russian" if re.search("[а-яА-Я]", query) else "english"
-        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
         queryset = (
             self.queryset.annotate(
                 name_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
+                    TrigramSimilarity(UnaccentLower("name"), query),
                     Value(0),
                     output_field=FloatField(),
                 ),
                 description_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
+                    TrigramSimilarity(UnaccentLower("description"), query),
                     Value(0),
                     output_field=FloatField(),
                 ),
                 content_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
+                    TrigramSimilarity(UnaccentLower("content"), query),
                     Value(0),
                     output_field=FloatField(),
                 ),
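
The net effect of dropping the lemmatization step is that trigram similarity now runs against the raw query string. A minimal sketch of the resulting behavior, assuming the model import path from this diff (the cutoff value is illustrative, not taken from this commit):

from django.contrib.postgres.search import TrigramSimilarity

from akarpov.files.models import File


def quick_file_search(query: str):
    # Trigram similarity against the raw query, with no lemmatization step.
    return (
        File.objects.annotate(similarity=TrigramSimilarity("name", query))
        .filter(similarity__gt=0.1)  # assumed cutoff, not from this commit
        .order_by("-similarity")
    )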

View File

@@ -1,18 +0,0 @@
import chardet
import textract
from textract.exceptions import ExtensionNotSupported


def extract_file_text(file: str) -> str:
    try:
        text = textract.process(file)
    except ExtensionNotSupported:
        try:
            rawdata = open(file, "rb").read()
            enc = chardet.detect(rawdata)
            with open(file, encoding=enc["encoding"]) as f:
                text = f.read()
        except Exception:
            return ""
    return text

View File

@@ -1,40 +1,69 @@
-import os
+import base64
 import time
+from urllib.parse import urljoin

+import requests
 import structlog
 from celery import shared_task
+from django.conf import settings
 from django.core import management
-from django.core.files import File
+from django.core.files.base import ContentFile

 from akarpov.files.models import File as FileModel
-from akarpov.files.services.preview import create_preview, get_file_mimetype
-from akarpov.files.services.text import extract_file_text

 logger = structlog.get_logger(__name__)


+def sanitize_content(content):
+    """Remove NUL (0x00) characters from the content."""
+    if isinstance(content, str):
+        return content.replace("\x00", "")
+    elif isinstance(content, bytes):
+        return content.replace(b"\x00", b"")
+    return content
+
+
 @shared_task()
 def process_file(pk: int):
-    pth = None
     file = FileModel.objects.get(pk=pk)
     if not file.name:
         file.name = file.file.name.split("/")[-1]
     try:
-        pth = create_preview(file.file.path)
-        if pth:
-            with open(pth, "rb") as f:
-                file.preview.save(
-                    pth.split("/")[-1],
-                    File(f),
-                    save=False,
-                )
+        api_url = urljoin(settings.PREVIEW_SERVICE_URL, "/process_file/")
+
+        files = {"file": (file.name, file.file.open("rb"))}
+        headers = {
+            "X-API-Key": settings.PREVIEW_SERVICE_API_KEY,
+            "Accept": "application/json",
+        }
+        response = requests.post(api_url, files=files, headers=headers)
+
+        if response.status_code != 200:
+            logger.error(f"Failed to process file {pk}: {response.text}")
+            return
+
+        result = response.json()
+
+        file.file_type = result["file_type"]
+        file.content = sanitize_content(result["content"])
+
+        if result["preview"]:
+            image_data = base64.b64decode(result["preview"])
+            file.preview.save(
+                f"{file.name}_preview.jpg", ContentFile(image_data), save=False
+            )
+
+        file.save()
+        logger.info(f"File {pk} processed successfully")
     except Exception as e:
-        logger.error(e)
-    file.file_type = get_file_mimetype(file.file.path)
-    file.content = extract_file_text(file.file.path)
-    file.save(update_fields=["preview", "name", "file_type", "content"])
-    if pth and os.path.isfile(pth):
-        os.remove(pth)
+        logger.error(f"Error processing file {pk}: {str(e)}")
+    finally:
+        file.file.close()
     return pk
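
For context, the rewritten task assumes an external endpoint with a specific contract: a multipart "file" upload authenticated by an X-API-Key header, answered with JSON fields file_type, content, and an optional base64-encoded preview. The service itself is not part of this commit; a minimal FastAPI sketch of that assumed contract could look like this (everything beyond the contract is illustrative):

import magic
from fastapi import FastAPI, File, Header, HTTPException, UploadFile

app = FastAPI()
EXPECTED_API_KEY = "change-me"  # assumed to mirror PREVIEW_SERVICE_API_KEY


@app.post("/process_file/")
async def process_file(
    file: UploadFile = File(...),
    x_api_key: str | None = Header(default=None),
):
    if x_api_key != EXPECTED_API_KEY:
        raise HTTPException(status_code=403, detail="invalid API key")
    data = await file.read()
    return {
        "file_type": magic.Magic(mime=True).from_buffer(data),
        "content": "",  # extracted text would be produced here
        "preview": None,  # or a base64-encoded JPEG preview string
    }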

View File

@@ -6,6 +6,7 @@
 from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly
 from akarpov.music.api.serializers import (
     AddSongToPlaylistSerializer,
+    AllSearchSerializer,
     AnonMusicUserSerializer,
     FullAlbumSerializer,
     FullAuthorSerializer,
@@ -19,7 +20,6 @@
     ListSongSlugsSerializer,
     PlaylistSerializer,
     SongSerializer,
-    AllSearchSerializer,
 )
 from akarpov.music.models import (
     Album,
@@ -29,7 +29,7 @@
     SongUserRating,
     UserListenHistory,
 )
-from akarpov.music.services.search import search_song, search_album, search_author
+from akarpov.music.services.search import search_album, search_author, search_song
 from akarpov.music.tasks import listen_to_song
 from akarpov.users.models import User

View File

@@ -1,7 +1,7 @@
 from django_elasticsearch_dsl import Document, fields
 from django_elasticsearch_dsl.registries import registry

-from akarpov.music.models import Song, Album, Author
+from akarpov.music.models import Album, Author, Song


 @registry.register_document

View File

@@ -3,8 +3,8 @@
 from django_elasticsearch_dsl.registries import registry
 from elasticsearch_dsl import Q as ES_Q

-from akarpov.music.documents import SongDocument, AlbumDocument, AuthorDocument
-from akarpov.music.models import Song, Author, Album
+from akarpov.music.documents import AlbumDocument, AuthorDocument, SongDocument
+from akarpov.music.models import Album, Author, Song


 def search_song(query):

View File

@@ -35,7 +35,8 @@ def album_create(sender, instance, created, **kwargs):
 @receiver(post_save)
-def send_que_status(sender, instance, created, **kwargs): ...
+def send_que_status(sender, instance, created, **kwargs):
+    ...


 @receiver(pre_save, sender=SongUserRating)

View File

@@ -4,7 +4,6 @@
 import pylast
 import spotipy
 import structlog
-import ytmusicapi
 from asgiref.sync import async_to_sync
 from celery import shared_task
 from channels.layers import get_channel_layer
@@ -12,6 +11,7 @@
 from django.utils import timezone
 from django.utils.timezone import now
 from spotipy import SpotifyClientCredentials
+from ytmusicapi import YTMusic

 from akarpov.music.api.serializers import SongSerializer
 from akarpov.music.models import (

View File

@@ -3,6 +3,4 @@
 set -o errexit
 set -o nounset

-/install_preview_dependencies
-
 celery -A config.celery_app worker --autoscale 20 -l INFO

View File

@@ -1,14 +0,0 @@
#!/bin/bash
apt-get update
apt-get install wget libnotify4 scribus libappindicator3-1 libayatana-indicator3-7 libdbusmenu-glib4 libdbusmenu-gtk3-4
apt-get install -y poppler-utils libfile-mimeinfo-perl ghostscript libsecret-1-0 zlib1g-dev libjpeg-dev imagemagick libmagic1 libreoffice inkscape xvfb
apt-get install -y libxml2-dev libxslt1-dev antiword unrtf tesseract-ocr flac lame libmad0 libsox-fmt-mp3 sox swig
apt-get install -y python-dev-is-python3 libxml2-dev libxslt1-dev antiword unrtf poppler-utils tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
wget https://github.com/jgraph/drawio-desktop/releases/download/v13.0.3/draw.io-amd64-13.0.3.deb
dpkg -i draw.io-amd64-13.0.3.deb
rm draw.io-amd64-13.0.3.deb
apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
rm -rf /var/lib/apt/lists/*
preview --check-dependencies

View File

@@ -758,3 +758,9 @@
 SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
 USE_X_FORWARDED_HOST = True
 USE_X_FORWARDED_PORT = True
+
+# PREVIEW
+# ------------------------------------------------------------------------------
+PREVIEW_SERVICE_URL = env("PREVIEW_SERVICE_URL", default=None)
+PREVIEW_SERVICE_API_KEY = env("PREVIEW_SERVICE_API_KEY", default=None)
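
One detail worth noting about these settings: the Celery task builds the endpoint with urljoin, and the leading slash in "/process_file/" replaces any path component already present in PREVIEW_SERVICE_URL. A quick illustration (host name hypothetical):

from urllib.parse import urljoin

# The leading "/" discards the base URL's "/api/" path:
assert (
    urljoin("http://preview.local:8000/api/", "/process_file/")
    == "http://preview.local:8000/process_file/"
)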

poetry.lock (generated, 5764 lines changed)

File diff suppressed because it is too large

View File

@@ -69,7 +69,6 @@ channels = {extras = ["daphne"], version = "^4.0.0"}
 django-upload-validator = "^1.1.6"
 markdown = "^3.4.4"
 pydotplus = "^2.0.2"
-preview-generator = "^0.29"
 uuid = "^1.30"
 mutagen = "^1.46.0"
 pydub = "^0.25.1"
@@ -100,11 +99,8 @@ pytest-mock = "^3.11.1"
 pytest-asyncio = "^0.21.1"
 pytest-lambda = "^2.2.0"
 pgvector = "^0.2.2"
-pycld2 = "^0.41"
 uuid6 = "^2023.5.2"
 uvicorn = "0.23.2"
-nltk = "^3.8.1"
-pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "0.103.0"
 pydantic-settings = "^2.0.3"
@@ -118,9 +114,9 @@ spotdl = "^4.2.4"
 fuzzywuzzy = "^0.18.0"
 python-levenshtein = "^0.23.0"
 pylast = "^5.2.0"
-textract = {git = "https://github.com/Alexander-D-Karpov/textract.git", branch = "master"}
 librosa = "^0.10.1"
 django-ckeditor-5 = "^0.2.12"
-chardet = "^5.2.0"

 [build-system]

View File

@@ -1,6 +0,0 @@
from haystack import Document
from milvus_haystack import MilvusDocumentStore

ds = MilvusDocumentStore()
ds.write_documents([Document("Some Content")])
ds.get_all_documents()

search/poetry.lock (generated, 2185 lines changed)

File diff suppressed because it is too large

View File

@@ -1,18 +0,0 @@
[tool.poetry]
name = "search"
version = "0.1.0"
description = ""
authors = ["Alexander-D-Karpov <alexandr.d.karpov@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
fastapi = "0.99.1"
pydantic = "1.10.13"
transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View File

@@ -1,4 +0,0 @@
#!/bin/bash
python -m spacy download en_core_web_lg
python -m spacy download xx_sent_ud_sm
python -m spacy download ru_core_news_lg