Mirror of https://github.com/Alexander-D-Karpov/akarpov (synced 2024-11-24 02:03:49 +03:00)

Compare commits
No commits in common: "f320fa2d62237dd0a21428e34fe179ff1249ec8a" and "03c7c5309c20501781a34cce272203ad9bbe5d55" have entirely different histories.

f320fa2d62...03c7c5309c
@@ -15,5 +15,3 @@ LAST_FM_SECRET=
 SPOTIFY_ID=
 SPOTIFY_SECRET=
 YANDEX_TOKEN=
-PREVIEW_SERVICE_API_KEY=
-PREVIEW_SERVICE_URL=
.github/workflows/ci.yml (vendored, 15 lines changed)
@@ -24,10 +24,10 @@ jobs:

     steps:
       - name: Checkout Code Repository
-        uses: actions/checkout@v4.2.1
+        uses: actions/checkout@v3

       - name: Cache packages
-        uses: actions/cache@v4.1.1
+        uses: actions/cache@v3
         id: cache-packages
         with:
           path: "~/packages/"
@@ -45,18 +45,18 @@ jobs:
             sudo dpkg -L libimage-exiftool-perl libmagickwand-dev | while IFS= read -r f; do if test -f $f; then echo $f; fi; done | xargs cp --parents --target-directory ~/packages/
           fi

-      - uses: actions/checkout@v4.2.1
+      - uses: actions/checkout@v3
       - name: Install poetry
         run: pipx install poetry

-      - uses: actions/setup-python@v5.2.0
+      - uses: actions/setup-python@v5
         with:
           python-version: '3.11'
           cache: 'poetry'
       - run: poetry install

       - name: Run pre-commit
-        uses: pre-commit/action@v3.0.1
+        uses: pre-commit/action@v2.0.3

   # With no caching at all the entire ci process takes 4m 30s to complete!
   pytest:
@@ -64,10 +64,7 @@ jobs:

     steps:
       - name: Checkout Code Repository
-        uses: actions/checkout@v4.2.1
+        uses: actions/checkout@v3

-      - name: Install Docker Compose
-        run: sudo apt-get update && sudo apt-get install -y docker-compose
-
       - name: Build the Stack
         run: docker-compose -f local.yml build
@@ -28,6 +28,10 @@ $ uvicorn redirect.app:app --reload
 ```shell
 $ docker-compose -f local.yml up
 ```
+Install file preview dependencies
+```shell
+$ docker-compose -f local.yml exec django /install_preview_dependencies
+```
 - server - http://127.0.0.1:8000
 - mail - http://127.0.0.1:8025

akarpov/common/ml/__init__.py (new file, 0 lines)
akarpov/common/ml/text.py (new file, 51 lines)
@@ -0,0 +1,51 @@
+import pycld2 as cld2
+import spacy
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+# load ml classes and models on first request
+# TODO: move to outer server/service
+nlp = None
+ru_nlp = None
+
+ru_model = None
+ru_tokenizer = None
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def get_text_embedding(text: str):
+    global nlp, ru_nlp, ru_model, ru_tokenizer
+
+    is_reliable, text_bytes_found, details = cld2.detect(text)
+    if is_reliable:
+        lang = details[0]
+        if lang[1] in ["ru", "en"]:
+            lang = lang[1]
+        else:
+            return None
+    else:
+        return None
+
+    if lang == "ru":
+        if not ru_nlp:
+            ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
+        lema = " ".join([token.lemma_ for token in ru_nlp(text)])
+        if not ru_model:
+            ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
+        if not ru_tokenizer:
+            ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
+        encodings = ru_tokenizer(
+            lema,  # the texts to be tokenized
+            padding=True,  # pad the texts to the maximum length (so that all outputs have the same length)
+            return_tensors="pt",  # return the tensors (not lists)
+        )
+        with torch.no_grad():
+            # get the model embeddings
+            embeds = ru_model(**encodings)
+        embeds = embeds[0]
+    elif lang == "en":
+        embeds = None
+    else:
+        embeds = None
+
+    return embeds
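For context, a minimal usage sketch of the new embedding helper (not part of the diff; it assumes pycld2, spaCy with the ru_core_news_md model, transformers, and torch are installed):

```python
# Hypothetical usage of akarpov.common.ml.text.get_text_embedding (illustration only).
from akarpov.common.ml.text import get_text_embedding

# Russian text is lemmatized with spaCy, then embedded with DeepPavlov/rubert-base-cased.
vec = get_text_embedding("Пример текста для поиска")
if vec is not None:
    print(vec.shape)  # last_hidden_state; (1, seq_len, 768) for rubert-base-cased
else:
    # None is returned for unreliable detection and unsupported languages
    # (and, in this version, effectively for English as well)
    print("no embedding")
```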
@@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = file.content.replace("\t", " ")
+    text = (
+        textract.process(file.file.path, extension="doc", output_encoding="utf8")
+        .decode("utf8")
+        .replace("\t", " ")
+    )
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content
@@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = file.content.replace("\t", " ")
+    text = (
+        textract.process(file.file.path, extension="docx", output_encoding="utf8")
+        .decode("utf8")
+        .replace("\t", " ")
+    )
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content
@@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
     static = ""
     content = ""
-    text = file.content.replace("\t", " ")
+    text = (
+        textract.process(file.file.path, extension="odt", output_encoding="utf8")
+        .decode("utf8")
+        .replace("\t", " ")
+    )
     for line in text.split("\n"):
         content += f"<p class='mt-1'>{line}</p>"
     return static, content
@@ -1,3 +1,5 @@
+import textract
+
 from akarpov.files.models import File

@@ -5,7 +7,11 @@ def view(file: File) -> (str, str):
     static = f"""
     <meta property="og:title" content="{file.name}" />
     """
-    text = file.content.replace("\t", " ")
+    text = (
+        textract.process(file.file.path, extension="ogg", output_encoding="utf8")
+        .decode("utf8")
+        .replace("\t", " ")
+    )
     content = (
         """
     <div id="waveform">
akarpov/files/services/lema.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from pymorphy3 import MorphAnalyzer
+
+# Set up stop words
+english_stopwords = set(stopwords.words("english"))
+russian_stopwords = set(stopwords.words("russian"))
+
+# Set up lemmatizers
+english_lemmatizer = None
+russian_lemmatizer = None
+
+
+def lemmatize_and_remove_stopwords(text, language="english"):
+    # Tokenize the text
+    global english_lemmatizer, russian_lemmatizer
+    tokens = word_tokenize(text)
+
+    # Lemmatize each token based on the specified language
+    lemmatized_tokens = []
+    for token in tokens:
+        if language == "russian":
+            if not russian_lemmatizer:
+                russian_lemmatizer = MorphAnalyzer()
+            lemmatized_token = russian_lemmatizer.parse(token)[0].normal_form
+        else:  # Default to English
+            if not english_lemmatizer:
+                english_lemmatizer = WordNetLemmatizer()
+            lemmatized_token = english_lemmatizer.lemmatize(token)
+        lemmatized_tokens.append(lemmatized_token)
+
+    # Remove stop words
+    filtered_tokens = [
+        token
+        for token in lemmatized_tokens
+        if token not in english_stopwords and token not in russian_stopwords
+    ]
+
+    # Reconstruct the text
+    filtered_text = " ".join(filtered_tokens)
+    return filtered_text
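A minimal usage sketch of the new lemmatization helper (not part of the diff; it needs the NLTK data that the production Dockerfile below now downloads with `nltk.downloader punkt stopwords wordnet`):

```python
# Hypothetical usage of akarpov.files.services.lema (illustration only).
from akarpov.files.services.lema import lemmatize_and_remove_stopwords

# Lowercase English input: "the" and "are" are stop words, "cats" lemmatizes to "cat".
print(lemmatize_and_remove_stopwords("the cats are running", language="english"))
# -> "cat running"

# Russian input goes through pymorphy3 instead of WordNet.
print(lemmatize_and_remove_stopwords("коты бегали по дворам", language="russian"))
```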
@@ -1,4 +1,8 @@
+from math import ceil
+
 import magic
+from PIL import Image, ImageDraw, ImageFont
+from preview_generator.manager import PreviewManager

 from akarpov.files.models import File

@@ -15,11 +19,90 @@
 manager = None


+def textfile_to_image(textfile_path) -> Image:
+    """Convert text file to a grayscale image.
+
+    arguments:
+    textfile_path - the content of this file will be converted to an image
+    font_path - path to a font file (for example impact.ttf)
+    """
+    # parse the file into lines stripped of whitespace on the right side
+    with open(textfile_path) as f:
+        lines = tuple(line.rstrip() for line in f.readlines())
+
+    font: ImageFont = None
+    large_font = 20  # get better resolution with larger size
+    for font_filename in COMMON_MONO_FONT_FILENAMES:
+        try:
+            font = ImageFont.truetype(font_filename, size=large_font)
+            print(f'Using font "{font_filename}".')
+            break
+        except OSError:
+            print(f'Could not load font "{font_filename}".')
+    if font is None:
+        font = ImageFont.load_default()
+        print("Using default font.")
+
+    def _font_points_to_pixels(pt):
+        return round(pt * 96.0 / 72)
+
+    margin_pixels = 20
+
+    # height of the background image
+    tallest_line = max(lines, key=lambda line: font.getsize(line)[PIL_HEIGHT_INDEX])
+    max_line_height = _font_points_to_pixels(
+        font.getsize(tallest_line)[PIL_HEIGHT_INDEX]
+    )
+    realistic_line_height = max_line_height * 0.8
+    image_height = int(ceil(realistic_line_height * len(lines) + 2 * margin_pixels))
+
+    widest_line = max(lines, key=lambda s: font.getsize(s)[PIL_WIDTH_INDEX])
+    max_line_width = _font_points_to_pixels(font.getsize(widest_line)[PIL_WIDTH_INDEX])
+    image_width = int(ceil(max_line_width + (2 * margin_pixels)))
+
+    # draw the background
+    background_color = 255  # white
+    image = Image.new(
+        PIL_GRAYSCALE, (image_width, image_height), color=background_color
+    )
+    draw = ImageDraw.Draw(image)
+
+    font_color = 0
+    horizontal_position = margin_pixels
+    for i, line in enumerate(lines):
+        vertical_position = int(round(margin_pixels + (i * realistic_line_height)))
+        draw.text(
+            (horizontal_position, vertical_position), line, fill=font_color, font=font
+        )
+
+    return image
+
+
+def create_preview(file_path: str) -> str:
+    global manager
+    # TODO: add text image generation/code image
+    if not manager:
+        manager = PreviewManager(cache_path, create_folder=True)
+    if manager.has_jpeg_preview(file_path):
+        return manager.get_jpeg_preview(file_path, height=500)
+    return ""
+
+
 def get_file_mimetype(file_path: str) -> str:
     mime = magic.Magic(mime=True)
     return mime.from_file(file_path)
+
+
+def get_description(file_path: str) -> str:
+    global manager
+    if not manager:
+        manager = PreviewManager(cache_path, create_folder=True)
+
+    if manager.has_text_preview(file_path):
+        return manager.get_text_preview(file_path)
+    return ""


 def get_base_meta(file: File):
     preview = file.preview.url if file.preview else ""
     description = file.description if file.description else ""
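A minimal usage sketch of the local preview helpers this hunk restores (not part of the diff; `cache_path`, `PIL_GRAYSCALE`, `PIL_WIDTH_INDEX`, `PIL_HEIGHT_INDEX`, and `COMMON_MONO_FONT_FILENAMES` live in the unchanged part of the module):

```python
# Hypothetical usage of akarpov.files.services.preview (illustration only).
from akarpov.files.services.preview import create_preview, get_file_mimetype

path = "/tmp/example.pdf"  # placeholder: any file type preview-generator supports
print(get_file_mimetype(path))   # e.g. "application/pdf", via libmagic
jpeg_path = create_preview(path)  # returns "" when no JPEG preview is available
if jpeg_path:
    print("preview written to", jpeg_path)
```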
@@ -11,6 +11,12 @@
 from akarpov.files.models import File

 from ..documents import FileDocument
+from .lema import lemmatize_and_remove_stopwords
+
+"""
+Calculus on types of searches:
+https://new.akarpov.ru/files/FZUTFBIyfbdlDHVzxUNU
+"""


 class BaseSearch:
@@ -134,20 +140,23 @@ class SimilaritySearch(BaseSearch):
     def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")

+        language = "russian" if re.search("[а-яА-Я]", query) else "english"
+        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
+
         queryset = (
             self.queryset.annotate(
                 name_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("name"), query),
+                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
                     Value(0),
                     output_field=FloatField(),
                 ),
                 description_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("description"), query),
+                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
                     Value(0),
                     output_field=FloatField(),
                 ),
                 content_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("content"), query),
+                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
                     Value(0),
                     output_field=FloatField(),
                 ),
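TrigramSimilarity needs the pg_trgm Postgres extension, and UnaccentLower presumably relies on unaccent. A hedged migration sketch, only needed if the project does not already enable them (the dependency name below is a placeholder):

```python
# Hypothetical migration enabling the extensions this search path relies on.
from django.contrib.postgres.operations import TrigramExtension, UnaccentExtension
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [("files", "0001_initial")]  # placeholder dependency

    operations = [TrigramExtension(), UnaccentExtension()]
```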
akarpov/files/services/text.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+import chardet
+import textract
+from textract.exceptions import ExtensionNotSupported
+
+
+def extract_file_text(file: str) -> str:
+    try:
+        text = textract.process(file)
+    except ExtensionNotSupported:
+        try:
+            rawdata = open(file, "rb").read()
+            enc = chardet.detect(rawdata)
+            with open(file, encoding=enc["encoding"]) as f:
+                text = f.read()
+        except Exception:
+            return ""
+
+    return text
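A minimal usage sketch of the extraction helper (not part of the diff; note that the textract branch returns bytes while the chardet fallback returns str, so callers may want to normalize):

```python
# Hypothetical usage of akarpov.files.services.text.extract_file_text (illustration only).
from akarpov.files.services.text import extract_file_text

# textract handles known document formats; unknown extensions fall back to
# chardet-guessed plain-text decoding, and "" is returned if both fail.
content = extract_file_text("/tmp/report.docx")  # placeholder path
print(content if content else "no text extracted")
```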
@@ -1,69 +1,40 @@
-import base64
+import os
 import time
-from urllib.parse import urljoin

-import requests
 import structlog
 from celery import shared_task
-from django.conf import settings
 from django.core import management
-from django.core.files.base import ContentFile
+from django.core.files import File

 from akarpov.files.models import File as FileModel
+from akarpov.files.services.preview import create_preview, get_file_mimetype
+from akarpov.files.services.text import extract_file_text

 logger = structlog.get_logger(__name__)


-def sanitize_content(content):
-    """Remove NUL (0x00) characters from the content."""
-    if isinstance(content, str):
-        return content.replace("\x00", "")
-    elif isinstance(content, bytes):
-        return content.replace(b"\x00", b"")
-    return content
-
-
 @shared_task()
 def process_file(pk: int):
+    pth = None
     file = FileModel.objects.get(pk=pk)
     if not file.name:
         file.name = file.file.name.split("/")[-1]

     try:
-        api_url = urljoin(settings.PREVIEW_SERVICE_URL, "/process_file/")
-        files = {"file": (file.name, file.file.open("rb"))}
-        headers = {
-            "X-API-Key": settings.PREVIEW_SERVICE_API_KEY,
-            "Accept": "application/json",
-        }
-
-        response = requests.post(api_url, files=files, headers=headers)
-
-        if response.status_code != 200:
-            logger.error(f"Failed to process file {pk}: {response.text}")
-            return
-
-        result = response.json()
-
-        file.file_type = result["file_type"]
-        file.content = sanitize_content(result["content"])
-
-        if result["preview"]:
-            image_data = base64.b64decode(result["preview"])
+        pth = create_preview(file.file.path)
+        if pth:
+            with open(pth, "rb") as f:
                 file.preview.save(
-                f"{file.name}_preview.jpg", ContentFile(image_data), save=False
+                    pth.split("/")[-1],
+                    File(f),
+                    save=False,
                 )
-
-        file.save()
-
-        logger.info(f"File {pk} processed successfully")
-
     except Exception as e:
-        logger.error(f"Error processing file {pk}: {str(e)}")
-    finally:
-        file.file.close()
+        logger.error(e)
+    file.file_type = get_file_mimetype(file.file.path)
+    file.content = extract_file_text(file.file.path)
+    file.save(update_fields=["preview", "name", "file_type", "content"])
+    if pth and os.path.isfile(pth):
+        os.remove(pth)
     return pk
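The reworked task runs preview generation, mimetype detection, and text extraction in-process instead of calling the external preview service. A hedged sketch of enqueueing it (the tasks module path is assumed from the imports above):

```python
# Hypothetical call site (illustration only).
from akarpov.files.models import File
from akarpov.files.tasks import process_file  # module path assumed

file = File.objects.latest("id")
process_file.delay(file.pk)  # preview, mimetype and text extraction now run locally
```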
@@ -6,7 +6,6 @@
 from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly
 from akarpov.music.api.serializers import (
     AddSongToPlaylistSerializer,
-    AllSearchSerializer,
     AnonMusicUserSerializer,
     FullAlbumSerializer,
     FullAuthorSerializer,
@@ -20,6 +19,7 @@
     ListSongSlugsSerializer,
     PlaylistSerializer,
     SongSerializer,
+    AllSearchSerializer,
 )
 from akarpov.music.models import (
     Album,
@@ -29,7 +29,7 @@
     SongUserRating,
     UserListenHistory,
 )
-from akarpov.music.services.search import search_album, search_author, search_song
+from akarpov.music.services.search import search_song, search_album, search_author
 from akarpov.music.tasks import listen_to_song
 from akarpov.users.models import User
@@ -1,7 +1,7 @@
 from django_elasticsearch_dsl import Document, fields
 from django_elasticsearch_dsl.registries import registry

-from akarpov.music.models import Album, Author, Song
+from akarpov.music.models import Song, Album, Author


 @registry.register_document
@@ -3,8 +3,8 @@
 from django_elasticsearch_dsl.registries import registry
 from elasticsearch_dsl import Q as ES_Q

-from akarpov.music.documents import AlbumDocument, AuthorDocument, SongDocument
-from akarpov.music.models import Album, Author, Song
+from akarpov.music.documents import SongDocument, AlbumDocument, AuthorDocument
+from akarpov.music.models import Song, Author, Album


 def search_song(query):
@@ -35,8 +35,7 @@ def album_create(sender, instance, created, **kwargs):


 @receiver(post_save)
-def send_que_status(sender, instance, created, **kwargs):
-    ...
+def send_que_status(sender, instance, created, **kwargs): ...


 @receiver(pre_save, sender=SongUserRating)
@@ -4,6 +4,7 @@
 import pylast
 import spotipy
 import structlog
+import ytmusicapi
 from asgiref.sync import async_to_sync
 from celery import shared_task
 from channels.layers import get_channel_layer
@@ -11,7 +12,6 @@
 from django.utils import timezone
 from django.utils.timezone import now
 from spotipy import SpotifyClientCredentials
-from ytmusicapi import YTMusic

 from akarpov.music.api.serializers import SongSerializer
 from akarpov.music.models import (
@@ -28,6 +28,8 @@ RUN apt-get update && \
     apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev && \
     # Dependencies for file preview generation
     apt-get install -y webp git libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \
+    # ML dependencies \
+    # none for now
     apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
     rm -rf /var/lib/apt/lists/*

@@ -46,6 +48,7 @@ RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -

 COPY . .
 RUN poetry build && /venv/bin/pip install dist/*.whl
+RUN /venv/bin/python -m nltk.downloader punkt stopwords wordnet


 COPY ./compose/production/django/entrypoint /entrypoint

@@ -68,6 +71,10 @@ COPY ./compose/local/django/start-redirect /start-redirect
 RUN sed -i 's/\r$//g' /start-redirect
 RUN chmod +x /start-redirect

+COPY ./compose/local/django/install_preview_dependencies /install_preview_dependencies
+RUN sed -i 's/\r$//g' /install_preview_dependencies
+RUN chmod +x /install_preview_dependencies
+
 COPY ./compose/local/django/celery/worker/start /start-celeryworker
 RUN sed -i 's/\r$//g' /start-celeryworker
 RUN chmod +x /start-celeryworker
@@ -3,4 +3,6 @@
 set -o errexit
 set -o nounset

+/install_preview_dependencies
+
 celery -A config.celery_app worker --autoscale 20 -l INFO
compose/local/django/install_preview_dependencies (new file, 14 lines)
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+apt-get update
+apt-get install wget libnotify4 scribus libappindicator3-1 libayatana-indicator3-7 libdbusmenu-glib4 libdbusmenu-gtk3-4
+apt-get install -y poppler-utils libfile-mimeinfo-perl ghostscript libsecret-1-0 zlib1g-dev libjpeg-dev imagemagick libmagic1 libreoffice inkscape xvfb
+apt-get install -y libxml2-dev libxslt1-dev antiword unrtf tesseract-ocr flac lame libmad0 libsox-fmt-mp3 sox swig
+apt-get install -y python-dev-is-python3 libxml2-dev libxslt1-dev antiword unrtf poppler-utils tesseract-ocr \
+    flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
+wget https://github.com/jgraph/drawio-desktop/releases/download/v13.0.3/draw.io-amd64-13.0.3.deb
+dpkg -i draw.io-amd64-13.0.3.deb
+rm draw.io-amd64-13.0.3.deb
+apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
+    rm -rf /var/lib/apt/lists/*
+preview --check-dependencies
@@ -758,9 +758,3 @@
 SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
 USE_X_FORWARDED_HOST = True
 USE_X_FORWARDED_PORT = True
-
-
-# PREVIEW
-# ------------------------------------------------------------------------------
-PREVIEW_SERVICE_URL = env("PREVIEW_SERVICE_URL", default=None)
-PREVIEW_SERVICE_API_KEY = env("PREVIEW_SERVICE_API_KEY", default=None)
poetry.lock (generated, 5764 lines changed): diff suppressed because it is too large
@@ -69,6 +69,7 @@ channels = {extras = ["daphne"], version = "^4.0.0"}
 django-upload-validator = "^1.1.6"
 markdown = "^3.4.4"
 pydotplus = "^2.0.2"
+preview-generator = "^0.29"
 uuid = "^1.30"
 mutagen = "^1.46.0"
 pydub = "^0.25.1"
@@ -99,8 +100,11 @@ pytest-mock = "^3.11.1"
 pytest-asyncio = "^0.21.1"
 pytest-lambda = "^2.2.0"
 pgvector = "^0.2.2"
+pycld2 = "^0.41"
 uuid6 = "^2023.5.2"
 uvicorn = "0.23.2"
+nltk = "^3.8.1"
+pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "0.103.0"
 pydantic-settings = "^2.0.3"
@@ -114,9 +118,9 @@ spotdl = "^4.2.4"
 fuzzywuzzy = "^0.18.0"
 python-levenshtein = "^0.23.0"
 pylast = "^5.2.0"
+textract = {git = "https://github.com/Alexander-D-Karpov/textract.git", branch = "master"}
 librosa = "^0.10.1"
 django-ckeditor-5 = "^0.2.12"
-chardet = "^5.2.0"


 [build-system]
search/__init__.py (new file, 0 lines)
search/pipeline.py (new file, 6 lines)
@@ -0,0 +1,6 @@
+from haystack import Document
+from milvus_haystack import MilvusDocumentStore
+
+ds = MilvusDocumentStore()
+ds.write_documents([Document("Some Content")])
+ds.get_all_documents()
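MilvusDocumentStore requires a running Milvus server. A hedged in-process variant using the FAISS backend that search/pyproject.toml pulls in via farm-haystack[faiss] (not part of the diff):

```python
# Hypothetical FAISS-backed equivalent of search/pipeline.py (illustration only).
from haystack import Document
from haystack.document_stores import FAISSDocumentStore

# FAISS keeps vectors locally; metadata goes to the SQLite file given here.
ds = FAISSDocumentStore(sql_url="sqlite:///faiss_store.db")
ds.write_documents([Document(content="Some Content")])
print(ds.get_all_documents())
```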
search/poetry.lock (generated, new file, 2185 lines): diff suppressed because it is too large
search/pyproject.toml (new file, 18 lines)
@@ -0,0 +1,18 @@
+[tool.poetry]
+name = "search"
+version = "0.1.0"
+description = ""
+authors = ["Alexander-D-Karpov <alexandr.d.karpov@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+fastapi = "0.99.1"
+pydantic = "1.10.13"
+transformers = {version = "4.34.1", extras = ["torch"]}
+torch = ">=2.0.0, !=2.0.1, !=2.1.0"
+farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
spacy_setup.sh (new executable file, 4 lines)
@@ -0,0 +1,4 @@
+#!/bin/bash
+python -m spacy download en_core_web_lg
+python -m spacy download xx_sent_ud_sm
+python -m spacy download ru_core_news_lg