Compare commits

..

No commits in common. "f320fa2d62237dd0a21428e34fe179ff1249ec8a" and "03c7c5309c20501781a34cce272203ad9bbe5d55" have entirely different histories.

30 changed files with 5586 additions and 2770 deletions

View File

@ -15,5 +15,3 @@ LAST_FM_SECRET=
SPOTIFY_ID= SPOTIFY_ID=
SPOTIFY_SECRET= SPOTIFY_SECRET=
YANDEX_TOKEN= YANDEX_TOKEN=
PREVIEW_SERVICE_API_KEY=
PREVIEW_SERVICE_URL=

View File

@ -24,10 +24,10 @@ jobs:
steps: steps:
- name: Checkout Code Repository - name: Checkout Code Repository
uses: actions/checkout@v4.2.1 uses: actions/checkout@v3
- name: Cache packages - name: Cache packages
uses: actions/cache@v4.1.1 uses: actions/cache@v3
id: cache-packages id: cache-packages
with: with:
path: "~/packages/" path: "~/packages/"
@ -45,18 +45,18 @@ jobs:
sudo dpkg -L libimage-exiftool-perl libmagickwand-dev | while IFS= read -r f; do if test -f $f; then echo $f; fi; done | xargs cp --parents --target-directory ~/packages/ sudo dpkg -L libimage-exiftool-perl libmagickwand-dev | while IFS= read -r f; do if test -f $f; then echo $f; fi; done | xargs cp --parents --target-directory ~/packages/
fi fi
- uses: actions/checkout@v4.2.1 - uses: actions/checkout@v3
- name: Install poetry - name: Install poetry
run: pipx install poetry run: pipx install poetry
- uses: actions/setup-python@v5.2.0 - uses: actions/setup-python@v5
with: with:
python-version: '3.11' python-version: '3.11'
cache: 'poetry' cache: 'poetry'
- run: poetry install - run: poetry install
- name: Run pre-commit - name: Run pre-commit
uses: pre-commit/action@v3.0.1 uses: pre-commit/action@v2.0.3
# With no caching at all the entire ci process takes 4m 30s to complete! # With no caching at all the entire ci process takes 4m 30s to complete!
pytest: pytest:
@ -64,10 +64,7 @@ jobs:
steps: steps:
- name: Checkout Code Repository - name: Checkout Code Repository
uses: actions/checkout@v4.2.1 uses: actions/checkout@v3
- name: Install Docker Compose
run: sudo apt-get update && sudo apt-get install -y docker-compose
- name: Build the Stack - name: Build the Stack
run: docker-compose -f local.yml build run: docker-compose -f local.yml build

View File

@ -28,6 +28,10 @@ $ uvicorn redirect.app:app --reload
```shell ```shell
$ docker-compose -f local.yml up $ docker-compose -f local.yml up
``` ```
Install file preview dependencies
```shell
$ docker-compose -f local.yml exec django /install_preview_dependencies
```
- server - http://127.0.0.1:8000 - server - http://127.0.0.1:8000
- mail - http://127.0.0.1:8025 - mail - http://127.0.0.1:8025

View File

51
akarpov/common/ml/text.py Normal file
View File

@ -0,0 +1,51 @@
import pycld2 as cld2
import spacy
import torch
from transformers import AutoModel, AutoTokenizer
# Models are loaded lazily on first request and cached in the module-level
# globals below, so importing this module stays cheap.
# TODO: move to outer server/service
nlp = None  # declared as a cache slot; not assigned by the code visible here
ru_nlp = None  # Russian spaCy pipeline ("ru_core_news_md")
ru_model = None  # "DeepPavlov/rubert-base-cased" model
ru_tokenizer = None  # tokenizer loaded from the same checkpoint as ru_model
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_text_embedding(text: str):
    """Return RuBERT embeddings for Russian text, or ``None`` otherwise.

    The language is detected with CLD2. Only reliably detected Russian text
    is embedded. English is recognised but has no model wired up yet, and
    any other language (or an unreliable detection) yields ``None``.

    Args:
        text: Raw text to embed.

    Returns:
        The first element of the model output for the lemmatized text
        (presumably the last hidden state -- confirm against the
        transformers output type), or ``None`` when nothing can be embedded.
    """
    global nlp, ru_nlp, ru_model, ru_tokenizer

    is_reliable, _, details = cld2.detect(text)
    if not is_reliable:
        return None

    # The second item of each detection tuple is the language code.
    lang = details[0][1]
    if lang != "ru":
        # "en" is detected but not embedded yet; other languages are unsupported.
        return None

    # Compare against None explicitly: the cached objects may define their
    # own truthiness (a tokenizer has a length), which must not trigger a reload.
    if ru_nlp is None:
        ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
    lema = " ".join([token.lemma_ for token in ru_nlp(text)])

    if ru_model is None:
        ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
    if ru_tokenizer is None:
        ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

    encodings = ru_tokenizer(
        lema,  # the text to be tokenized
        padding=True,  # pad to the longest sequence in the batch
        # Cut over-long input at the model's position limit instead of
        # letting the forward pass fail on it.
        truncation=True,
        max_length=ru_model.config.max_position_embeddings,
        return_tensors="pt",  # return tensors (not lists)
    )
    with torch.no_grad():
        # inference only: no gradients are needed for the embeddings
        embeds = ru_model(**encodings)
    return embeds[0]

View File

@ -1,10 +1,16 @@
import textract
from akarpov.files.models import File from akarpov.files.models import File
def view(file: File): def view(file: File):
static = "" static = ""
content = "" content = ""
text = file.content.replace("\t", " ") text = (
textract.process(file.file.path, extension="doc", output_encoding="utf8")
.decode("utf8")
.replace("\t", " ")
)
for line in text.split("\n"): for line in text.split("\n"):
content += f"<p class='mt-1'>{line}</p>" content += f"<p class='mt-1'>{line}</p>"
return static, content return static, content

View File

@ -1,10 +1,16 @@
import textract
from akarpov.files.models import File from akarpov.files.models import File
def view(file: File): def view(file: File):
static = "" static = ""
content = "" content = ""
text = file.content.replace("\t", " ") text = (
textract.process(file.file.path, extension="docx", output_encoding="utf8")
.decode("utf8")
.replace("\t", " ")
)
for line in text.split("\n"): for line in text.split("\n"):
content += f"<p class='mt-1'>{line}</p>" content += f"<p class='mt-1'>{line}</p>"
return static, content return static, content

View File

@ -1,10 +1,16 @@
import textract
from akarpov.files.models import File from akarpov.files.models import File
def view(file: File): def view(file: File):
static = "" static = ""
content = "" content = ""
text = file.content.replace("\t", " ") text = (
textract.process(file.file.path, extension="odt", output_encoding="utf8")
.decode("utf8")
.replace("\t", " ")
)
for line in text.split("\n"): for line in text.split("\n"):
content += f"<p class='mt-1'>{line}</p>" content += f"<p class='mt-1'>{line}</p>"
return static, content return static, content

View File

@ -1,3 +1,5 @@
import textract
from akarpov.files.models import File from akarpov.files.models import File
@ -5,7 +7,11 @@ def view(file: File) -> (str, str):
static = f""" static = f"""
<meta property="og:title" content="{file.name}" /> <meta property="og:title" content="{file.name}" />
""" """
text = file.content.replace("\t", " ") text = (
textract.process(file.file.path, extension="ogg", output_encoding="utf8")
.decode("utf8")
.replace("\t", " ")
)
content = ( content = (
""" """
<div id="waveform"> <div id="waveform">

View File

@ -0,0 +1,42 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
# Set up stop words
# NOTE: built at import time; presumably requires the NLTK "stopwords"
# corpus to be downloaded beforehand -- confirm against the deployment setup.
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))
# Set up lemmatizers
# Left as None here and created by lemmatize_and_remove_stopwords() the
# first time each language is actually needed.
english_lemmatizer = None
russian_lemmatizer = None
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize ``text`` and drop English and Russian stop words.

    Args:
        text: Text to normalise.
        language: ``"russian"`` selects the pymorphy3 analyzer; any other
            value falls back to the English WordNet lemmatizer.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    global english_lemmatizer, russian_lemmatizer

    use_russian = language == "russian"
    kept = []
    for word in word_tokenize(text):
        if use_russian:
            # build the analyzer only when a Russian token is first seen
            russian_lemmatizer = russian_lemmatizer or MorphAnalyzer()
            lemma = russian_lemmatizer.parse(word)[0].normal_form
        else:
            # default to English
            english_lemmatizer = english_lemmatizer or WordNetLemmatizer()
            lemma = english_lemmatizer.lemmatize(word)

        # stop words of both languages are filtered, whatever `language` is
        if not (lemma in english_stopwords or lemma in russian_stopwords):
            kept.append(lemma)

    return " ".join(kept)

View File

@ -1,4 +1,8 @@
from math import ceil
import magic import magic
from PIL import Image, ImageDraw, ImageFont
from preview_generator.manager import PreviewManager
from akarpov.files.models import File from akarpov.files.models import File
@ -15,11 +19,90 @@
manager = None manager = None
def textfile_to_image(textfile_path) -> Image.Image:
    """Render the contents of a text file as a grayscale image.

    Args:
        textfile_path: Path of the text file whose lines are drawn.

    Returns:
        A new image with a white background and the file's lines drawn in
        black, one row per line. An empty file yields an image that
        contains only the margins.
    """
    # Right-strip every line so trailing whitespace does not widen the image.
    # Fall back to a single blank line so the max() calls below never see an
    # empty sequence (which would raise ValueError for an empty file).
    with open(textfile_path) as f:
        lines = tuple(line.rstrip() for line in f) or ("",)

    font = None
    large_font = 20  # get better resolution with larger size
    for font_filename in COMMON_MONO_FONT_FILENAMES:
        try:
            font = ImageFont.truetype(font_filename, size=large_font)
            print(f'Using font "{font_filename}".')
            break
        except OSError:
            print(f'Could not load font "{font_filename}".')
    if font is None:
        font = ImageFont.load_default()
        print("Using default font.")

    def _font_points_to_pixels(pt):
        # 72 points per inch rendered at 96 pixels per inch
        return round(pt * 96.0 / 72)

    def _text_size(line):
        """Return ``(width, height)`` of ``line`` for the selected font."""
        if hasattr(font, "getbbox"):
            # ``getsize`` was removed in Pillow 10. The bounding box gives the
            # same numbers: width is right - left, height is the bottom edge.
            left, _, right, bottom = font.getbbox(line)
            return right - left, bottom
        # older Pillow releases whose fonts only offer ``getsize``
        return font.getsize(line)

    margin_pixels = 20

    # height of the background image
    max_line_height = _font_points_to_pixels(
        max(_text_size(line)[PIL_HEIGHT_INDEX] for line in lines)
    )
    # rows are placed at 80% of the tallest measured line height
    realistic_line_height = max_line_height * 0.8
    image_height = int(ceil(realistic_line_height * len(lines) + 2 * margin_pixels))

    # width of the background image
    max_line_width = _font_points_to_pixels(
        max(_text_size(line)[PIL_WIDTH_INDEX] for line in lines)
    )
    image_width = int(ceil(max_line_width + (2 * margin_pixels)))

    # draw the background
    background_color = 255  # white
    image = Image.new(
        PIL_GRAYSCALE, (image_width, image_height), color=background_color
    )
    draw = ImageDraw.Draw(image)

    # draw each line of text
    font_color = 0  # black
    horizontal_position = margin_pixels
    for i, line in enumerate(lines):
        vertical_position = int(round(margin_pixels + (i * realistic_line_height)))
        draw.text(
            (horizontal_position, vertical_position), line, fill=font_color, font=font
        )

    return image
def create_preview(file_path: str) -> str:
    """Return the JPEG preview for ``file_path``, or ``""`` if none can be made.

    The shared ``PreviewManager`` is created on first use and reused by
    later calls. The preview is requested with a height of 500.
    """
    global manager
    # TODO: add text image generation/code image
    manager = manager or PreviewManager(cache_path, create_folder=True)

    if not manager.has_jpeg_preview(file_path):
        return ""
    return manager.get_jpeg_preview(file_path, height=500)
def get_file_mimetype(file_path: str) -> str: def get_file_mimetype(file_path: str) -> str:
mime = magic.Magic(mime=True) mime = magic.Magic(mime=True)
return mime.from_file(file_path) return mime.from_file(file_path)
def get_description(file_path: str) -> str:
    """Return the text preview for ``file_path``, or ``""`` if none exists.

    The shared ``PreviewManager`` is created on first use and reused by
    later calls.
    """
    global manager
    manager = manager or PreviewManager(cache_path, create_folder=True)

    if not manager.has_text_preview(file_path):
        return ""
    return manager.get_text_preview(file_path)
def get_base_meta(file: File): def get_base_meta(file: File):
preview = file.preview.url if file.preview else "" preview = file.preview.url if file.preview else ""
description = file.description if file.description else "" description = file.description if file.description else ""

View File

@ -11,6 +11,12 @@
from akarpov.files.models import File from akarpov.files.models import File
from ..documents import FileDocument from ..documents import FileDocument
from .lema import lemmatize_and_remove_stopwords
"""
Calculus on types of searches:
https://new.akarpov.ru/files/FZUTFBIyfbdlDHVzxUNU
"""
class BaseSearch: class BaseSearch:
@ -134,20 +140,23 @@ class SimilaritySearch(BaseSearch):
def search(self, query: str) -> QuerySet[File]: def search(self, query: str) -> QuerySet[File]:
if self.queryset is None: if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search") raise ValueError("Queryset cannot be None for similarity search")
language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
queryset = ( queryset = (
self.queryset.annotate( self.queryset.annotate(
name_similarity=Coalesce( name_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("name"), query), TrigramSimilarity(UnaccentLower("name"), filtered_query),
Value(0), Value(0),
output_field=FloatField(), output_field=FloatField(),
), ),
description_similarity=Coalesce( description_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("description"), query), TrigramSimilarity(UnaccentLower("description"), filtered_query),
Value(0), Value(0),
output_field=FloatField(), output_field=FloatField(),
), ),
content_similarity=Coalesce( content_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("content"), query), TrigramSimilarity(UnaccentLower("content"), filtered_query),
Value(0), Value(0),
output_field=FloatField(), output_field=FloatField(),
), ),

View File

@ -0,0 +1,18 @@
import chardet
import textract
from textract.exceptions import ExtensionNotSupported
def extract_file_text(file: str) -> str:
    """Extract the textual content of a file.

    ``textract`` is tried first. When it does not support the file's
    extension, the file is read as plain text using the encoding guessed by
    ``chardet``.

    Args:
        file: Path of the file to read.

    Returns:
        The extracted text as ``str``, or an empty string when the
        plain-text fallback fails.
    """
    try:
        text = textract.process(file)
    except ExtensionNotSupported:
        try:
            # close the handle deterministically instead of leaking it
            with open(file, "rb") as raw:
                rawdata = raw.read()
            enc = chardet.detect(rawdata)
            with open(file, encoding=enc["encoding"]) as f:
                text = f.read()
        except Exception:
            # best effort: an unreadable file simply has no text content
            return ""

    if isinstance(text, bytes):
        # textract hands back encoded bytes (UTF-8 by default); decode so both
        # code paths return ``str`` as the signature promises
        text = text.decode("utf-8", errors="replace")
    return text

View File

@ -1,69 +1,40 @@
import base64 import os
import time import time
from urllib.parse import urljoin
import requests
import structlog import structlog
from celery import shared_task from celery import shared_task
from django.conf import settings
from django.core import management from django.core import management
from django.core.files.base import ContentFile from django.core.files import File
from akarpov.files.models import File as FileModel from akarpov.files.models import File as FileModel
from akarpov.files.services.preview import create_preview, get_file_mimetype
from akarpov.files.services.text import extract_file_text
logger = structlog.get_logger(__name__) logger = structlog.get_logger(__name__)
def sanitize_content(content):
"""Remove NUL (0x00) characters from the content."""
if isinstance(content, str):
return content.replace("\x00", "")
elif isinstance(content, bytes):
return content.replace(b"\x00", b"")
return content
@shared_task() @shared_task()
def process_file(pk: int): def process_file(pk: int):
pth = None
file = FileModel.objects.get(pk=pk) file = FileModel.objects.get(pk=pk)
if not file.name: if not file.name:
file.name = file.file.name.split("/")[-1] file.name = file.file.name.split("/")[-1]
try: try:
api_url = urljoin(settings.PREVIEW_SERVICE_URL, "/process_file/") pth = create_preview(file.file.path)
if pth:
files = {"file": (file.name, file.file.open("rb"))} with open(pth, "rb") as f:
headers = {
"X-API-Key": settings.PREVIEW_SERVICE_API_KEY,
"Accept": "application/json",
}
response = requests.post(api_url, files=files, headers=headers)
if response.status_code != 200:
logger.error(f"Failed to process file {pk}: {response.text}")
return
result = response.json()
file.file_type = result["file_type"]
file.content = sanitize_content(result["content"])
if result["preview"]:
image_data = base64.b64decode(result["preview"])
file.preview.save( file.preview.save(
f"{file.name}_preview.jpg", ContentFile(image_data), save=False pth.split("/")[-1],
File(f),
save=False,
) )
file.save()
logger.info(f"File {pk} processed successfully")
except Exception as e: except Exception as e:
logger.error(f"Error processing file {pk}: {str(e)}") logger.error(e)
finally: file.file_type = get_file_mimetype(file.file.path)
file.file.close() file.content = extract_file_text(file.file.path)
file.save(update_fields=["preview", "name", "file_type", "content"])
if pth and os.path.isfile(pth):
os.remove(pth)
return pk return pk

View File

@ -6,7 +6,6 @@
from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly
from akarpov.music.api.serializers import ( from akarpov.music.api.serializers import (
AddSongToPlaylistSerializer, AddSongToPlaylistSerializer,
AllSearchSerializer,
AnonMusicUserSerializer, AnonMusicUserSerializer,
FullAlbumSerializer, FullAlbumSerializer,
FullAuthorSerializer, FullAuthorSerializer,
@ -20,6 +19,7 @@
ListSongSlugsSerializer, ListSongSlugsSerializer,
PlaylistSerializer, PlaylistSerializer,
SongSerializer, SongSerializer,
AllSearchSerializer,
) )
from akarpov.music.models import ( from akarpov.music.models import (
Album, Album,
@ -29,7 +29,7 @@
SongUserRating, SongUserRating,
UserListenHistory, UserListenHistory,
) )
from akarpov.music.services.search import search_album, search_author, search_song from akarpov.music.services.search import search_song, search_album, search_author
from akarpov.music.tasks import listen_to_song from akarpov.music.tasks import listen_to_song
from akarpov.users.models import User from akarpov.users.models import User

View File

@ -1,7 +1,7 @@
from django_elasticsearch_dsl import Document, fields from django_elasticsearch_dsl import Document, fields
from django_elasticsearch_dsl.registries import registry from django_elasticsearch_dsl.registries import registry
from akarpov.music.models import Album, Author, Song from akarpov.music.models import Song, Album, Author
@registry.register_document @registry.register_document

View File

@ -3,8 +3,8 @@
from django_elasticsearch_dsl.registries import registry from django_elasticsearch_dsl.registries import registry
from elasticsearch_dsl import Q as ES_Q from elasticsearch_dsl import Q as ES_Q
from akarpov.music.documents import AlbumDocument, AuthorDocument, SongDocument from akarpov.music.documents import SongDocument, AlbumDocument, AuthorDocument
from akarpov.music.models import Album, Author, Song from akarpov.music.models import Song, Author, Album
def search_song(query): def search_song(query):

View File

@ -35,8 +35,7 @@ def album_create(sender, instance, created, **kwargs):
@receiver(post_save) @receiver(post_save)
def send_que_status(sender, instance, created, **kwargs): def send_que_status(sender, instance, created, **kwargs): ...
...
@receiver(pre_save, sender=SongUserRating) @receiver(pre_save, sender=SongUserRating)

View File

@ -4,6 +4,7 @@
import pylast import pylast
import spotipy import spotipy
import structlog import structlog
import ytmusicapi
from asgiref.sync import async_to_sync from asgiref.sync import async_to_sync
from celery import shared_task from celery import shared_task
from channels.layers import get_channel_layer from channels.layers import get_channel_layer
@ -11,7 +12,6 @@
from django.utils import timezone from django.utils import timezone
from django.utils.timezone import now from django.utils.timezone import now
from spotipy import SpotifyClientCredentials from spotipy import SpotifyClientCredentials
from ytmusicapi import YTMusic
from akarpov.music.api.serializers import SongSerializer from akarpov.music.api.serializers import SongSerializer
from akarpov.music.models import ( from akarpov.music.models import (

View File

@ -28,6 +28,8 @@ RUN apt-get update && \
apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev && \ apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev && \
# Dependencies for file preview generation # Dependencies for file preview generation
apt-get install -y webp git libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \ apt-get install -y webp git libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \
# ML dependencies \
# none for now
apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \ apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
@ -46,6 +48,7 @@ RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -
COPY . . COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl RUN poetry build && /venv/bin/pip install dist/*.whl
RUN /venv/bin/python -m nltk.downloader punkt stopwords wordnet
COPY ./compose/production/django/entrypoint /entrypoint COPY ./compose/production/django/entrypoint /entrypoint
@ -68,6 +71,10 @@ COPY ./compose/local/django/start-redirect /start-redirect
RUN sed -i 's/\r$//g' /start-redirect RUN sed -i 's/\r$//g' /start-redirect
RUN chmod +x /start-redirect RUN chmod +x /start-redirect
COPY ./compose/local/django/install_preview_dependencies /install_preview_dependencies
RUN sed -i 's/\r$//g' /install_preview_dependencies
RUN chmod +x /install_preview_dependencies
COPY ./compose/local/django/celery/worker/start /start-celeryworker COPY ./compose/local/django/celery/worker/start /start-celeryworker
RUN sed -i 's/\r$//g' /start-celeryworker RUN sed -i 's/\r$//g' /start-celeryworker
RUN chmod +x /start-celeryworker RUN chmod +x /start-celeryworker

View File

@ -3,4 +3,6 @@
set -o errexit set -o errexit
set -o nounset set -o nounset
/install_preview_dependencies
celery -A config.celery_app worker --autoscale 20 -l INFO celery -A config.celery_app worker --autoscale 20 -l INFO

View File

@ -0,0 +1,14 @@
#!/bin/bash
# Install the system packages used for file preview generation, install the
# draw.io desktop package, then let `preview` verify its dependencies.
# Invoked by the celery worker start script and, manually, through
# `docker-compose exec`; presumably runs as root inside the container.

apt-get update
# -y is needed on every install: without it apt-get stops at the confirmation
# prompt and aborts when no terminal is attached.
apt-get install -y wget libnotify4 scribus libappindicator3-1 libayatana-indicator3-7 libdbusmenu-glib4 libdbusmenu-gtk3-4
apt-get install -y poppler-utils libfile-mimeinfo-perl ghostscript libsecret-1-0 zlib1g-dev libjpeg-dev imagemagick libmagic1 libreoffice inkscape xvfb
apt-get install -y libxml2-dev libxslt1-dev antiword unrtf tesseract-ocr flac lame libmad0 libsox-fmt-mp3 sox swig
apt-get install -y python-dev-is-python3 libxml2-dev libxslt1-dev antiword unrtf poppler-utils tesseract-ocr \
flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig

# draw.io is not installed from the apt repositories above; fetch the release .deb directly
wget https://github.com/jgraph/drawio-desktop/releases/download/v13.0.3/draw.io-amd64-13.0.3.deb
dpkg -i draw.io-amd64-13.0.3.deb
rm draw.io-amd64-13.0.3.deb

# drop packages that are no longer needed and the apt package lists
apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
rm -rf /var/lib/apt/lists/*

preview --check-dependencies

View File

@ -758,9 +758,3 @@
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https") SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
USE_X_FORWARDED_HOST = True USE_X_FORWARDED_HOST = True
USE_X_FORWARDED_PORT = True USE_X_FORWARDED_PORT = True
# PREVIEW
# ------------------------------------------------------------------------------
PREVIEW_SERVICE_URL = env("PREVIEW_SERVICE_URL", default=None)
PREVIEW_SERVICE_API_KEY = env("PREVIEW_SERVICE_API_KEY", default=None)

5764
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -69,6 +69,7 @@ channels = {extras = ["daphne"], version = "^4.0.0"}
django-upload-validator = "^1.1.6" django-upload-validator = "^1.1.6"
markdown = "^3.4.4" markdown = "^3.4.4"
pydotplus = "^2.0.2" pydotplus = "^2.0.2"
preview-generator = "^0.29"
uuid = "^1.30" uuid = "^1.30"
mutagen = "^1.46.0" mutagen = "^1.46.0"
pydub = "^0.25.1" pydub = "^0.25.1"
@ -99,8 +100,11 @@ pytest-mock = "^3.11.1"
pytest-asyncio = "^0.21.1" pytest-asyncio = "^0.21.1"
pytest-lambda = "^2.2.0" pytest-lambda = "^2.2.0"
pgvector = "^0.2.2" pgvector = "^0.2.2"
pycld2 = "^0.41"
uuid6 = "^2023.5.2" uuid6 = "^2023.5.2"
uvicorn = "0.23.2" uvicorn = "0.23.2"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142" pymorphy3-dicts-ru = "^2.4.417150.4580142"
fastapi = "0.103.0" fastapi = "0.103.0"
pydantic-settings = "^2.0.3" pydantic-settings = "^2.0.3"
@ -114,9 +118,9 @@ spotdl = "^4.2.4"
fuzzywuzzy = "^0.18.0" fuzzywuzzy = "^0.18.0"
python-levenshtein = "^0.23.0" python-levenshtein = "^0.23.0"
pylast = "^5.2.0" pylast = "^5.2.0"
textract = {git = "https://github.com/Alexander-D-Karpov/textract.git", branch = "master"}
librosa = "^0.10.1" librosa = "^0.10.1"
django-ckeditor-5 = "^0.2.12" django-ckeditor-5 = "^0.2.12"
chardet = "^5.2.0"
[build-system] [build-system]

0
search/__init__.py Normal file
View File

6
search/pipeline.py Normal file
View File

@ -0,0 +1,6 @@
from haystack import Document
from milvus_haystack import MilvusDocumentStore
# Write one sample document to a Milvus-backed store and read every stored
# document back; the result of the read is discarded.
ds = MilvusDocumentStore()
sample_documents = [Document("Some Content")]
ds.write_documents(sample_documents)
ds.get_all_documents()

2185
search/poetry.lock generated Normal file

File diff suppressed because it is too large Load Diff

18
search/pyproject.toml Normal file
View File

@ -0,0 +1,18 @@
[tool.poetry]
name = "search"
version = "0.1.0"
description = ""
authors = ["Alexander-D-Karpov <alexandr.d.karpov@gmail.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
fastapi = "0.99.1"
pydantic = "1.10.13"
transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

4
spacy_setup.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/bash
# Download the spaCy pipelines used by the project.
python -m spacy download en_core_web_lg
python -m spacy download xx_sent_ud_sm
python -m spacy download ru_core_news_lg
# akarpov/common/ml/text.py loads the medium Russian pipeline by name, so it
# has to be installed as well.
python -m spacy download ru_core_news_md