2025-09-26 04:36:34 +03:00
30 changed files with 5586 additions and 2770 deletions
--- a/.env.example
+++ b/.env.example
@ -15,5 +15,3 @@ LAST_FM_SECRET=
 SPOTIFY_ID=
 SPOTIFY_SECRET=
 YANDEX_TOKEN=
-PREVIEW_SERVICE_API_KEY=
-PREVIEW_SERVICE_URL=
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -24,10 +24,10 @@ jobs:

    steps:
      - name: Checkout Code Repository
-        uses: actions/checkout@v4.2.1
+        uses: actions/checkout@v3

      - name: Cache packages
-        uses: actions/cache@v4.1.1
+        uses: actions/cache@v3
        id: cache-packages
        with:
          path: "~/packages/"
@ -45,18 +45,18 @@ jobs:
            sudo dpkg -L libimage-exiftool-perl libmagickwand-dev  | while IFS= read -r f; do if test -f $f; then echo $f; fi; done | xargs cp --parents --target-directory ~/packages/
          fi

-      - uses: actions/checkout@v4.2.1
+      - uses: actions/checkout@v3
      - name: Install poetry
        run: pipx install poetry

-      - uses: actions/setup-python@v5.2.0
+      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'poetry'
      - run: poetry install

      - name: Run pre-commit
-        uses: pre-commit/action@v3.0.1
+        uses: pre-commit/action@v2.0.3

  # With no caching at all the entire ci process takes 4m 30s to complete!
  pytest:
@ -64,10 +64,7 @@ jobs:

    steps:
      - name: Checkout Code Repository
-        uses: actions/checkout@v4.2.1
-
-      - name: Install Docker Compose
-        run: sudo apt-get update && sudo apt-get install -y docker-compose
+        uses: actions/checkout@v3

      - name: Build the Stack
        run:  docker-compose -f local.yml build
--- a/README.md
+++ b/README.md
@ -28,6 +28,10 @@ $ uvicorn redirect.app:app --reload
 ```shell
 $ docker-compose -f local.yml up
 ```
+Install file preview dependencies
+```shell
+$ docker-compose -f local.yml exec django /install_preview_dependencies
+```
 - server - http://127.0.0.1:8000
 - mail - http://127.0.0.1:8025

--- a/akarpov/common/ml/init.py
+++ b/akarpov/common/ml/init.py
--- a/akarpov/common/ml/text.py
+++ b/akarpov/common/ml/text.py
@ -0,0 +1,51 @@
+import pycld2 as cld2
+import spacy
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+# load ml classes and models on first request
+# TODO: move to outer server/service
+# Module-level caches: get_text_embedding() fills the ru_* entries the first
+# time it handles Russian text, so the heavy models are loaded only once.
+nlp = None  # NOTE(review): never assigned in the visible code
+ru_nlp = None
+
+ru_model = None
+ru_tokenizer = None
+# NOTE(review): `device` is computed here but not used by get_text_embedding();
+# the model and the encodings are never moved to it.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def get_text_embedding(text: str):
+    """Return RuBERT token embeddings for Russian ``text``, otherwise ``None``.
+
+    The language is detected with cld2. ``None`` is returned when detection
+    is not reliable, when the detected language is neither ``ru`` nor ``en``,
+    and for English text (no English embedding model is wired up yet).
+
+    For Russian text the input is lemmatized with spaCy, tokenized and passed
+    through ``DeepPavlov/rubert-base-cased``; the first element of the model
+    output is returned. spaCy pipeline, model and tokenizer are loaded on
+    first use and cached in module globals.
+    """
+    global nlp, ru_nlp, ru_model, ru_tokenizer
+
+    is_reliable, _text_bytes_found, details = cld2.detect(text)
+    if not is_reliable:
+        return None
+
+    # details[0] is the first detection result; its element 1 is compared
+    # against language codes.
+    lang = details[0][1]
+    if lang != "ru":
+        # "en" has no embedding model yet; every other language is unsupported.
+        return None
+
+    if ru_nlp is None:
+        ru_nlp = spacy.load("ru_core_news_md", disable=["parser", "ner"])
+    if ru_model is None:
+        ru_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
+    if ru_tokenizer is None:
+        ru_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
+
+    lema = " ".join(token.lemma_ for token in ru_nlp(text))
+    encodings = ru_tokenizer(
+        lema,  # the text to be tokenized
+        padding=True,
+        # BERT-base models accept at most 512 positions; without truncation
+        # a longer text makes the forward pass fail.
+        truncation=True,
+        max_length=512,
+        return_tensors="pt",  # return the tensors (not lists)
+    )
+    with torch.no_grad():
+        # get the model embeddings
+        embeds = ru_model(**encodings)
+    return embeds[0]
--- a/akarpov/files/previews/application/doc.py
+++ b/akarpov/files/previews/application/doc.py
@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
+    """Render the text of a .doc file as HTML paragraphs.
+
+    Returns a ``(static, content)`` pair; ``static`` is always empty here and
+    ``content`` holds one ``<p>`` element per line of extracted text.
+    """
+    # Local import keeps the change inside this function.
+    import html
+
    static = ""
    content = ""
-    text = file.content.replace("\t", "    ")
+    # The text comes from an uploaded document and is interpolated into HTML
+    # below, so escape it first to prevent markup/script injection.
+    text = html.escape(
+        textract.process(file.file.path, extension="doc", output_encoding="utf8")
+        .decode("utf8")
+    ).replace("\t", "    ")
    for line in text.split("\n"):
        content += f"<p class='mt-1'>{line}</p>"
    return static, content
--- a/akarpov/files/previews/application/docx.py
+++ b/akarpov/files/previews/application/docx.py
@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
+    """Render the text of a .docx file as HTML paragraphs.
+
+    Returns a ``(static, content)`` pair; ``static`` is always empty here and
+    ``content`` holds one ``<p>`` element per line of extracted text.
+    """
+    # Local import keeps the change inside this function.
+    import html
+
    static = ""
    content = ""
-    text = file.content.replace("\t", "    ")
+    # The text comes from an uploaded document and is interpolated into HTML
+    # below, so escape it first to prevent markup/script injection.
+    text = html.escape(
+        textract.process(file.file.path, extension="docx", output_encoding="utf8")
+        .decode("utf8")
+    ).replace("\t", "    ")
    for line in text.split("\n"):
        content += f"<p class='mt-1'>{line}</p>"
    return static, content
--- a/akarpov/files/previews/application/odt.py
+++ b/akarpov/files/previews/application/odt.py
@ -1,10 +1,16 @@
+import textract
+
 from akarpov.files.models import File


 def view(file: File):
+    """Render the text of an .odt file as HTML paragraphs.
+
+    Returns a ``(static, content)`` pair; ``static`` is always empty here and
+    ``content`` holds one ``<p>`` element per line of extracted text.
+    """
+    # Local import keeps the change inside this function.
+    import html
+
    static = ""
    content = ""
-    text = file.content.replace("\t", "    ")
+    # The text comes from an uploaded document and is interpolated into HTML
+    # below, so escape it first to prevent markup/script injection.
+    text = html.escape(
+        textract.process(file.file.path, extension="odt", output_encoding="utf8")
+        .decode("utf8")
+    ).replace("\t", "    ")
    for line in text.split("\n"):
        content += f"<p class='mt-1'>{line}</p>"
    return static, content
--- a/akarpov/files/previews/audio/oga.py
+++ b/akarpov/files/previews/audio/oga.py
@ -1,3 +1,5 @@
+import textract
+
 from akarpov.files.models import File


@ -5,7 +7,11 @@ def view(file: File) -> (str, str):
    static = f"""
    <meta property="og:title" content="{file.name}" />
    """
-    text = file.content.replace("\t", "    ")
+    text = (
+        textract.process(file.file.path, extension="ogg", output_encoding="utf8")
+        .decode("utf8")
+        .replace("\t", "    ")
+    )
    content = (
        """
    <div id="waveform">
--- a/akarpov/files/services/lema.py
+++ b/akarpov/files/services/lema.py
@ -0,0 +1,42 @@
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from pymorphy3 import MorphAnalyzer
+
+# Set up stop words
+# Both lists are read when this module is imported.
+# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be
+# installed before import — confirm every environment downloads it.
+english_stopwords = set(stopwords.words("english"))
+russian_stopwords = set(stopwords.words("russian"))
+
+# Set up lemmatizers
+# Created lazily by lemmatize_and_remove_stopwords() on first use.
+english_lemmatizer = None
+russian_lemmatizer = None
+
+
+def lemmatize_and_remove_stopwords(text, language="english"):
+    """Lemmatize ``text`` and drop English and Russian stop words.
+
+    ``language`` selects the lemmatizer: ``"russian"`` uses pymorphy3's
+    ``MorphAnalyzer`` (first parse, normal form); any other value falls back
+    to NLTK's ``WordNetLemmatizer``. Stop words of both languages are removed
+    regardless of ``language``. Returns the remaining lemmas joined by single
+    spaces (an empty string when nothing is left).
+    """
+    global english_lemmatizer, russian_lemmatizer
+
+    # Tokenize the text
+    tokens = word_tokenize(text)
+    if not tokens:
+        return ""
+
+    # Pick (and lazily create) the lemmatizer once instead of once per token.
+    if language == "russian":
+        if russian_lemmatizer is None:
+            russian_lemmatizer = MorphAnalyzer()
+        analyzer = russian_lemmatizer
+
+        def lemmatize(token):
+            return analyzer.parse(token)[0].normal_form
+
+    else:  # Default to English
+        if english_lemmatizer is None:
+            english_lemmatizer = WordNetLemmatizer()
+        lemmatize = english_lemmatizer.lemmatize
+
+    filtered_tokens = []
+    for token in tokens:
+        lemma = lemmatize(token)
+        # Compare case-insensitively so capitalized stop words ("The", "И")
+        # are removed as well; the kept lemma retains its original case.
+        lowered = lemma.lower()
+        if lowered not in english_stopwords and lowered not in russian_stopwords:
+            filtered_tokens.append(lemma)
+
+    # Reconstruct the text
+    return " ".join(filtered_tokens)
--- a/akarpov/files/services/preview.py
+++ b/akarpov/files/services/preview.py
@ -1,4 +1,8 @@
+from math import ceil
+
 import magic
+from PIL import Image, ImageDraw, ImageFont
+from preview_generator.manager import PreviewManager

 from akarpov.files.models import File

@ -15,11 +19,90 @@
 manager = None


+def textfile_to_image(textfile_path) -> Image.Image:
+    """Convert text file to a grayscale image.
+
+    arguments:
+    textfile_path - the content of this file will be converted to an image
+
+    The first font from COMMON_MONO_FONT_FILENAMES that loads is used; when
+    none loads, PIL's default font is used instead. An empty file is rendered
+    as a single blank line instead of raising.
+
+    COMMON_MONO_FONT_FILENAMES, PIL_WIDTH_INDEX, PIL_HEIGHT_INDEX and
+    PIL_GRAYSCALE are expected to be defined elsewhere in this module (they
+    are not part of this hunk).
+    """
+    # parse the file into lines stripped of whitespace on the right side
+    with open(textfile_path) as f:
+        lines = tuple(line.rstrip() for line in f)
+    if not lines:
+        # max() below needs at least one line to measure
+        lines = ("",)
+
+    font = None
+    large_font = 20  # get better resolution with larger size
+    for font_filename in COMMON_MONO_FONT_FILENAMES:
+        try:
+            font = ImageFont.truetype(font_filename, size=large_font)
+            print(f'Using font "{font_filename}".')
+            break
+        except OSError:
+            print(f'Could not load font "{font_filename}".')
+    if font is None:
+        font = ImageFont.load_default()
+        print("Using default font.")
+
+    def _font_points_to_pixels(pt):
+        return round(pt * 96.0 / 72)
+
+    margin_pixels = 20
+
+    # Measure every line once and reuse the result for width and height.
+    # NOTE(review): ImageFont.getsize() was removed in Pillow 10 — confirm the
+    # pinned Pillow version or migrate to getbbox().
+    sizes = [font.getsize(line) for line in lines]
+
+    # height of the background image
+    max_line_height = _font_points_to_pixels(
+        max(size[PIL_HEIGHT_INDEX] for size in sizes)
+    )
+    realistic_line_height = max_line_height * 0.8
+    image_height = int(ceil(realistic_line_height * len(lines) + 2 * margin_pixels))
+
+    # width of the background image
+    max_line_width = _font_points_to_pixels(
+        max(size[PIL_WIDTH_INDEX] for size in sizes)
+    )
+    image_width = int(ceil(max_line_width + (2 * margin_pixels)))
+
+    # draw the background
+    background_color = 255  # white
+    image = Image.new(
+        PIL_GRAYSCALE, (image_width, image_height), color=background_color
+    )
+    draw = ImageDraw.Draw(image)
+
+    # draw each line of text
+    font_color = 0
+    horizontal_position = margin_pixels
+    for i, line in enumerate(lines):
+        vertical_position = int(round(margin_pixels + (i * realistic_line_height)))
+        draw.text(
+            (horizontal_position, vertical_position), line, fill=font_color, font=font
+        )
+
+    return image
+
+
+def create_preview(file_path: str) -> str:
+    """Return the manager's JPEG preview (height 500) for ``file_path``.
+
+    The shared PreviewManager is created on first use. An empty string is
+    returned when the manager reports no JPEG preview for the file.
+    """
+    global manager
+    # TODO: add text image generation/code image
+    if not manager:
+        manager = PreviewManager(cache_path, create_folder=True)
+    if not manager.has_jpeg_preview(file_path):
+        return ""
+    return manager.get_jpeg_preview(file_path, height=500)
+
+
 def get_file_mimetype(file_path: str) -> str:
    mime = magic.Magic(mime=True)
    return mime.from_file(file_path)


+def get_description(file_path: str) -> str:
+    """Return the manager's text preview for ``file_path``.
+
+    The shared PreviewManager is created on first use. An empty string is
+    returned when the manager reports no text preview for the file.
+    """
+    global manager
+    if not manager:
+        manager = PreviewManager(cache_path, create_folder=True)
+
+    if not manager.has_text_preview(file_path):
+        return ""
+    return manager.get_text_preview(file_path)
+
+
 def get_base_meta(file: File):
    preview = file.preview.url if file.preview else ""
    description = file.description if file.description else ""
--- a/akarpov/files/services/search.py
+++ b/akarpov/files/services/search.py
@ -11,6 +11,12 @@
 from akarpov.files.models import File

 from ..documents import FileDocument
+from .lema import lemmatize_and_remove_stopwords
+
+"""
+Calculus on types of searches:
+https://new.akarpov.ru/files/FZUTFBIyfbdlDHVzxUNU
+"""


 class BaseSearch:
@ -134,20 +140,23 @@ class SimilaritySearch(BaseSearch):
    def search(self, query: str) -> QuerySet[File]:
        if self.queryset is None:
            raise ValueError("Queryset cannot be None for similarity search")
+
+        language = "russian" if re.search("[а-яА-Я]", query) else "english"
+        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
        queryset = (
            self.queryset.annotate(
                name_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("name"), query),
+                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
                    Value(0),
                    output_field=FloatField(),
                ),
                description_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("description"), query),
+                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
                    Value(0),
                    output_field=FloatField(),
                ),
                content_similarity=Coalesce(
-                    TrigramSimilarity(UnaccentLower("content"), query),
+                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
                    Value(0),
                    output_field=FloatField(),
                ),
--- a/akarpov/files/services/text.py
+++ b/akarpov/files/services/text.py
@ -0,0 +1,18 @@
+import chardet
+import textract
+from textract.exceptions import ExtensionNotSupported
+
+
+def extract_file_text(file: str) -> str:
+    """Extract the text content of the file at path ``file``.
+
+    textract is tried first. When it does not support the file extension,
+    the file is read as plain text using the encoding guessed by chardet.
+    Returns "" when the plain-text fallback fails. Other textract errors
+    propagate to the caller.
+
+    NUL characters are removed from the result.
+    NOTE(review): this presumably matters because the text is stored in a
+    database text column — confirm against the model.
+    """
+    try:
+        # textract returns bytes; decode so the result always matches ``-> str``
+        text = textract.process(file, output_encoding="utf8").decode(
+            "utf8", errors="replace"
+        )
+    except ExtensionNotSupported:
+        try:
+            with open(file, "rb") as raw:
+                rawdata = raw.read()
+            # chardet reports None when it cannot guess; fall back to UTF-8
+            encoding = chardet.detect(rawdata)["encoding"] or "utf-8"
+            with open(file, encoding=encoding) as f:
+                text = f.read()
+        except Exception:
+            return ""
+
+    return text.replace("\x00", "")
--- a/akarpov/files/tasks.py
+++ b/akarpov/files/tasks.py
@ -1,69 +1,40 @@
-import base64
+import os
 import time
-from urllib.parse import urljoin

-import requests
 import structlog
 from celery import shared_task
-from django.conf import settings
 from django.core import management
-from django.core.files.base import ContentFile
+from django.core.files import File

 from akarpov.files.models import File as FileModel
+from akarpov.files.services.preview import create_preview, get_file_mimetype
+from akarpov.files.services.text import extract_file_text

 logger = structlog.get_logger(__name__)


-def sanitize_content(content):
-    """Remove NUL (0x00) characters from the content."""
-    if isinstance(content, str):
-        return content.replace("\x00", "")
-    elif isinstance(content, bytes):
-        return content.replace(b"\x00", b"")
-    return content
-
-
@shared_task()
 def process_file(pk: int):
+    """Generate preview, MIME type and text content for the file ``pk``.
+
+    Preview generation is best-effort: any error there is logged and the
+    remaining fields are still filled and saved. The temporary preview file
+    returned by create_preview() is removed afterwards, even when a later
+    step raises. Returns ``pk``.
+    """
+    pth = None
    file = FileModel.objects.get(pk=pk)
    if not file.name:
        file.name = file.file.name.split("/")[-1]
-
    try:
-        api_url = urljoin(settings.PREVIEW_SERVICE_URL, "/process_file/")
-
-        files = {"file": (file.name, file.file.open("rb"))}
-        headers = {
-            "X-API-Key": settings.PREVIEW_SERVICE_API_KEY,
-            "Accept": "application/json",
-        }
-
-        response = requests.post(api_url, files=files, headers=headers)
-
-        if response.status_code != 200:
-            logger.error(f"Failed to process file {pk}: {response.text}")
-            return
-
-        result = response.json()
-
-        file.file_type = result["file_type"]
-        file.content = sanitize_content(result["content"])
-
-        if result["preview"]:
-            image_data = base64.b64decode(result["preview"])
+        pth = create_preview(file.file.path)
+        if pth:
+            with open(pth, "rb") as f:
                file.preview.save(
-                f"{file.name}_preview.jpg", ContentFile(image_data), save=False
+                    pth.split("/")[-1],
+                    File(f),
+                    save=False,
                )
-
-        file.save()
-
-        logger.info(f"File {pk} processed successfully")
-
    except Exception as e:
-        logger.error(f"Error processing file {pk}: {str(e)}")
-    finally:
-        file.file.close()
-
+        logger.error(e)
+    try:
+        file.file_type = get_file_mimetype(file.file.path)
+        file.content = extract_file_text(file.file.path)
+        file.save(update_fields=["preview", "name", "file_type", "content"])
+    finally:
+        # Always drop the temporary preview, even if the steps above raised.
+        if pth and os.path.isfile(pth):
+            os.remove(pth)
    return pk


--- a/akarpov/music/api/views.py
+++ b/akarpov/music/api/views.py
@ -6,7 +6,6 @@
 from akarpov.common.api.permissions import IsAdminOrReadOnly, IsCreatorOrReadOnly
 from akarpov.music.api.serializers import (
    AddSongToPlaylistSerializer,
-    AllSearchSerializer,
    AnonMusicUserSerializer,
    FullAlbumSerializer,
    FullAuthorSerializer,
@ -20,6 +19,7 @@
    ListSongSlugsSerializer,
    PlaylistSerializer,
    SongSerializer,
+    AllSearchSerializer,
 )
 from akarpov.music.models import (
    Album,
@ -29,7 +29,7 @@
    SongUserRating,
    UserListenHistory,
 )
-from akarpov.music.services.search import search_album, search_author, search_song
+from akarpov.music.services.search import search_song, search_album, search_author
 from akarpov.music.tasks import listen_to_song
 from akarpov.users.models import User

--- a/akarpov/music/documents.py
+++ b/akarpov/music/documents.py
@ -1,7 +1,7 @@
 from django_elasticsearch_dsl import Document, fields
 from django_elasticsearch_dsl.registries import registry

-from akarpov.music.models import Album, Author, Song
+from akarpov.music.models import Song, Album, Author


@registry.register_document
--- a/akarpov/music/services/search.py
+++ b/akarpov/music/services/search.py
@ -3,8 +3,8 @@
 from django_elasticsearch_dsl.registries import registry
 from elasticsearch_dsl import Q as ES_Q

-from akarpov.music.documents import AlbumDocument, AuthorDocument, SongDocument
-from akarpov.music.models import Album, Author, Song
+from akarpov.music.documents import SongDocument, AlbumDocument, AuthorDocument
+from akarpov.music.models import Song, Author, Album


 def search_song(query):
--- a/akarpov/music/signals.py
+++ b/akarpov/music/signals.py
@ -35,8 +35,7 @@ def album_create(sender, instance, created, **kwargs):


@receiver(post_save)
-def send_que_status(sender, instance, created, **kwargs):
-    ...
+def send_que_status(sender, instance, created, **kwargs): ...


@receiver(pre_save, sender=SongUserRating)
--- a/akarpov/music/tasks.py
+++ b/akarpov/music/tasks.py
@ -4,6 +4,7 @@
 import pylast
 import spotipy
 import structlog
+import ytmusicapi
 from asgiref.sync import async_to_sync
 from celery import shared_task
 from channels.layers import get_channel_layer
@ -11,7 +12,6 @@
 from django.utils import timezone
 from django.utils.timezone import now
 from spotipy import SpotifyClientCredentials
-from ytmusicapi import YTMusic

 from akarpov.music.api.serializers import SongSerializer
 from akarpov.music.models import (
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@ -28,6 +28,8 @@ RUN apt-get update && \
    apt-get install -y build-essential libpq-dev gettext libmagic-dev libjpeg-dev zlib1g-dev  && \
    # Dependencies for file preview generation
    apt-get install -y webp git libimage-exiftool-perl libmagickwand-dev ffmpeg libgdal-dev && \
+    # ML dependencies \
+    # none for now
    apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
    rm -rf /var/lib/apt/lists/*

@ -46,6 +48,7 @@ RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -

 COPY . .
 RUN poetry build && /venv/bin/pip install dist/*.whl
+RUN /venv/bin/python -m nltk.downloader punkt stopwords wordnet


 COPY ./compose/production/django/entrypoint /entrypoint
@ -68,6 +71,10 @@ COPY ./compose/local/django/start-redirect /start-redirect
 RUN sed -i 's/\r$//g' /start-redirect
 RUN chmod +x /start-redirect

+COPY ./compose/local/django/install_preview_dependencies /install_preview_dependencies
+RUN sed -i 's/\r$//g' /install_preview_dependencies
+RUN chmod +x /install_preview_dependencies
+
 COPY ./compose/local/django/celery/worker/start /start-celeryworker
 RUN sed -i 's/\r$//g' /start-celeryworker
 RUN chmod +x /start-celeryworker
--- a/compose/local/django/celery/worker/start
+++ b/compose/local/django/celery/worker/start
@ -3,4 +3,6 @@
 set -o errexit
 set -o nounset

+/install_preview_dependencies
+
 celery -A config.celery_app  worker --autoscale 20 -l INFO
--- a/compose/local/django/install_preview_dependencies
+++ b/compose/local/django/install_preview_dependencies
@ -0,0 +1,14 @@
+#!/bin/bash
+# Installs the system packages used for file preview generation and text
+# extraction, then runs the preview tool's dependency check.
+
+apt-get update
+# -y is required: without it apt-get asks for confirmation and aborts when
+# the script runs non-interactively (e.g. from the celery worker start script).
+apt-get install -y wget libnotify4 scribus libappindicator3-1 libayatana-indicator3-7 libdbusmenu-glib4 libdbusmenu-gtk3-4
+apt-get install -y poppler-utils libfile-mimeinfo-perl ghostscript libsecret-1-0 zlib1g-dev libjpeg-dev imagemagick libmagic1 libreoffice inkscape xvfb
+apt-get install -y libxml2-dev libxslt1-dev antiword unrtf tesseract-ocr flac lame libmad0 libsox-fmt-mp3 sox swig
+apt-get install -y python-dev-is-python3 libxml2-dev libxslt1-dev antiword unrtf poppler-utils tesseract-ocr \
+flac ffmpeg lame libmad0 libsox-fmt-mp3 sox libjpeg-dev swig
+# draw.io desktop is installed from the upstream .deb release
+wget https://github.com/jgraph/drawio-desktop/releases/download/v13.0.3/draw.io-amd64-13.0.3.deb
+dpkg -i draw.io-amd64-13.0.3.deb
+rm draw.io-amd64-13.0.3.deb
+apt-get purge -y --auto-remove -o APT:AutoRemove:RecommendsImportant=false && \
+rm -rf /var/lib/apt/lists/*
+preview --check-dependencies
--- a/config/settings/base.py
+++ b/config/settings/base.py
@ -758,9 +758,3 @@
 SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
 USE_X_FORWARDED_HOST = True
 USE_X_FORWARDED_PORT = True
-
-
-# PREVIEW
-# ------------------------------------------------------------------------------
-PREVIEW_SERVICE_URL = env("PREVIEW_SERVICE_URL", default=None)
-PREVIEW_SERVICE_API_KEY = env("PREVIEW_SERVICE_API_KEY", default=None)
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -69,6 +69,7 @@ channels = {extras = ["daphne"], version = "^4.0.0"}
 django-upload-validator = "^1.1.6"
 markdown = "^3.4.4"
 pydotplus = "^2.0.2"
+preview-generator = "^0.29"
 uuid = "^1.30"
 mutagen = "^1.46.0"
 pydub = "^0.25.1"
@ -99,8 +100,11 @@ pytest-mock = "^3.11.1"
 pytest-asyncio = "^0.21.1"
 pytest-lambda = "^2.2.0"
 pgvector = "^0.2.2"
+pycld2 = "^0.41"
 uuid6 = "^2023.5.2"
 uvicorn = "0.23.2"
+nltk = "^3.8.1"
+pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "0.103.0"
 pydantic-settings = "^2.0.3"
@ -114,9 +118,9 @@ spotdl = "^4.2.4"
 fuzzywuzzy = "^0.18.0"
 python-levenshtein = "^0.23.0"
 pylast = "^5.2.0"
+textract = {git = "https://github.com/Alexander-D-Karpov/textract.git", branch = "master"}
 librosa = "^0.10.1"
 django-ckeditor-5 = "^0.2.12"
-chardet = "^5.2.0"


 [build-system]
--- a/search/init.py
+++ b/search/init.py
--- a/search/pipeline.py
+++ b/search/pipeline.py
@ -0,0 +1,6 @@
+from haystack import Document
+from milvus_haystack import MilvusDocumentStore
+
+# NOTE(review): everything below runs at import time — it creates a document
+# store, writes one sample document and reads all documents back, discarding
+# the result. Looks like a connectivity smoke test; confirm before importing
+# this module from application code.
+ds = MilvusDocumentStore()
+ds.write_documents([Document("Some Content")])
+ds.get_all_documents()
--- a/search/poetry.lock
+++ b/search/poetry.lock
--- a/search/pyproject.toml
+++ b/search/pyproject.toml
@ -0,0 +1,18 @@
+[tool.poetry]
+name = "search"
+version = "0.1.0"
+description = ""
+authors = ["Alexander-D-Karpov <alexandr.d.karpov@gmail.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "^3.11"
+fastapi = "0.99.1"
+pydantic = "1.10.13"
+transformers = {version = "4.34.1", extras = ["torch"]}
+torch = ">=2.0.0, !=2.0.1, !=2.1.0"
+farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/spacy_setup.sh
+++ b/spacy_setup.sh
@ -0,0 +1,4 @@
+#!/bin/bash
+# Downloads the spaCy models used by the project.
+python -m spacy download en_core_web_lg
+python -m spacy download xx_sent_ud_sm
+python -m spacy download ru_core_news_lg
+# akarpov/common/ml/text.py loads "ru_core_news_md", so fetch it as well
+python -m spacy download ru_core_news_md