Merge 463f23d70d into 03b0de017c

Bump fastapi from 0.101.1 to 0.104.1
Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.101.1 to 0.104.1. - [Release notes](https://github.com/tiangolo/fastapi/releases) - [Commits](https://github.com/tiangolo/fastapi/compare/0.101.1...0.104.1) --- updated-dependencies: - dependency-name: fastapi dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>
2025-09-26 06:56:32 +03:00 · 2023-11-07 13:18:16 +00:00 · 2023-11-07 13:18:14 +00:00 · 2023-11-07 16:13:59 +03:00
8 changed files with 232 additions and 353 deletions
--- a/akarpov/files/services/lema.py
+++ b/akarpov/files/services/lema.py
@ -0,0 +1,37 @@
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from nltk.tokenize import word_tokenize
 from pymorphy3 import MorphAnalyzer
 # Set up stop words
 english_stopwords = set(stopwords.words("english"))
 russian_stopwords = set(stopwords.words("russian"))
 # Set up lemmatizers
 english_lemmatizer = WordNetLemmatizer()
 russian_lemmatizer = MorphAnalyzer()
 def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize ``text`` and strip English and Russian stop words.

    Args:
        text: Raw text (for example a search query) to normalise.
        language: ``"russian"`` selects the pymorphy3 analyzer; any other
            value falls back to the WordNet (English) lemmatizer.

    Returns:
        The surviving lemmas joined by single spaces. The result is an
        empty string when every token is a stop word or ``text`` yields
        no tokens.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        if language == "russian":
            # Use the normal form of the first parse returned by pymorphy3.
            lemma = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemma = english_lemmatizer.lemmatize(token)
        # The stop-word lists contain lowercase entries, while the English
        # lemmatizer keeps the token's original case; compare a lowercased
        # copy so capitalised stop words ("The", "And") are removed too.
        folded = lemma.lower()
        if folded in english_stopwords or folded in russian_stopwords:
            continue
        kept_lemmas.append(lemma)
    # Reconstruct the text from the remaining lemmas
    return " ".join(kept_lemmas)
--- a/akarpov/files/services/search.py
+++ b/akarpov/files/services/search.py
@ -3,13 +3,14 @@
 from typing import BinaryIO
 from django.conf import settings
 from django.contrib.postgres.lookups import Unaccent
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import Q, QuerySet
+from django.db.models import F, Func, Q, QuerySet
 from haystack.query import SearchQuerySet
 from akarpov.files.models import File
 from .lema import lemmatize_and_remove_stopwords
 class BaseSearch:
    def __init__(self, queryset: QuerySet | None = None):
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
                return False
 class UnaccentLower(Func):
    """Database expression rendering ``LOWER(UNACCENT(<expression>))``.

    Used to normalise a text column (accents removed, lowercased) before
    it is compared with a search query.

    NOTE(review): relies on the PostgreSQL ``unaccent`` function being
    available in the database; confirm the extension is installed.
    """
    function = "UNACCENT"
    # The previous as_sql override compiled only the source expression and
    # wrapped it in LOWER(), so UNACCENT was never emitted. Expressing the
    # nesting through Func's template applies both functions and lets the
    # inherited as_sql handle parameters.
    template = "LOWER(%(function)s(%(expressions)s))"
 class SimilaritySearch(BaseSearch):
    def __init__(self, queryset: QuerySet[File] | None = None):
        super().__init__(queryset)
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
        if self.queryset is None:
            raise ValueError("Queryset cannot be None for similarity search")
-        # Perform a similarity search using trigram comparison
+        # Detect language and preprocess the query
-        return (
+        language = "russian" if re.search("[а-яА-Я]", query) else "english"
        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
        # Annotate the queryset with similarity scores for each field
        queryset = (
            self.queryset.annotate(
-                name_unaccent=Unaccent("name"),
+                name_similarity=TrigramSimilarity(
-                description_unaccent=Unaccent("description"),
+                    UnaccentLower("name"), filtered_query
-                content_unaccent=Unaccent("content"),
+                ),
                description_similarity=TrigramSimilarity(
                    UnaccentLower("description"), filtered_query
                ),
                content_similarity=TrigramSimilarity(
                    UnaccentLower("content"), filtered_query
                ),
            )
            .annotate(
-                similarity=TrigramSimilarity("name_unaccent", query)
+                combined_similarity=(
-                + TrigramSimilarity("description_unaccent", query)
+                    F("name_similarity")
-                + TrigramSimilarity("content_unaccent", query)
+                    + F("description_similarity")
                    + F("content_similarity")
                )
-            .filter(similarity__gt=0.1)
+                / 3
            .order_by("-similarity")
            )
            .filter(combined_similarity__gt=0.1)
            .order_by("-combined_similarity")
        )
        print(filtered_query)
        print(queryset.query)
        print(
            queryset.values(
                "name_similarity", "description_similarity", "content_similarity"
            )
        )
        return queryset
--- a/akarpov/files/views.py
+++ b/akarpov/files/views.py
@ -65,7 +65,6 @@ def filter(self, queryset):
                    queryset=File.objects.filter(user=self.request.user)
                )
                queryset = search_instance.search(query)
                print(queryset, query)
        return queryset
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@ -38,6 +38,7 @@ RUN python -m venv /venv
 COPY pyproject.toml poetry.lock /app/
 RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
 RUN python -m nltk.downloader punkt stopwords wordnet
 COPY . .
 RUN poetry build && /venv/bin/pip install dist/*.whl
--- a/config/settings/base.py
+++ b/config/settings/base.py
@ -107,6 +107,7 @@
 ]
 THIRD_PARTY_APPS = [
    "django.contrib.postgres",
    "crispy_forms",
    "crispy_bootstrap5",
    "allauth",
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -107,9 +107,12 @@ pgvector = "^0.2.2"
 pycld2 = "^0.41"
 textract = "^1.6.5"
 uuid6 = "^2023.5.2"
-fastapi = "0.86.0"
+fastapi = "0.104.1"
 django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
 nltk = "^3.8.1"
 pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 [build-system]
--- a/search/pyproject.toml
+++ b/search/pyproject.toml
@ -7,12 +7,12 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.11"
 farm-haystack = "^1.21.2"
 fastapi = "0.99.1"
 pydantic = "1.10.13"
-milvus-haystack = "^0.0.2"
+transformers = {version = "4.34.1", extras = ["torch"]}
-
+torch = ">=2.0.0, !=2.0.1, !=2.1.0"
 farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
 [build-system]
-requires = ["poetry-core"]
+requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"