added a better search

2025-04-15 18:21:58 +03:00 · 2023-11-07 16:13:59 +03:00 · 2023-11-07 16:13:59 +03:00 · 03b0de017c
commit 03b0de017c
parent b9715981e7
8 changed files with 196 additions and 348 deletions
--- a/akarpov/files/services/lema.py
+++ b/akarpov/files/services/lema.py
@ -0,0 +1,37 @@
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from pymorphy3 import MorphAnalyzer
+
+# Set up stop words
+# Built once at import time from the NLTK "stopwords" corpus (one of the
+# resources fetched with nltk.downloader in the local Django Dockerfile).
+english_stopwords = set(stopwords.words("english"))
+russian_stopwords = set(stopwords.words("russian"))
+
+# Set up lemmatizers
+# Shared module-level instances reused by every call below: WordNet for
+# English text, pymorphy3 for Russian text.
+english_lemmatizer = WordNetLemmatizer()
+russian_lemmatizer = MorphAnalyzer()
+
+
+def lemmatize_and_remove_stopwords(text, language="english"):
+    """Tokenize ``text``, lemmatize every token and drop stop words.
+
+    Args:
+        text: Raw text to normalise.
+        language: ``"russian"`` lemmatizes with pymorphy3; any other value
+            falls back to the English WordNet lemmatizer.
+
+    Returns:
+        The remaining lower-cased lemmas joined by single spaces. The result
+        is an empty string when nothing survives the stop-word filter.
+    """
+    if language == "russian":
+
+        def lemmatize(word):
+            return russian_lemmatizer.parse(word)[0].normal_form
+
+    else:  # Default to English
+        lemmatize = english_lemmatizer.lemmatize
+
+    # Lower-case before lemmatizing and filtering: the stop-word sets hold
+    # lower-case entries and WordNetLemmatizer keeps the input casing, so a
+    # capitalised stop word such as "The" would otherwise slip through.
+    lemmas = (lemmatize(token.lower()) for token in word_tokenize(text))
+
+    # Stop words of both languages are removed regardless of ``language``.
+    return " ".join(
+        lemma
+        for lemma in lemmas
+        if lemma not in english_stopwords and lemma not in russian_stopwords
+    )
--- a/akarpov/files/services/search.py
+++ b/akarpov/files/services/search.py
@ -3,13 +3,14 @@
 from typing import BinaryIO

 from django.conf import settings
-from django.contrib.postgres.lookups import Unaccent
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import Q, QuerySet
+from django.db.models import F, Func, Q, QuerySet
 from haystack.query import SearchQuerySet

 from akarpov.files.models import File

+from .lema import lemmatize_and_remove_stopwords
+

 class BaseSearch:
    def __init__(self, queryset: QuerySet | None = None):
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
                return False


+class UnaccentLower(Func):
+    """SQL expression that renders ``LOWER(UNACCENT(<expression>))``.
+
+    Both calls are nested in ``template`` so that ``Func``'s stock ``as_sql``
+    emits them. Compiling only the source expression and wrapping it in
+    ``LOWER`` would leave ``UNACCENT`` out of the generated SQL.
+
+    ``UNACCENT`` comes from PostgreSQL's ``unaccent`` extension, which has to
+    be installed in the target database.
+    """
+
+    function = "UNACCENT"
+    template = "LOWER(%(function)s(%(expressions)s))"
+
+
 class SimilaritySearch(BaseSearch):
    def __init__(self, queryset: QuerySet[File] | None = None):
        super().__init__(queryset)
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
        if self.queryset is None:
            raise ValueError("Queryset cannot be None for similarity search")

-        # Perform a similarity search using trigram comparison
-        return (
+        # Detect language and preprocess the query: any Cyrillic letter in
+        # the query selects the Russian lemmatizer, otherwise English is used.
+        language = "russian" if re.search("[а-яА-Я]", query) else "english"
+        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
+
+        # Annotate the queryset with similarity scores for each field
+        # NOTE(review): if description/content can be NULL, the sum below is
+        # NULL and the row is filtered out - confirm against the File model.
+        queryset = (
            self.queryset.annotate(
-                name_unaccent=Unaccent("name"),
-                description_unaccent=Unaccent("description"),
-                content_unaccent=Unaccent("content"),
+                name_similarity=TrigramSimilarity(
+                    UnaccentLower("name"), filtered_query
+                ),
+                description_similarity=TrigramSimilarity(
+                    UnaccentLower("description"), filtered_query
+                ),
+                content_similarity=TrigramSimilarity(
+                    UnaccentLower("content"), filtered_query
+                ),
            )
            .annotate(
-                similarity=TrigramSimilarity("name_unaccent", query)
-                + TrigramSimilarity("description_unaccent", query)
-                + TrigramSimilarity("content_unaccent", query)
+                # Mean of the three per-field scores.
+                combined_similarity=(
+                    F("name_similarity")
+                    + F("description_similarity")
+                    + F("content_similarity")
+                )
+                / 3
            )
-            .filter(similarity__gt=0.1)
-            .order_by("-similarity")
+            # Keep rows scoring above 0.1, best matches first.
+            .filter(combined_similarity__gt=0.1)
+            .order_by("-combined_similarity")
        )
+
+        # The queryset is returned unevaluated; no debug printing here, since
+        # printing a queryset executes it and costs an extra database query.
+        return queryset
--- a/akarpov/files/views.py
+++ b/akarpov/files/views.py
@ -65,7 +65,6 @@ def filter(self, queryset):
                    queryset=File.objects.filter(user=self.request.user)
                )
                queryset = search_instance.search(query)
-                print(queryset, query)
        return queryset


--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@ -38,6 +38,7 @@ RUN python -m venv /venv

 COPY pyproject.toml poetry.lock /app/
 RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
+RUN python -m nltk.downloader punkt stopwords wordnet

 COPY . .
 RUN poetry build && /venv/bin/pip install dist/*.whl
--- a/config/settings/base.py
+++ b/config/settings/base.py
@ -107,6 +107,7 @@
 ]

 THIRD_PARTY_APPS = [
+    "django.contrib.postgres",
    "crispy_forms",
    "crispy_bootstrap5",
    "allauth",
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -110,6 +110,9 @@ uuid6 = "^2023.5.2"
 fastapi = "0.86.0"
 django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
+nltk = "^3.8.1"
+pymorphy3 = "^1.2.1"
+pymorphy3-dicts-ru = "^2.4.417150.4580142"


 [build-system]
--- a/search/pyproject.toml
+++ b/search/pyproject.toml
@ -7,12 +7,12 @@ readme = "README.md"

 [tool.poetry.dependencies]
 python = "^3.11"
-farm-haystack = "^1.21.2"
 fastapi = "0.99.1"
 pydantic = "1.10.13"
-milvus-haystack = "^0.0.2"
-
+transformers = {version = "4.34.1", extras = ["torch"]}
+torch = ">=2.0.0, !=2.0.1, !=2.1.0"
+farm-haystack = {extras = ["faiss"], version = "^1.21.2"}

 [build-system]
-requires = ["poetry-core"]
+requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"