added a better search

This commit is contained in:
Alexander Karpov 2023-11-07 16:13:59 +03:00
parent b9715981e7
commit 03b0de017c
8 changed files with 196 additions and 348 deletions

View File

@ -0,0 +1,37 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
# English resources: NLTK stop-word list and the WordNet lemmatizer.
english_stopwords = set(stopwords.words("english"))
english_lemmatizer = WordNetLemmatizer()

# Russian resources: NLTK stop-word list and the pymorphy3 analyzer.
russian_stopwords = set(stopwords.words("russian"))
russian_lemmatizer = MorphAnalyzer()
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English and Russian stop words.

    The text is split with ``word_tokenize``; every token is lower-cased,
    reduced to its lemma, and discarded if the lemma appears in either
    stop-word set.

    Args:
        text: The raw text to normalize.
        language: ``"russian"`` selects the pymorphy3 analyzer; any other
            value falls back to the WordNet (English) lemmatizer.

    Returns:
        The surviving lemmas joined by single spaces, or an empty string
        when no token survives.
    """
    use_russian = language == "russian"

    lemmas = []
    for token in word_tokenize(text):
        # The NLTK stop-word lists are lower-case, so a capitalized token
        # ("The") would otherwise slip through the filter below and reach
        # the lemmatizer in its original case.
        token = token.lower()
        if use_russian:
            lemma = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemma = english_lemmatizer.lemmatize(token)
        lemmas.append(lemma)

    # Both stop-word sets are checked regardless of the selected language,
    # so mixed-language input is cleaned as well.
    return " ".join(
        lemma
        for lemma in lemmas
        if lemma not in english_stopwords and lemma not in russian_stopwords
    )

View File

@ -3,13 +3,14 @@
from typing import BinaryIO
from django.conf import settings
from django.contrib.postgres.lookups import Unaccent
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Q, QuerySet
from django.db.models import F, Func, Q, QuerySet
from haystack.query import SearchQuerySet
from akarpov.files.models import File
from .lema import lemmatize_and_remove_stopwords
class BaseSearch:
def __init__(self, queryset: QuerySet | None = None):
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
return False
class UnaccentLower(Func):
    """SQL expression that renders ``LOWER(UNACCENT(<expression>))``.

    Used to normalize a text column (accents stripped, lower-cased) before it
    is compared against a search query.

    NOTE(review): ``UNACCENT`` is provided by the PostgreSQL ``unaccent``
    extension - confirm it is installed in the target database.
    """

    function = "UNACCENT"
    # Let Func.as_sql() do the rendering: it substitutes ``function`` and the
    # compiled source expressions into this template. A hand-written as_sql()
    # that wraps only the compiled source expression in LOWER() would never
    # emit the UNACCENT call, so accents would not be stripped.
    template = "LOWER(%(function)s(%(expressions)s))"
class SimilaritySearch(BaseSearch):
    """Trigram-similarity search over the name, description and content of files."""

    def __init__(self, queryset: QuerySet[File] | None = None):
        super().__init__(queryset)

    def search(self, query: str) -> QuerySet[File]:
        """Return files ranked by average trigram similarity to *query*.

        The query is lemmatized and stripped of stop words, then compared
        against the normalized ``name``, ``description`` and ``content``
        columns. Rows whose average similarity is not above 0.1 are dropped.

        Args:
            query: The raw user search string.

        Returns:
            The queryset annotated with ``name_similarity``,
            ``description_similarity``, ``content_similarity`` and
            ``combined_similarity``, ordered by descending
            ``combined_similarity``.

        Raises:
            ValueError: If the instance was created without a queryset.
        """
        if self.queryset is None:
            raise ValueError("Queryset cannot be None for similarity search")

        # Detect language and preprocess the query. "ё"/"Ё" lie outside the
        # contiguous а-я / А-Я ranges, so they are listed explicitly.
        language = "russian" if re.search(r"[а-яА-ЯёЁ]", query) else "english"
        # A query made only of stop words lemmatizes to an empty string, which
        # would match nothing; fall back to the raw query in that case.
        filtered_query = (
            lemmatize_and_remove_stopwords(query, language=language) or query
        )

        # Annotate the queryset with similarity scores for each field, then
        # rank by their average. No debug printing here: printing
        # queryset.values(...) would evaluate the queryset and run an extra
        # database query on every search.
        return (
            self.queryset.annotate(
                name_similarity=TrigramSimilarity(
                    UnaccentLower("name"), filtered_query
                ),
                description_similarity=TrigramSimilarity(
                    UnaccentLower("description"), filtered_query
                ),
                content_similarity=TrigramSimilarity(
                    UnaccentLower("content"), filtered_query
                ),
            )
            .annotate(
                combined_similarity=(
                    F("name_similarity")
                    + F("description_similarity")
                    + F("content_similarity")
                )
                / 3
            )
            .filter(combined_similarity__gt=0.1)
            .order_by("-combined_similarity")
        )

View File

@ -65,7 +65,6 @@ def filter(self, queryset):
queryset=File.objects.filter(user=self.request.user)
)
queryset = search_instance.search(query)
print(queryset, query)
return queryset

View File

@ -38,6 +38,7 @@ RUN python -m venv /venv
COPY pyproject.toml poetry.lock /app/
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
RUN python -m nltk.downloader punkt stopwords wordnet
COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl

View File

@ -107,6 +107,7 @@
]
THIRD_PARTY_APPS = [
"django.contrib.postgres",
"crispy_forms",
"crispy_bootstrap5",
"allauth",

435
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -110,6 +110,9 @@ uuid6 = "^2023.5.2"
fastapi = "0.86.0"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
uvicorn = "^0.24.0.post1"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
[build-system]

View File

@ -7,12 +7,12 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
farm-haystack = "^1.21.2"
fastapi = "0.99.1"
pydantic = "1.10.13"
milvus-haystack = "^0.0.2"
transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
[build-system]
requires = ["poetry-core"]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"