Compare commits

...

3 Commits

Author SHA1 Message Date
dependabot[bot]
f229523a3e
Merge 444e36bac7 into 03b0de017c 2023-11-07 13:18:12 +00:00
dependabot[bot]
444e36bac7
Bump pytest-factoryboy from 2.3.1 to 2.6.0
Bumps [pytest-factoryboy](https://github.com/pytest-dev/pytest-factoryboy) from 2.3.1 to 2.6.0.
- [Changelog](https://github.com/pytest-dev/pytest-factoryboy/blob/master/CHANGES.rst)
- [Commits](https://github.com/pytest-dev/pytest-factoryboy/compare/2.3.1...2.6.0)

---
updated-dependencies:
- dependency-name: pytest-factoryboy
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-11-07 13:18:09 +00:00
03b0de017c added a better search 2023-11-07 16:13:59 +03:00
8 changed files with 225 additions and 377 deletions

View File

@ -0,0 +1,37 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
# Stop-word sets, built once at import time from the NLTK "stopwords" corpus.
# Stored as sets so the membership tests done per token are O(1).
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))
# Shared lemmatizer instances, created once at import time and reused by
# every call: WordNet for English tokens, pymorphy3 for Russian tokens.
english_lemmatizer = WordNetLemmatizer()
russian_lemmatizer = MorphAnalyzer()
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English and Russian stop words.

    The text is split with NLTK's ``word_tokenize``, each token is reduced
    to its lemma, and every lemma that appears in either stop-word set is
    discarded. Both stop-word sets are always consulted, whatever
    ``language`` is, so mixed-language input is filtered consistently.

    Args:
        text: The raw string to normalise.
        language: ``"russian"`` selects the pymorphy3 analyzer (first parse,
            ``normal_form``); any other value falls back to the English
            WordNet lemmatizer.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives.
    """
    # Choose the lemmatizer once instead of re-testing `language` per token.
    if language == "russian":

        def to_lemma(token):
            return russian_lemmatizer.parse(token)[0].normal_form

    else:  # Default to English
        to_lemma = english_lemmatizer.lemmatize

    kept = []
    for token in word_tokenize(text):
        lemma = to_lemma(token)
        # The stop-word lists are lowercase while the WordNet lemmatizer
        # preserves case, so compare a lower-cased copy; otherwise
        # capitalised stop words such as "The" would slip through.
        folded = lemma.lower()
        if folded in english_stopwords or folded in russian_stopwords:
            continue
        kept.append(lemma)

    return " ".join(kept)

View File

@ -3,13 +3,14 @@
from typing import BinaryIO
from django.conf import settings
from django.contrib.postgres.lookups import Unaccent
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Q, QuerySet
from django.db.models import F, Func, Q, QuerySet
from haystack.query import SearchQuerySet
from akarpov.files.models import File
from .lema import lemmatize_and_remove_stopwords
class BaseSearch:
def __init__(self, queryset: QuerySet | None = None):
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
return False
class UnaccentLower(Func):
    """SQL expression that renders ``LOWER(UNACCENT(<expression>))``.

    Used to normalise a text column (strip diacritics, then lower-case it)
    before it is compared with a search query.

    The previous hand-written ``as_sql`` compiled the source expression and
    wrapped it in ``LOWER(...)`` only, so the declared ``UNACCENT`` function
    was never emitted. Rendering through ``template`` lets ``Func.as_sql``
    build the full call and carry the query parameters through unchanged.

    The database must provide an ``UNACCENT`` function (on PostgreSQL, the
    ``unaccent`` extension).
    """

    function = "UNACCENT"
    # %(function)s expands to UNACCENT and %(expressions)s to the compiled
    # source expression; LOWER is applied to the unaccented result.
    template = "LOWER(%(function)s(%(expressions)s))"
class SimilaritySearch(BaseSearch):
def __init__(self, queryset: QuerySet[File] | None = None):
super().__init__(queryset)
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search")
# Perform a similarity search using trigram comparison
return (
# Detect language and preprocess the query
language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
# Annotate the queryset with similarity scores for each field
queryset = (
self.queryset.annotate(
name_unaccent=Unaccent("name"),
description_unaccent=Unaccent("description"),
content_unaccent=Unaccent("content"),
name_similarity=TrigramSimilarity(
UnaccentLower("name"), filtered_query
),
description_similarity=TrigramSimilarity(
UnaccentLower("description"), filtered_query
),
content_similarity=TrigramSimilarity(
UnaccentLower("content"), filtered_query
),
)
.annotate(
similarity=TrigramSimilarity("name_unaccent", query)
+ TrigramSimilarity("description_unaccent", query)
+ TrigramSimilarity("content_unaccent", query)
combined_similarity=(
F("name_similarity")
+ F("description_similarity")
+ F("content_similarity")
)
.filter(similarity__gt=0.1)
.order_by("-similarity")
/ 3
)
.filter(combined_similarity__gt=0.1)
.order_by("-combined_similarity")
)
print(filtered_query)
print(queryset.query)
print(
queryset.values(
"name_similarity", "description_similarity", "content_similarity"
)
)
return queryset

View File

@ -65,7 +65,6 @@ def filter(self, queryset):
queryset=File.objects.filter(user=self.request.user)
)
queryset = search_instance.search(query)
print(queryset, query)
return queryset

View File

@ -38,6 +38,7 @@ RUN python -m venv /venv
COPY pyproject.toml poetry.lock /app/
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
RUN python -m nltk.downloader punkt stopwords wordnet
COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl

View File

@ -107,6 +107,7 @@
]
THIRD_PARTY_APPS = [
"django.contrib.postgres",
"crispy_forms",
"crispy_bootstrap5",
"allauth",

491
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -98,7 +98,7 @@ requests = ">=2.25"
spacy = {extras = ["lookups"], version = "^3.6.1"}
spacy-transformers = "^1.2.5"
extract-msg = "0.28.7"
pytest-factoryboy = "2.3.1"
pytest-factoryboy = "2.6.0"
pytest-xdist = "^3.3.1"
pytest-mock = "^3.11.1"
pytest-asyncio = "^0.21.1"
@ -110,6 +110,9 @@ uuid6 = "^2023.5.2"
fastapi = "0.86.0"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
uvicorn = "^0.24.0.post1"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
[build-system]

View File

@ -7,12 +7,12 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.11"
farm-haystack = "^1.21.2"
fastapi = "0.99.1"
pydantic = "1.10.13"
milvus-haystack = "^0.0.2"
transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
[build-system]
requires = ["poetry-core"]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"