Compare commits

...

2 Commits

Author SHA1 Message Date
dependabot[bot]
463f23d70d
Bump fastapi from 0.101.1 to 0.104.1
Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.101.1 to 0.104.1.
- [Release notes](https://github.com/tiangolo/fastapi/releases)
- [Commits](https://github.com/tiangolo/fastapi/compare/0.101.1...0.104.1)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-11-07 13:18:14 +00:00
03b0de017c added a better search 2023-11-07 16:13:59 +03:00
8 changed files with 232 additions and 353 deletions

View File

@@ -0,0 +1,37 @@
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymorphy3 import MorphAnalyzer
# Stop-word lists for both supported languages (NLTK corpora; requires the
# "stopwords" corpus to be downloaded). NOTE(review): the lists are lowercase.
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))
# Lemmatizers: WordNet for English (requires the "wordnet" corpus),
# pymorphy3 morphological analyzer for Russian.
english_lemmatizer = WordNetLemmatizer()
russian_lemmatizer = MorphAnalyzer()
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop stop words; return the cleaned text.

    Args:
        text: Raw input string to normalize.
        language: "russian" selects the pymorphy3 lemmatizer; anything else
            falls back to the English WordNet lemmatizer.

    Returns:
        A single space-joined string of lemmatized, non-stop-word tokens.
    """
    # Tokenize the text (word_tokenize needs the NLTK "punkt" model).
    tokens = word_tokenize(text)
    # Lemmatize each token based on the specified language.
    lemmatized_tokens = []
    for token in tokens:
        if language == "russian":
            # parse() returns candidate analyses; take the most probable one.
            lemmatized_token = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemmatized_token = english_lemmatizer.lemmatize(token)
        lemmatized_tokens.append(lemmatized_token)
    # Remove stop words. Compare case-folded: the NLTK stop-word lists are
    # lowercase, and WordNetLemmatizer preserves the token's original case,
    # so a case-sensitive test would let "The"/"And" etc. slip through.
    filtered_tokens = [
        token
        for token in lemmatized_tokens
        if (folded := token.casefold()) not in english_stopwords
        and folded not in russian_stopwords
    ]
    # Reconstruct the text.
    return " ".join(filtered_tokens)

View File

@@ -3,13 +3,14 @@
from typing import BinaryIO from typing import BinaryIO
from django.conf import settings from django.conf import settings
from django.contrib.postgres.lookups import Unaccent
from django.contrib.postgres.search import TrigramSimilarity from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Q, QuerySet from django.db.models import F, Func, Q, QuerySet
from haystack.query import SearchQuerySet from haystack.query import SearchQuerySet
from akarpov.files.models import File from akarpov.files.models import File
from .lema import lemmatize_and_remove_stopwords
class BaseSearch: class BaseSearch:
def __init__(self, queryset: QuerySet | None = None): def __init__(self, queryset: QuerySet | None = None):
@@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
return False return False
class UnaccentLower(Func):
function = "UNACCENT"
def as_sql(self, compiler, connection):
unaccented_sql, unaccented_params = compiler.compile(
self.get_source_expressions()[0]
)
lower_unaccented_sql = f"LOWER({unaccented_sql})"
return lower_unaccented_sql, unaccented_params
class SimilaritySearch(BaseSearch): class SimilaritySearch(BaseSearch):
def __init__(self, queryset: QuerySet[File] | None = None): def __init__(self, queryset: QuerySet[File] | None = None):
super().__init__(queryset) super().__init__(queryset)
@@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
if self.queryset is None: if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search") raise ValueError("Queryset cannot be None for similarity search")
# Perform a similarity search using trigram comparison # Detect language and preprocess the query
return ( language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
# Annotate the queryset with similarity scores for each field
queryset = (
self.queryset.annotate( self.queryset.annotate(
name_unaccent=Unaccent("name"), name_similarity=TrigramSimilarity(
description_unaccent=Unaccent("description"), UnaccentLower("name"), filtered_query
content_unaccent=Unaccent("content"), ),
description_similarity=TrigramSimilarity(
UnaccentLower("description"), filtered_query
),
content_similarity=TrigramSimilarity(
UnaccentLower("content"), filtered_query
),
) )
.annotate( .annotate(
similarity=TrigramSimilarity("name_unaccent", query) combined_similarity=(
+ TrigramSimilarity("description_unaccent", query) F("name_similarity")
+ TrigramSimilarity("content_unaccent", query) + F("description_similarity")
+ F("content_similarity")
) )
.filter(similarity__gt=0.1) / 3
.order_by("-similarity")
) )
.filter(combined_similarity__gt=0.1)
.order_by("-combined_similarity")
)
print(filtered_query)
print(queryset.query)
print(
queryset.values(
"name_similarity", "description_similarity", "content_similarity"
)
)
return queryset

View File

@@ -65,7 +65,6 @@ def filter(self, queryset):
queryset=File.objects.filter(user=self.request.user) queryset=File.objects.filter(user=self.request.user)
) )
queryset = search_instance.search(query) queryset = search_instance.search(query)
print(queryset, query)
return queryset return queryset

View File

@@ -38,6 +38,7 @@ RUN python -m venv /venv
COPY pyproject.toml poetry.lock /app/ COPY pyproject.toml poetry.lock /app/
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
RUN python -m nltk.downloader punkt stopwords wordnet
COPY . . COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl RUN poetry build && /venv/bin/pip install dist/*.whl

View File

@@ -107,6 +107,7 @@
] ]
THIRD_PARTY_APPS = [ THIRD_PARTY_APPS = [
"django.contrib.postgres",
"crispy_forms", "crispy_forms",
"crispy_bootstrap5", "crispy_bootstrap5",
"allauth", "allauth",

474
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -107,9 +107,12 @@ pgvector = "^0.2.2"
pycld2 = "^0.41" pycld2 = "^0.41"
textract = "^1.6.5" textract = "^1.6.5"
uuid6 = "^2023.5.2" uuid6 = "^2023.5.2"
fastapi = "0.86.0" fastapi = "0.104.1"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"} django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
uvicorn = "^0.24.0.post1" uvicorn = "^0.24.0.post1"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
[build-system] [build-system]

View File

@@ -7,12 +7,12 @@ readme = "README.md"
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.11" python = "^3.11"
farm-haystack = "^1.21.2"
fastapi = "0.99.1" fastapi = "0.99.1"
pydantic = "1.10.13" pydantic = "1.10.13"
milvus-haystack = "^0.0.2" transformers = {version = "4.34.1", extras = ["torch"]}
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"