mirror of
https://github.com/Alexander-D-Karpov/akarpov
synced 2024-11-24 02:03:49 +03:00
Compare commits
2 Commits
3eafeefffe
...
463f23d70d
Author | SHA1 | Date | |
---|---|---|---|
|
463f23d70d | ||
03b0de017c |
37
akarpov/files/services/lema.py
Normal file
37
akarpov/files/services/lema.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from nltk.tokenize import word_tokenize
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
|
||||
# Set up stop words
|
||||
english_stopwords = set(stopwords.words("english"))
|
||||
russian_stopwords = set(stopwords.words("russian"))
|
||||
|
||||
# Set up lemmatizers
|
||||
english_lemmatizer = WordNetLemmatizer()
|
||||
russian_lemmatizer = MorphAnalyzer()
|
||||
|
||||
|
||||
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English and Russian stop words.

    The text is tokenized with ``word_tokenize``; each surviving token is
    lemmatized with the Russian analyzer when ``language == "russian"`` and
    with the English WordNet lemmatizer for any other value.

    Args:
        text: The raw text (for example a search query) to normalise.
        language: ``"russian"`` selects the Russian lemmatizer; every other
            value falls back to English.

    Returns:
        The remaining lemmas joined by single spaces. An empty string is
        returned for empty or ``None`` input.
    """
    if not text:
        return ""

    use_russian = language == "russian"
    kept_lemmas = []

    for token in word_tokenize(text):
        # Check the surface form *before* lemmatizing: the lemmatizer may
        # rewrite a stop word into something the stop-word lists no longer
        # contain (WordNet's default noun rules can strip a trailing "s",
        # e.g. from "was"). The lists are lower-case, so compare lower-cased.
        lowered = token.lower()
        if lowered in english_stopwords or lowered in russian_stopwords:
            continue

        if use_russian:
            lemma = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemma = english_lemmatizer.lemmatize(token)

        # An inflected form can also reduce to a stop word, so check the
        # lemma as well (this is the check the function always performed).
        lowered_lemma = lemma.lower()
        if lowered_lemma in english_stopwords or lowered_lemma in russian_stopwords:
            continue

        kept_lemmas.append(lemma)

    # Reconstruct the text
    return " ".join(kept_lemmas)
|
|
@ -3,13 +3,14 @@
|
|||
from typing import BinaryIO
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.postgres.lookups import Unaccent
|
||||
from django.contrib.postgres.search import TrigramSimilarity
|
||||
from django.db.models import Q, QuerySet
|
||||
from django.db.models import F, Func, Q, QuerySet
|
||||
from haystack.query import SearchQuerySet
|
||||
|
||||
from akarpov.files.models import File
|
||||
|
||||
from .lema import lemmatize_and_remove_stopwords
|
||||
|
||||
|
||||
class BaseSearch:
|
||||
def __init__(self, queryset: QuerySet | None = None):
|
||||
|
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
class UnaccentLower(Func):
    """SQL expression rendering ``LOWER(UNACCENT(<expression>))``.

    The previous implementation overrode ``as_sql`` and wrapped the compiled
    source expression only in ``LOWER(...)``, so the ``UNACCENT`` function
    named in ``function`` was never emitted. Using a ``template`` lets the
    base ``Func.as_sql`` build the SQL and keeps its full signature
    (``function``, ``template``, ``arg_joiner``, ``**extra_context``) intact.
    """

    function = "UNACCENT"
    # %(function)s and %(expressions)s are filled in by Func.as_sql.
    template = "LOWER(%(function)s(%(expressions)s))"
|
||||
|
||||
|
||||
class SimilaritySearch(BaseSearch):
|
||||
def __init__(self, queryset: QuerySet[File] | None = None):
|
||||
super().__init__(queryset)
|
||||
|
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
|
|||
if self.queryset is None:
|
||||
raise ValueError("Queryset cannot be None for similarity search")
|
||||
|
||||
# Perform a similarity search using trigram comparison
|
||||
return (
|
||||
# Detect language and preprocess the query
|
||||
language = "russian" if re.search("[а-яА-Я]", query) else "english"
|
||||
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
|
||||
|
||||
# Annotate the queryset with similarity scores for each field
|
||||
queryset = (
|
||||
self.queryset.annotate(
|
||||
name_unaccent=Unaccent("name"),
|
||||
description_unaccent=Unaccent("description"),
|
||||
content_unaccent=Unaccent("content"),
|
||||
name_similarity=TrigramSimilarity(
|
||||
UnaccentLower("name"), filtered_query
|
||||
),
|
||||
description_similarity=TrigramSimilarity(
|
||||
UnaccentLower("description"), filtered_query
|
||||
),
|
||||
content_similarity=TrigramSimilarity(
|
||||
UnaccentLower("content"), filtered_query
|
||||
),
|
||||
)
|
||||
.annotate(
|
||||
similarity=TrigramSimilarity("name_unaccent", query)
|
||||
+ TrigramSimilarity("description_unaccent", query)
|
||||
+ TrigramSimilarity("content_unaccent", query)
|
||||
combined_similarity=(
|
||||
F("name_similarity")
|
||||
+ F("description_similarity")
|
||||
+ F("content_similarity")
|
||||
)
|
||||
.filter(similarity__gt=0.1)
|
||||
.order_by("-similarity")
|
||||
/ 3
|
||||
)
|
||||
.filter(combined_similarity__gt=0.1)
|
||||
.order_by("-combined_similarity")
|
||||
)
|
||||
print(filtered_query)
|
||||
print(queryset.query)
|
||||
print(
|
||||
queryset.values(
|
||||
"name_similarity", "description_similarity", "content_similarity"
|
||||
)
|
||||
)
|
||||
|
||||
return queryset
|
||||
|
|
|
@ -65,7 +65,6 @@ def filter(self, queryset):
|
|||
queryset=File.objects.filter(user=self.request.user)
|
||||
)
|
||||
queryset = search_instance.search(query)
|
||||
print(queryset, query)
|
||||
return queryset
|
||||
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ RUN python -m venv /venv
|
|||
|
||||
COPY pyproject.toml poetry.lock /app/
|
||||
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
||||
RUN python -m nltk.downloader punkt stopwords wordnet
|
||||
|
||||
COPY . .
|
||||
RUN poetry build && /venv/bin/pip install dist/*.whl
|
||||
|
|
|
@ -107,6 +107,7 @@
|
|||
]
|
||||
|
||||
THIRD_PARTY_APPS = [
|
||||
"django.contrib.postgres",
|
||||
"crispy_forms",
|
||||
"crispy_bootstrap5",
|
||||
"allauth",
|
||||
|
|
474
poetry.lock
generated
474
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
|
@ -107,9 +107,12 @@ pgvector = "^0.2.2"
|
|||
pycld2 = "^0.41"
|
||||
textract = "^1.6.5"
|
||||
uuid6 = "^2023.5.2"
|
||||
fastapi = "0.86.0"
|
||||
fastapi = "0.104.1"
|
||||
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
||||
uvicorn = "^0.24.0.post1"
|
||||
nltk = "^3.8.1"
|
||||
pymorphy3 = "^1.2.1"
|
||||
pymorphy3-dicts-ru = "^2.4.417150.4580142"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
|
|
@ -7,12 +7,12 @@ readme = "README.md"
|
|||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
farm-haystack = "^1.21.2"
|
||||
fastapi = "0.99.1"
|
||||
pydantic = "1.10.13"
|
||||
milvus-haystack = "^0.0.2"
|
||||
|
||||
transformers = {version = "4.34.1", extras = ["torch"]}
|
||||
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
|
||||
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
|
Loading…
Reference in New Issue
Block a user