mirror of
https://github.com/Alexander-D-Karpov/akarpov
synced 2024-11-24 03:13:43 +03:00
Compare commits
3 Commits
c7f769ee72
...
2b02ff2182
Author | SHA1 | Date | |
---|---|---|---|
|
2b02ff2182 | ||
|
463f23d70d | ||
03b0de017c |
37
akarpov/files/services/lema.py
Normal file
37
akarpov/files/services/lema.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import WordNetLemmatizer
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from pymorphy3 import MorphAnalyzer
|
||||||
|
|
||||||
|
# Stop-word vocabularies, built once when the module is imported.
# Sets are used so that membership checks are constant-time.
english_stopwords, russian_stopwords = (
    set(stopwords.words(lang)) for lang in ("english", "russian")
)

# One lemmatizer per supported language, shared by every call below.
english_lemmatizer = WordNetLemmatizer()
russian_lemmatizer = MorphAnalyzer()
|
||||||
|
|
||||||
|
|
||||||
|
def lemmatize_and_remove_stopwords(text, language="english"):
    """Return *text* reduced to lower-case lemmas with stop words removed.

    The text is tokenized, every token is lower-cased, stop words are
    dropped, the remaining tokens are lemmatized and the lemmas are joined
    back together with single spaces.

    Args:
        text: The raw text (for example a search query) to normalise.
        language: ``"russian"`` selects the pymorphy analyzer; any other
            value falls back to the English WordNet lemmatizer.

    Returns:
        A single space-separated string of lemmas. Empty if every token
        was a stop word or *text* contained no tokens.
    """
    kept_lemmas = []
    for raw_token in word_tokenize(text):
        # The stop-word lists are lower-case and the WordNet lookup is
        # case-sensitive, so normalise case before doing anything else.
        token = raw_token.lower()

        # Filter on the surface form first: the noun-default WordNet
        # lemmatizer mangles some stop words ("was" -> "wa"), which would
        # otherwise let them slip past the check on the lemma below.
        if token in english_stopwords or token in russian_stopwords:
            continue

        if language == "russian":
            lemma = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemma = english_lemmatizer.lemmatize(token)

        # An inflected form may lemmatize *into* a stop word; drop it too.
        # Both lists are checked regardless of `language`, as before, so
        # mixed-language input is still cleaned.
        if lemma in english_stopwords or lemma in russian_stopwords:
            continue

        kept_lemmas.append(lemma)

    return " ".join(kept_lemmas)
|
|
@ -3,13 +3,14 @@
|
||||||
from typing import BinaryIO
|
from typing import BinaryIO
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.contrib.postgres.lookups import Unaccent
|
|
||||||
from django.contrib.postgres.search import TrigramSimilarity
|
from django.contrib.postgres.search import TrigramSimilarity
|
||||||
from django.db.models import Q, QuerySet
|
from django.db.models import F, Func, Q, QuerySet
|
||||||
from haystack.query import SearchQuerySet
|
from haystack.query import SearchQuerySet
|
||||||
|
|
||||||
from akarpov.files.models import File
|
from akarpov.files.models import File
|
||||||
|
|
||||||
|
from .lema import lemmatize_and_remove_stopwords
|
||||||
|
|
||||||
|
|
||||||
class BaseSearch:
|
class BaseSearch:
|
||||||
def __init__(self, queryset: QuerySet | None = None):
|
def __init__(self, queryset: QuerySet | None = None):
|
||||||
|
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class UnaccentLower(Func):
    """SQL expression rendering ``LOWER(UNACCENT(<expression>))``.

    Used to normalise a text column (strip diacritics, then lower-case it)
    before it is compared with a search query.

    The previous implementation overrode ``as_sql`` and emitted only
    ``LOWER(<expression>)``, so the declared ``UNACCENT`` function was never
    applied. Relying on ``Func``'s own ``template`` rendering fixes that and
    keeps Django's standard parameter handling.

    NOTE(review): ``UNACCENT`` presumably comes from the PostgreSQL
    ``unaccent`` extension -- confirm it is installed in the target database.
    """

    function = "UNACCENT"
    # `%(function)s` expands to UNACCENT and `%(expressions)s` to the
    # compiled source expression(s); LOWER wraps the result.
    template = "LOWER(%(function)s(%(expressions)s))"
|
||||||
|
|
||||||
|
|
||||||
class SimilaritySearch(BaseSearch):
|
class SimilaritySearch(BaseSearch):
|
||||||
def __init__(self, queryset: QuerySet[File] | None = None):
|
def __init__(self, queryset: QuerySet[File] | None = None):
|
||||||
super().__init__(queryset)
|
super().__init__(queryset)
|
||||||
|
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
|
||||||
if self.queryset is None:
|
if self.queryset is None:
|
||||||
raise ValueError("Queryset cannot be None for similarity search")
|
raise ValueError("Queryset cannot be None for similarity search")
|
||||||
|
|
||||||
# Perform a similarity search using trigram comparison
|
# Detect language and preprocess the query
|
||||||
return (
|
language = "russian" if re.search("[а-яА-Я]", query) else "english"
|
||||||
|
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
|
||||||
|
|
||||||
|
# Annotate the queryset with similarity scores for each field
|
||||||
|
queryset = (
|
||||||
self.queryset.annotate(
|
self.queryset.annotate(
|
||||||
name_unaccent=Unaccent("name"),
|
name_similarity=TrigramSimilarity(
|
||||||
description_unaccent=Unaccent("description"),
|
UnaccentLower("name"), filtered_query
|
||||||
content_unaccent=Unaccent("content"),
|
),
|
||||||
|
description_similarity=TrigramSimilarity(
|
||||||
|
UnaccentLower("description"), filtered_query
|
||||||
|
),
|
||||||
|
content_similarity=TrigramSimilarity(
|
||||||
|
UnaccentLower("content"), filtered_query
|
||||||
|
),
|
||||||
)
|
)
|
||||||
.annotate(
|
.annotate(
|
||||||
similarity=TrigramSimilarity("name_unaccent", query)
|
combined_similarity=(
|
||||||
+ TrigramSimilarity("description_unaccent", query)
|
F("name_similarity")
|
||||||
+ TrigramSimilarity("content_unaccent", query)
|
+ F("description_similarity")
|
||||||
|
+ F("content_similarity")
|
||||||
)
|
)
|
||||||
.filter(similarity__gt=0.1)
|
/ 3
|
||||||
.order_by("-similarity")
|
|
||||||
)
|
)
|
||||||
|
.filter(combined_similarity__gt=0.1)
|
||||||
|
.order_by("-combined_similarity")
|
||||||
|
)
|
||||||
|
print(filtered_query)
|
||||||
|
print(queryset.query)
|
||||||
|
print(
|
||||||
|
queryset.values(
|
||||||
|
"name_similarity", "description_similarity", "content_similarity"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return queryset
|
||||||
|
|
|
@ -65,7 +65,6 @@ def filter(self, queryset):
|
||||||
queryset=File.objects.filter(user=self.request.user)
|
queryset=File.objects.filter(user=self.request.user)
|
||||||
)
|
)
|
||||||
queryset = search_instance.search(query)
|
queryset = search_instance.search(query)
|
||||||
print(queryset, query)
|
|
||||||
return queryset
|
return queryset
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,7 @@ RUN python -m venv /venv
|
||||||
|
|
||||||
COPY pyproject.toml poetry.lock /app/
|
COPY pyproject.toml poetry.lock /app/
|
||||||
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
||||||
|
RUN python -m nltk.downloader punkt stopwords wordnet
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN poetry build && /venv/bin/pip install dist/*.whl
|
RUN poetry build && /venv/bin/pip install dist/*.whl
|
||||||
|
|
|
@ -107,6 +107,7 @@
|
||||||
]
|
]
|
||||||
|
|
||||||
THIRD_PARTY_APPS = [
|
THIRD_PARTY_APPS = [
|
||||||
|
"django.contrib.postgres",
|
||||||
"crispy_forms",
|
"crispy_forms",
|
||||||
"crispy_bootstrap5",
|
"crispy_bootstrap5",
|
||||||
"allauth",
|
"allauth",
|
||||||
|
|
474
poetry.lock
generated
474
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
|
@ -107,9 +107,12 @@ pgvector = "^0.2.2"
|
||||||
pycld2 = "^0.41"
|
pycld2 = "^0.41"
|
||||||
textract = "^1.6.5"
|
textract = "^1.6.5"
|
||||||
uuid6 = "^2023.5.2"
|
uuid6 = "^2023.5.2"
|
||||||
fastapi = "0.86.0"
|
fastapi = "0.104.1"
|
||||||
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
||||||
uvicorn = "^0.24.0.post1"
|
uvicorn = "^0.24.0.post1"
|
||||||
|
nltk = "^3.8.1"
|
||||||
|
pymorphy3 = "^1.2.1"
|
||||||
|
pymorphy3-dicts-ru = "^2.4.417150.4580142"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
@ -7,12 +7,12 @@ readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
farm-haystack = "^1.21.2"
|
|
||||||
fastapi = "0.99.1"
|
fastapi = "0.99.1"
|
||||||
pydantic = "1.10.13"
|
pydantic = "1.10.13"
|
||||||
milvus-haystack = "^0.0.2"
|
transformers = {version = "4.34.1", extras = ["torch"]}
|
||||||
|
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
|
||||||
|
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core>=1.0.0"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user