mirror of
https://github.com/Alexander-D-Karpov/akarpov
synced 2024-11-24 02:03:49 +03:00
Compare commits
3 Commits
c7f769ee72
...
2b02ff2182
Author | SHA1 | Date | |
---|---|---|---|
|
2b02ff2182 | ||
|
463f23d70d | ||
03b0de017c |
37
akarpov/files/services/lema.py
Normal file
37
akarpov/files/services/lema.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
from nltk.corpus import stopwords
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from nltk.tokenize import word_tokenize
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
|
||||
# Set up stop words
|
||||
english_stopwords = set(stopwords.words("english"))
|
||||
russian_stopwords = set(stopwords.words("russian"))
|
||||
|
||||
# Set up lemmatizers
|
||||
english_lemmatizer = WordNetLemmatizer()
|
||||
russian_lemmatizer = MorphAnalyzer()
|
||||
|
||||
|
||||
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English and Russian stop words.

    The text is tokenized with NLTK's ``word_tokenize``; every token is
    lowercased, reduced to its lemma, and discarded if the lemma is a stop
    word in either language. The surviving lemmas are joined with single
    spaces.

    Args:
        text: The raw text (for example a search query) to normalize.
        language: ``"russian"`` lemmatizes with pymorphy3; any other value
            falls back to the English WordNet lemmatizer.

    Returns:
        The space-joined, lowercased lemmas that are not stop words. May be
        an empty string when every token is filtered out.
    """
    lemmas = []
    for raw_token in word_tokenize(text):
        # Lowercase first: the NLTK stop-word lists are lowercase, so without
        # this "The" or "And" would slip through the filter below, and the
        # English branch would return capitalised words unlemmatized while
        # the Russian branch yields lowercase normal forms.
        token = raw_token.lower()
        if language == "russian":
            lemma = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemma = english_lemmatizer.lemmatize(token)
        lemmas.append(lemma)

    # Both stop-word sets are checked regardless of `language`, so a query
    # mixing the two languages is still cleaned up.
    kept = [
        lemma
        for lemma in lemmas
        if lemma not in english_stopwords and lemma not in russian_stopwords
    ]

    return " ".join(kept)
|
|
@ -3,13 +3,14 @@
|
|||
from typing import BinaryIO
|
||||
|
||||
from django.conf import settings
|
||||
from django.contrib.postgres.lookups import Unaccent
|
||||
from django.contrib.postgres.search import TrigramSimilarity
|
||||
from django.db.models import Q, QuerySet
|
||||
from django.db.models import F, Func, Q, QuerySet
|
||||
from haystack.query import SearchQuerySet
|
||||
|
||||
from akarpov.files.models import File
|
||||
|
||||
from .lema import lemmatize_and_remove_stopwords
|
||||
|
||||
|
||||
class BaseSearch:
|
||||
def __init__(self, queryset: QuerySet | None = None):
|
||||
|
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
class UnaccentLower(Func):
    """SQL expression that renders ``LOWER(UNACCENT(<expression>))``.

    Intended for accent- and case-insensitive comparisons, e.g. as the
    left-hand side of ``TrigramSimilarity``. Relies on the database providing
    an ``UNACCENT`` function (on PostgreSQL, the ``unaccent`` extension).
    """

    function = "UNACCENT"

    def as_sql(self, compiler, connection, **extra_context):
        """Compile to ``LOWER(UNACCENT(...))`` and return ``(sql, params)``."""
        # Delegate to Func.as_sql so that `function` is actually emitted;
        # compiling only the bare source expression would silently produce
        # LOWER(<expression>) and skip the unaccent step altogether.
        unaccented_sql, unaccented_params = super().as_sql(
            compiler, connection, **extra_context
        )
        return f"LOWER({unaccented_sql})", unaccented_params
|
||||
|
||||
|
||||
class SimilaritySearch(BaseSearch):
|
||||
def __init__(self, queryset: QuerySet[File] | None = None):
|
||||
super().__init__(queryset)
|
||||
|
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
|
|||
if self.queryset is None:
|
||||
raise ValueError("Queryset cannot be None for similarity search")
|
||||
|
||||
# Perform a similarity search using trigram comparison
|
||||
return (
|
||||
# Detect language and preprocess the query
|
||||
language = "russian" if re.search("[а-яА-Я]", query) else "english"
|
||||
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
|
||||
|
||||
# Annotate the queryset with similarity scores for each field
|
||||
queryset = (
|
||||
self.queryset.annotate(
|
||||
name_unaccent=Unaccent("name"),
|
||||
description_unaccent=Unaccent("description"),
|
||||
content_unaccent=Unaccent("content"),
|
||||
name_similarity=TrigramSimilarity(
|
||||
UnaccentLower("name"), filtered_query
|
||||
),
|
||||
description_similarity=TrigramSimilarity(
|
||||
UnaccentLower("description"), filtered_query
|
||||
),
|
||||
content_similarity=TrigramSimilarity(
|
||||
UnaccentLower("content"), filtered_query
|
||||
),
|
||||
)
|
||||
.annotate(
|
||||
similarity=TrigramSimilarity("name_unaccent", query)
|
||||
+ TrigramSimilarity("description_unaccent", query)
|
||||
+ TrigramSimilarity("content_unaccent", query)
|
||||
combined_similarity=(
|
||||
F("name_similarity")
|
||||
+ F("description_similarity")
|
||||
+ F("content_similarity")
|
||||
)
|
||||
/ 3
|
||||
)
|
||||
.filter(similarity__gt=0.1)
|
||||
.order_by("-similarity")
|
||||
.filter(combined_similarity__gt=0.1)
|
||||
.order_by("-combined_similarity")
|
||||
)
|
||||
print(filtered_query)
|
||||
print(queryset.query)
|
||||
print(
|
||||
queryset.values(
|
||||
"name_similarity", "description_similarity", "content_similarity"
|
||||
)
|
||||
)
|
||||
|
||||
return queryset
|
||||
|
|
|
@ -65,7 +65,6 @@ def filter(self, queryset):
|
|||
queryset=File.objects.filter(user=self.request.user)
|
||||
)
|
||||
queryset = search_instance.search(query)
|
||||
print(queryset, query)
|
||||
return queryset
|
||||
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ RUN python -m venv /venv
|
|||
|
||||
COPY pyproject.toml poetry.lock /app/
|
||||
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
||||
RUN python -m nltk.downloader punkt stopwords wordnet
|
||||
|
||||
COPY . .
|
||||
RUN poetry build && /venv/bin/pip install dist/*.whl
|
||||
|
|
|
@ -107,6 +107,7 @@
|
|||
]
|
||||
|
||||
THIRD_PARTY_APPS = [
|
||||
"django.contrib.postgres",
|
||||
"crispy_forms",
|
||||
"crispy_bootstrap5",
|
||||
"allauth",
|
||||
|
|
90
poetry.lock
generated
90
poetry.lock
generated
|
@ -1523,6 +1523,17 @@ twisted = {version = ">=22.4", extras = ["tls"]}
|
|||
[package.extras]
|
||||
tests = ["django", "hypothesis", "pytest", "pytest-asyncio"]
|
||||
|
||||
[[package]]
|
||||
name = "dawg-python"
|
||||
version = "0.7.2"
|
||||
description = "Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "DAWG-Python-0.7.2.tar.gz", hash = "sha256:4a5e3286e6261cca02f205cfd5516a7ab10190fa30c51c28d345808f595e3421"},
|
||||
{file = "DAWG_Python-0.7.2-py2.py3-none-any.whl", hash = "sha256:4941d5df081b8d6fcb4597e073a9f60d5c1ccc9d17cd733e8744d7ecfec94ef3"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "decorator"
|
||||
version = "5.1.1"
|
||||
|
@ -2151,6 +2162,17 @@ compatible-mypy = ["mypy (>=0.991,<0.1000)"]
|
|||
coreapi = ["coreapi (>=2.0.0)"]
|
||||
markdown = ["types-Markdown (>=0.1.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "docopt-ng"
|
||||
version = "0.9.0"
|
||||
description = "Jazzband-maintained fork of docopt, the humane command line arguments parser."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "docopt_ng-0.9.0-py3-none-any.whl", hash = "sha256:bfe4c8b03f9fca424c24ee0b4ffa84bf7391cb18c29ce0f6a8227a3b01b81ff9"},
|
||||
{file = "docopt_ng-0.9.0.tar.gz", hash = "sha256:91c6da10b5bb6f2e9e25345829fb8278c78af019f6fc40887ad49b060483b1d7"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docutils"
|
||||
version = "0.20.1"
|
||||
|
@ -3092,6 +3114,17 @@ MarkupSafe = ">=2.0"
|
|||
[package.extras]
|
||||
i18n = ["Babel (>=2.7)"]
|
||||
|
||||
[[package]]
|
||||
name = "joblib"
|
||||
version = "1.3.2"
|
||||
description = "Lightweight pipelining with Python functions"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"},
|
||||
{file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonfield"
|
||||
version = "3.1.0"
|
||||
|
@ -3940,6 +3973,31 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
|
|||
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
||||
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "nltk"
|
||||
version = "3.8.1"
|
||||
description = "Natural Language Toolkit"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
|
||||
{file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
click = "*"
|
||||
joblib = "*"
|
||||
regex = ">=2021.8.3"
|
||||
tqdm = "*"
|
||||
|
||||
[package.extras]
|
||||
all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
|
||||
corenlp = ["requests"]
|
||||
machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
|
||||
plot = ["matplotlib"]
|
||||
tgrep = ["pyparsing"]
|
||||
twitter = ["twython"]
|
||||
|
||||
[[package]]
|
||||
name = "nodeenv"
|
||||
version = "1.8.0"
|
||||
|
@ -4822,6 +4880,36 @@ files = [
|
|||
[package.dependencies]
|
||||
pylint = ">=1.7"
|
||||
|
||||
[[package]]
|
||||
name = "pymorphy3"
|
||||
version = "1.2.1"
|
||||
description = "Morphological analyzer (POS tagger + inflection engine) for Russian language."
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pymorphy3-1.2.1-py3-none-any.whl", hash = "sha256:88700966f55e77e3d2aedf194fa00bb4a175c2626017fe423e94ce11bc98f1ff"},
|
||||
{file = "pymorphy3-1.2.1.tar.gz", hash = "sha256:0cc186a3b0716129dd45e3b89f5e8339e5943d9013f93cfd4c58e5335daf296d"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
dawg-python = ">=0.7.1"
|
||||
docopt-ng = ">=0.6"
|
||||
pymorphy3-dicts-ru = "*"
|
||||
|
||||
[package.extras]
|
||||
fast = ["DAWG (>=0.8)"]
|
||||
|
||||
[[package]]
|
||||
name = "pymorphy3-dicts-ru"
|
||||
version = "2.4.417150.4580142"
|
||||
description = "Russian dictionaries for pymorphy2"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "pymorphy3-dicts-ru-2.4.417150.4580142.tar.gz", hash = "sha256:39ab379d4ca905bafed50f5afc3a3de6f9643605776fbcabc4d3088d4ed382b0"},
|
||||
{file = "pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl", hash = "sha256:718bac64c73c10c16073a199402657283d9b64c04188b694f6d3e9b0d85440f4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyopenssl"
|
||||
version = "23.2.0"
|
||||
|
@ -7709,4 +7797,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "a306fabd9575b5cddbfdddf2ceec22c8be1bb358b4755df2d78279de4745f0e8"
|
||||
content-hash = "efd8671d3e8f1f59dcd418468893f93f5e2e0c860005cfd78284879d92cf2f0d"
|
||||
|
|
|
@ -110,6 +110,9 @@ uuid6 = "^2023.5.2"
|
|||
fastapi = "0.104.1"
|
||||
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
||||
uvicorn = "^0.24.0.post1"
|
||||
nltk = "^3.8.1"
|
||||
pymorphy3 = "^1.2.1"
|
||||
pymorphy3-dicts-ru = "^2.4.417150.4580142"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
|
|
@ -7,12 +7,12 @@ readme = "README.md"
|
|||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
farm-haystack = "^1.21.2"
|
||||
fastapi = "0.99.1"
|
||||
pydantic = "1.10.13"
|
||||
milvus-haystack = "^0.0.2"
|
||||
|
||||
transformers = {version = "4.34.1", extras = ["torch"]}
|
||||
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
|
||||
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
|
Loading…
Reference in New Issue
Block a user