mirror of
https://github.com/Alexander-D-Karpov/akarpov
synced 2024-11-24 07:53:42 +03:00
Compare commits
2 Commits
34cb364e62
...
444e36bac7
Author | SHA1 | Date | |
---|---|---|---|
|
444e36bac7 | ||
03b0de017c |
37
akarpov/files/services/lema.py
Normal file
37
akarpov/files/services/lema.py
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import WordNetLemmatizer
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from pymorphy3 import MorphAnalyzer
|
||||||
|
|
||||||
|
# Set up stop words.
# NOTE(review): the NLTK "stopwords" corpus must already be downloaded
# (see the Dockerfile's `python -m nltk.downloader ... stopwords` step);
# otherwise these calls raise LookupError at import time.
# Both lists are materialized into sets once, at module import, so
# membership tests in lemmatize_and_remove_stopwords are O(1).
english_stopwords = set(stopwords.words("english"))
russian_stopwords = set(stopwords.words("russian"))

# Set up lemmatizers.
# english_lemmatizer: NLTK WordNet-based lemmatizer (needs the "wordnet" corpus).
# russian_lemmatizer: pymorphy3 morphological analyzer for Russian.
# Built once here and reused across calls — MorphAnalyzer construction is costly.
english_lemmatizer = WordNetLemmatizer()
russian_lemmatizer = MorphAnalyzer()
|
||||||
|
|
||||||
|
|
||||||
|
def lemmatize_and_remove_stopwords(text, language="english"):
    """Lemmatize *text* and drop English/Russian stop words.

    Tokenizes the text with NLTK, lemmatizes each token with the
    language-appropriate lemmatizer, removes tokens that are stop words
    in either language, and joins the survivors back with single spaces.

    Args:
        text: The raw input string to normalize.
        language: "russian" to use the pymorphy3 analyzer; anything else
            falls back to the English WordNet lemmatizer.

    Returns:
        A space-joined string of lemmatized, non-stop-word tokens.
    """
    # Tokenize the text (requires the NLTK "punkt" tokenizer data).
    tokens = word_tokenize(text)

    # Lemmatize each token based on the specified language.
    lemmatized_tokens = []
    for token in tokens:
        if language == "russian":
            # parse() returns candidates ordered by score; take the best.
            lemmatized_token = russian_lemmatizer.parse(token)[0].normal_form
        else:  # Default to English
            lemmatized_token = english_lemmatizer.lemmatize(token)
        lemmatized_tokens.append(lemmatized_token)

    # Remove stop words. The NLTK stop word lists are all lower-case, so
    # compare case-insensitively — otherwise capitalized tokens such as
    # "The" would slip through. The token itself keeps its original case.
    filtered_tokens = [
        token
        for token in lemmatized_tokens
        if token.lower() not in english_stopwords
        and token.lower() not in russian_stopwords
    ]

    # Reconstruct the text.
    filtered_text = " ".join(filtered_tokens)
    return filtered_text
|
|
@ -3,13 +3,14 @@
|
||||||
from typing import BinaryIO
|
from typing import BinaryIO
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.contrib.postgres.lookups import Unaccent
|
|
||||||
from django.contrib.postgres.search import TrigramSimilarity
|
from django.contrib.postgres.search import TrigramSimilarity
|
||||||
from django.db.models import Q, QuerySet
|
from django.db.models import F, Func, Q, QuerySet
|
||||||
from haystack.query import SearchQuerySet
|
from haystack.query import SearchQuerySet
|
||||||
|
|
||||||
from akarpov.files.models import File
|
from akarpov.files.models import File
|
||||||
|
|
||||||
|
from .lema import lemmatize_and_remove_stopwords
|
||||||
|
|
||||||
|
|
||||||
class BaseSearch:
|
class BaseSearch:
|
||||||
def __init__(self, queryset: QuerySet | None = None):
|
def __init__(self, queryset: QuerySet | None = None):
|
||||||
|
@ -76,6 +77,17 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class UnaccentLower(Func):
    """SQL expression producing ``LOWER(UNACCENT(expr))``.

    Used so trigram similarity comparisons are both case- and
    accent-insensitive.
    """

    function = "UNACCENT"

    def as_sql(self, compiler, connection):
        # Let Func render the UNACCENT(...) call from `function` and the
        # source expressions, then wrap that SQL in LOWER(...).
        # (Compiling only the source expression directly would silently
        # drop the UNACCENT call and emit plain LOWER(col).)
        unaccented_sql, unaccented_params = super().as_sql(compiler, connection)
        return f"LOWER({unaccented_sql})", unaccented_params
|
||||||
|
|
||||||
|
|
||||||
class SimilaritySearch(BaseSearch):
|
class SimilaritySearch(BaseSearch):
|
||||||
def __init__(self, queryset: QuerySet[File] | None = None):
|
def __init__(self, queryset: QuerySet[File] | None = None):
|
||||||
super().__init__(queryset)
|
super().__init__(queryset)
|
||||||
|
@ -84,18 +96,40 @@ def search(self, query: str) -> QuerySet[File]:
|
||||||
if self.queryset is None:
|
if self.queryset is None:
|
||||||
raise ValueError("Queryset cannot be None for similarity search")
|
raise ValueError("Queryset cannot be None for similarity search")
|
||||||
|
|
||||||
# Perform a similarity search using trigram comparison
|
# Detect language and preprocess the query
|
||||||
return (
|
language = "russian" if re.search("[а-яА-Я]", query) else "english"
|
||||||
|
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
|
||||||
|
|
||||||
|
# Annotate the queryset with similarity scores for each field
|
||||||
|
queryset = (
|
||||||
self.queryset.annotate(
|
self.queryset.annotate(
|
||||||
name_unaccent=Unaccent("name"),
|
name_similarity=TrigramSimilarity(
|
||||||
description_unaccent=Unaccent("description"),
|
UnaccentLower("name"), filtered_query
|
||||||
content_unaccent=Unaccent("content"),
|
),
|
||||||
|
description_similarity=TrigramSimilarity(
|
||||||
|
UnaccentLower("description"), filtered_query
|
||||||
|
),
|
||||||
|
content_similarity=TrigramSimilarity(
|
||||||
|
UnaccentLower("content"), filtered_query
|
||||||
|
),
|
||||||
)
|
)
|
||||||
.annotate(
|
.annotate(
|
||||||
similarity=TrigramSimilarity("name_unaccent", query)
|
combined_similarity=(
|
||||||
+ TrigramSimilarity("description_unaccent", query)
|
F("name_similarity")
|
||||||
+ TrigramSimilarity("content_unaccent", query)
|
+ F("description_similarity")
|
||||||
|
+ F("content_similarity")
|
||||||
)
|
)
|
||||||
.filter(similarity__gt=0.1)
|
/ 3
|
||||||
.order_by("-similarity")
|
|
||||||
)
|
)
|
||||||
|
.filter(combined_similarity__gt=0.1)
|
||||||
|
.order_by("-combined_similarity")
|
||||||
|
)
|
||||||
|
print(filtered_query)
|
||||||
|
print(queryset.query)
|
||||||
|
print(
|
||||||
|
queryset.values(
|
||||||
|
"name_similarity", "description_similarity", "content_similarity"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return queryset
|
||||||
|
|
|
@ -65,7 +65,6 @@ def filter(self, queryset):
|
||||||
queryset=File.objects.filter(user=self.request.user)
|
queryset=File.objects.filter(user=self.request.user)
|
||||||
)
|
)
|
||||||
queryset = search_instance.search(query)
|
queryset = search_instance.search(query)
|
||||||
print(queryset, query)
|
|
||||||
return queryset
|
return queryset
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -38,6 +38,7 @@ RUN python -m venv /venv
|
||||||
|
|
||||||
COPY pyproject.toml poetry.lock /app/
|
COPY pyproject.toml poetry.lock /app/
|
||||||
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
|
||||||
|
RUN python -m nltk.downloader punkt stopwords wordnet
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN poetry build && /venv/bin/pip install dist/*.whl
|
RUN poetry build && /venv/bin/pip install dist/*.whl
|
||||||
|
|
|
@ -107,6 +107,7 @@
|
||||||
]
|
]
|
||||||
|
|
||||||
THIRD_PARTY_APPS = [
|
THIRD_PARTY_APPS = [
|
||||||
|
"django.contrib.postgres",
|
||||||
"crispy_forms",
|
"crispy_forms",
|
||||||
"crispy_bootstrap5",
|
"crispy_bootstrap5",
|
||||||
"allauth",
|
"allauth",
|
||||||
|
|
90
poetry.lock
generated
90
poetry.lock
generated
|
@ -1512,6 +1512,17 @@ twisted = {version = ">=22.4", extras = ["tls"]}
|
||||||
[package.extras]
|
[package.extras]
|
||||||
tests = ["django", "hypothesis", "pytest", "pytest-asyncio"]
|
tests = ["django", "hypothesis", "pytest", "pytest-asyncio"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dawg-python"
|
||||||
|
version = "0.7.2"
|
||||||
|
description = "Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension."
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "DAWG-Python-0.7.2.tar.gz", hash = "sha256:4a5e3286e6261cca02f205cfd5516a7ab10190fa30c51c28d345808f595e3421"},
|
||||||
|
{file = "DAWG_Python-0.7.2-py2.py3-none-any.whl", hash = "sha256:4941d5df081b8d6fcb4597e073a9f60d5c1ccc9d17cd733e8744d7ecfec94ef3"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "decorator"
|
name = "decorator"
|
||||||
version = "5.1.1"
|
version = "5.1.1"
|
||||||
|
@ -2140,6 +2151,17 @@ compatible-mypy = ["mypy (>=0.991,<0.1000)"]
|
||||||
coreapi = ["coreapi (>=2.0.0)"]
|
coreapi = ["coreapi (>=2.0.0)"]
|
||||||
markdown = ["types-Markdown (>=0.1.5)"]
|
markdown = ["types-Markdown (>=0.1.5)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "docopt-ng"
|
||||||
|
version = "0.9.0"
|
||||||
|
description = "Jazzband-maintained fork of docopt, the humane command line arguments parser."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "docopt_ng-0.9.0-py3-none-any.whl", hash = "sha256:bfe4c8b03f9fca424c24ee0b4ffa84bf7391cb18c29ce0f6a8227a3b01b81ff9"},
|
||||||
|
{file = "docopt_ng-0.9.0.tar.gz", hash = "sha256:91c6da10b5bb6f2e9e25345829fb8278c78af019f6fc40887ad49b060483b1d7"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docutils"
|
name = "docutils"
|
||||||
version = "0.20.1"
|
version = "0.20.1"
|
||||||
|
@ -3082,6 +3104,17 @@ MarkupSafe = ">=2.0"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
i18n = ["Babel (>=2.7)"]
|
i18n = ["Babel (>=2.7)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "joblib"
|
||||||
|
version = "1.3.2"
|
||||||
|
description = "Lightweight pipelining with Python functions"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"},
|
||||||
|
{file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jsonfield"
|
name = "jsonfield"
|
||||||
version = "3.1.0"
|
version = "3.1.0"
|
||||||
|
@ -3911,6 +3944,31 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
|
||||||
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
||||||
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nltk"
|
||||||
|
version = "3.8.1"
|
||||||
|
description = "Natural Language Toolkit"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
|
||||||
|
{file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
click = "*"
|
||||||
|
joblib = "*"
|
||||||
|
regex = ">=2021.8.3"
|
||||||
|
tqdm = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
|
||||||
|
corenlp = ["requests"]
|
||||||
|
machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
|
||||||
|
plot = ["matplotlib"]
|
||||||
|
tgrep = ["pyparsing"]
|
||||||
|
twitter = ["twython"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "nodeenv"
|
name = "nodeenv"
|
||||||
version = "1.8.0"
|
version = "1.8.0"
|
||||||
|
@ -4793,6 +4851,36 @@ files = [
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
pylint = ">=1.7"
|
pylint = ">=1.7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pymorphy3"
|
||||||
|
version = "1.2.1"
|
||||||
|
description = "Morphological analyzer (POS tagger + inflection engine) for Russian language."
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "pymorphy3-1.2.1-py3-none-any.whl", hash = "sha256:88700966f55e77e3d2aedf194fa00bb4a175c2626017fe423e94ce11bc98f1ff"},
|
||||||
|
{file = "pymorphy3-1.2.1.tar.gz", hash = "sha256:0cc186a3b0716129dd45e3b89f5e8339e5943d9013f93cfd4c58e5335daf296d"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
dawg-python = ">=0.7.1"
|
||||||
|
docopt-ng = ">=0.6"
|
||||||
|
pymorphy3-dicts-ru = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
fast = ["DAWG (>=0.8)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pymorphy3-dicts-ru"
|
||||||
|
version = "2.4.417150.4580142"
|
||||||
|
description = "Russian dictionaries for pymorphy2"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "pymorphy3-dicts-ru-2.4.417150.4580142.tar.gz", hash = "sha256:39ab379d4ca905bafed50f5afc3a3de6f9643605776fbcabc4d3088d4ed382b0"},
|
||||||
|
{file = "pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl", hash = "sha256:718bac64c73c10c16073a199402657283d9b64c04188b694f6d3e9b0d85440f4"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyopenssl"
|
name = "pyopenssl"
|
||||||
version = "23.2.0"
|
version = "23.2.0"
|
||||||
|
@ -7678,4 +7766,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "96905e9613ee78eb47461e88acc8fe40b481de95425960bb71a559c368f942bb"
|
content-hash = "66f69d7c71004c0446a51845aa50a5f3e4a7fda401e8f18a9684ae5e34e6be75"
|
||||||
|
|
|
@ -110,6 +110,9 @@ uuid6 = "^2023.5.2"
|
||||||
fastapi = "0.86.0"
|
fastapi = "0.86.0"
|
||||||
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
|
||||||
uvicorn = "^0.24.0.post1"
|
uvicorn = "^0.24.0.post1"
|
||||||
|
nltk = "^3.8.1"
|
||||||
|
pymorphy3 = "^1.2.1"
|
||||||
|
pymorphy3-dicts-ru = "^2.4.417150.4580142"
|
||||||
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
|
|
|
@ -7,12 +7,12 @@ readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
farm-haystack = "^1.21.2"
|
|
||||||
fastapi = "0.99.1"
|
fastapi = "0.99.1"
|
||||||
pydantic = "1.10.13"
|
pydantic = "1.10.13"
|
||||||
milvus-haystack = "^0.0.2"
|
transformers = {version = "4.34.1", extras = ["torch"]}
|
||||||
|
torch = ">=2.0.0, !=2.0.1, !=2.1.0"
|
||||||
|
farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core>=1.0.0"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user