Compare commits


2 Commits

Author SHA1 Message Date
dependabot[bot]
c7f769ee72
Merge 3eafeefffe into b9715981e7 2023-11-07 10:45:46 +00:00
dependabot[bot]
3eafeefffe
Bump fastapi from 0.101.1 to 0.104.1
Bumps [fastapi](https://github.com/tiangolo/fastapi) from 0.101.1 to 0.104.1.
- [Release notes](https://github.com/tiangolo/fastapi/releases)
- [Commits](https://github.com/tiangolo/fastapi/compare/0.101.1...0.104.1)

---
updated-dependencies:
- dependency-name: fastapi
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-11-07 10:45:43 +00:00
8 changed files with 18 additions and 181 deletions

View File

@@ -1,37 +0,0 @@
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
-from pymorphy3 import MorphAnalyzer
-
-# Set up stop words
-english_stopwords = set(stopwords.words("english"))
-russian_stopwords = set(stopwords.words("russian"))
-
-# Set up lemmatizers
-english_lemmatizer = WordNetLemmatizer()
-russian_lemmatizer = MorphAnalyzer()
-
-
-def lemmatize_and_remove_stopwords(text, language="english"):
-    # Tokenize the text
-    tokens = word_tokenize(text)
-
-    # Lemmatize each token based on the specified language
-    lemmatized_tokens = []
-    for token in tokens:
-        if language == "russian":
-            lemmatized_token = russian_lemmatizer.parse(token)[0].normal_form
-        else:  # Default to English
-            lemmatized_token = english_lemmatizer.lemmatize(token)
-        lemmatized_tokens.append(lemmatized_token)
-
-    # Remove stop words
-    filtered_tokens = [
-        token
-        for token in lemmatized_tokens
-        if token not in english_stopwords and token not in russian_stopwords
-    ]
-
-    # Reconstruct the text
-    filtered_text = " ".join(filtered_tokens)
-    return filtered_text
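For reference, the deleted helper can be exercised like this. This is a sketch: the import path and the exact outputs are illustrative, and it assumes the NLTK punkt/stopwords/wordnet data has already been downloaded (as the Dockerfile below used to do).

# Illustrative usage of the removed helper (hypothetical import path).
# Assumes: python -m nltk.downloader punkt stopwords wordnet
from lema import lemmatize_and_remove_stopwords

print(lemmatize_and_remove_stopwords("the cats are running"))
# roughly: "cat running" — stop words dropped, noun forms lemmatized
print(lemmatize_and_remove_stopwords("кошки бегали по дому", language="russian"))
# roughly: "кошка бегать дом" — pymorphy3 normal forms, stop words dropped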

View File

@@ -3,14 +3,13 @@
 from typing import BinaryIO
 
 from django.conf import settings
+from django.contrib.postgres.lookups import Unaccent
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import F, Func, Q, QuerySet
+from django.db.models import Q, QuerySet
 from haystack.query import SearchQuerySet
 
 from akarpov.files.models import File
-from .lema import lemmatize_and_remove_stopwords
 
 
 class BaseSearch:
     def __init__(self, queryset: QuerySet | None = None):
@@ -77,17 +76,6 @@ def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
     return False
 
 
-class UnaccentLower(Func):
-    function = "UNACCENT"
-
-    def as_sql(self, compiler, connection):
-        unaccented_sql, unaccented_params = compiler.compile(
-            self.get_source_expressions()[0]
-        )
-        lower_unaccented_sql = f"LOWER({unaccented_sql})"
-        return lower_unaccented_sql, unaccented_params
-
-
 class SimilaritySearch(BaseSearch):
     def __init__(self, queryset: QuerySet[File] | None = None):
         super().__init__(queryset)
@@ -96,40 +84,18 @@ def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")
 
-        # Detect language and preprocess the query
-        language = "russian" if re.search("[а-яА-Я]", query) else "english"
-        filtered_query = lemmatize_and_remove_stopwords(query, language=language)
-
-        # Annotate the queryset with similarity scores for each field
-        queryset = (
+        # Perform a similarity search using trigram comparison
+        return (
             self.queryset.annotate(
-                name_similarity=TrigramSimilarity(
-                    UnaccentLower("name"), filtered_query
-                ),
-                description_similarity=TrigramSimilarity(
-                    UnaccentLower("description"), filtered_query
-                ),
-                content_similarity=TrigramSimilarity(
-                    UnaccentLower("content"), filtered_query
-                ),
+                name_unaccent=Unaccent("name"),
+                description_unaccent=Unaccent("description"),
+                content_unaccent=Unaccent("content"),
             )
             .annotate(
-                combined_similarity=(
-                    F("name_similarity")
-                    + F("description_similarity")
-                    + F("content_similarity")
-                )
-                / 3
+                similarity=TrigramSimilarity("name_unaccent", query)
+                + TrigramSimilarity("description_unaccent", query)
+                + TrigramSimilarity("content_unaccent", query)
             )
-            .filter(combined_similarity__gt=0.1)
-            .order_by("-combined_similarity")
+            .filter(similarity__gt=0.1)
+            .order_by("-similarity")
         )
-        print(filtered_query)
-        print(queryset.query)
-        print(
-            queryset.values(
-                "name_similarity", "description_similarity", "content_similarity"
-            )
-        )
-        return queryset
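Both TrigramSimilarity and the Unaccent lookup used here are backed by the pg_trgm and unaccent PostgreSQL extensions, which must be enabled in the database. A minimal sketch of one common way to do that with a Django migration (the dependency list below is a placeholder, not this repo's actual migration graph):

# Sketch: enable the PostgreSQL extensions the trigram search relies on.
# Requires "django.contrib.postgres" in INSTALLED_APPS; dependencies is a placeholder.
from django.contrib.postgres.operations import TrigramExtension, UnaccentExtension
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = []  # point this at the app's previous migration

    operations = [
        TrigramExtension(),   # CREATE EXTENSION IF NOT EXISTS pg_trgm;
        UnaccentExtension(),  # CREATE EXTENSION IF NOT EXISTS unaccent;
    ]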

View File

@@ -65,6 +65,7 @@ def filter(self, queryset):
             queryset=File.objects.filter(user=self.request.user)
         )
         queryset = search_instance.search(query)
+        print(queryset, query)
         return queryset

View File

@@ -38,7 +38,6 @@ RUN python -m venv /venv
 COPY pyproject.toml poetry.lock /app/
 RUN poetry export --without-hashes -f requirements.txt | /venv/bin/pip install -r /dev/stdin
-RUN python -m nltk.downloader punkt stopwords wordnet
 COPY . .
 RUN poetry build && /venv/bin/pip install dist/*.whl
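The dropped RUN line fetched NLTK corpora at image build time. The same data can also be fetched programmatically, for example at application startup (a sketch, not part of this diff):

# Sketch: programmatic equivalent of the removed
# "RUN python -m nltk.downloader punkt stopwords wordnet" build step.
import nltk

for package in ("punkt", "stopwords", "wordnet"):
    nltk.download(package, quiet=True)  # skips packages that are already current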

View File

@@ -107,7 +107,6 @@
 ]
 
 THIRD_PARTY_APPS = [
-    "django.contrib.postgres",
     "crispy_forms",
     "crispy_bootstrap5",
     "allauth",

poetry.lock (generated)
View File

@@ -1523,17 +1523,6 @@ twisted = {version = ">=22.4", extras = ["tls"]}
 [package.extras]
 tests = ["django", "hypothesis", "pytest", "pytest-asyncio"]
 
-[[package]]
-name = "dawg-python"
-version = "0.7.2"
-description = "Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension."
-optional = false
-python-versions = "*"
-files = [
-    {file = "DAWG-Python-0.7.2.tar.gz", hash = "sha256:4a5e3286e6261cca02f205cfd5516a7ab10190fa30c51c28d345808f595e3421"},
-    {file = "DAWG_Python-0.7.2-py2.py3-none-any.whl", hash = "sha256:4941d5df081b8d6fcb4597e073a9f60d5c1ccc9d17cd733e8744d7ecfec94ef3"},
-]
-
 [[package]]
 name = "decorator"
 version = "5.1.1"
@@ -2162,17 +2151,6 @@ compatible-mypy = ["mypy (>=0.991,<0.1000)"]
 coreapi = ["coreapi (>=2.0.0)"]
 markdown = ["types-Markdown (>=0.1.5)"]
 
-[[package]]
-name = "docopt-ng"
-version = "0.9.0"
-description = "Jazzband-maintained fork of docopt, the humane command line arguments parser."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "docopt_ng-0.9.0-py3-none-any.whl", hash = "sha256:bfe4c8b03f9fca424c24ee0b4ffa84bf7391cb18c29ce0f6a8227a3b01b81ff9"},
-    {file = "docopt_ng-0.9.0.tar.gz", hash = "sha256:91c6da10b5bb6f2e9e25345829fb8278c78af019f6fc40887ad49b060483b1d7"},
-]
-
 [[package]]
 name = "docutils"
 version = "0.20.1"
@@ -3114,17 +3092,6 @@ MarkupSafe = ">=2.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]
 
-[[package]]
-name = "joblib"
-version = "1.3.2"
-description = "Lightweight pipelining with Python functions"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"},
-    {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"},
-]
-
 [[package]]
 name = "jsonfield"
 version = "3.1.0"
@@ -3973,31 +3940,6 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
 extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
 test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
 
-[[package]]
-name = "nltk"
-version = "3.8.1"
-description = "Natural Language Toolkit"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
-    {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
-]
-
-[package.dependencies]
-click = "*"
-joblib = "*"
-regex = ">=2021.8.3"
-tqdm = "*"
-
-[package.extras]
-all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
-corenlp = ["requests"]
-machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
-plot = ["matplotlib"]
-tgrep = ["pyparsing"]
-twitter = ["twython"]
-
 [[package]]
 name = "nodeenv"
 version = "1.8.0"
@@ -4880,36 +4822,6 @@ files = [
 [package.dependencies]
 pylint = ">=1.7"
 
-[[package]]
-name = "pymorphy3"
-version = "1.2.1"
-description = "Morphological analyzer (POS tagger + inflection engine) for Russian language."
-optional = false
-python-versions = "*"
-files = [
-    {file = "pymorphy3-1.2.1-py3-none-any.whl", hash = "sha256:88700966f55e77e3d2aedf194fa00bb4a175c2626017fe423e94ce11bc98f1ff"},
-    {file = "pymorphy3-1.2.1.tar.gz", hash = "sha256:0cc186a3b0716129dd45e3b89f5e8339e5943d9013f93cfd4c58e5335daf296d"},
-]
-
-[package.dependencies]
-dawg-python = ">=0.7.1"
-docopt-ng = ">=0.6"
-pymorphy3-dicts-ru = "*"
-
-[package.extras]
-fast = ["DAWG (>=0.8)"]
-
-[[package]]
-name = "pymorphy3-dicts-ru"
-version = "2.4.417150.4580142"
-description = "Russian dictionaries for pymorphy2"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pymorphy3-dicts-ru-2.4.417150.4580142.tar.gz", hash = "sha256:39ab379d4ca905bafed50f5afc3a3de6f9643605776fbcabc4d3088d4ed382b0"},
-    {file = "pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl", hash = "sha256:718bac64c73c10c16073a199402657283d9b64c04188b694f6d3e9b0d85440f4"},
-]
-
 [[package]]
 name = "pyopenssl"
 version = "23.2.0"
@@ -7797,4 +7709,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "efd8671d3e8f1f59dcd418468893f93f5e2e0c860005cfd78284879d92cf2f0d"
+content-hash = "a306fabd9575b5cddbfdddf2ceec22c8be1bb358b4755df2d78279de4745f0e8"

View File

@@ -110,9 +110,6 @@ uuid6 = "^2023.5.2"
 fastapi = "0.104.1"
 django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
-nltk = "^3.8.1"
-pymorphy3 = "^1.2.1"
-pymorphy3-dicts-ru = "^2.4.417150.4580142"
 
 [build-system]

View File

@@ -7,12 +7,12 @@ readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.11"
-farm-haystack = "^1.21.2"
-fastapi = "0.99.1"
-pydantic = "1.10.13"
-transformers = {version = "4.34.1", extras = ["torch"]}
-torch = ">=2.0.0, !=2.0.1, !=2.1.0"
+farm-haystack = {extras = ["faiss"], version = "^1.21.2"}
+milvus-haystack = "^0.0.2"
 
 [build-system]
-requires = ["poetry-core>=1.0.0"]
+requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"