Mirror of https://github.com/Alexander-D-Karpov/akarpov (synced 2024-11-11 00:06:34 +03:00)
updated neuro and similarity search, moved to elastic 8
commit 5b457b3668, parent 356476217d
akarpov/files/documents.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+from django_elasticsearch_dsl import Document
+from django_elasticsearch_dsl.registries import registry
+
+from akarpov.files.models import File
+
+
+@registry.register_document
+class FileDocument(Document):
+    class Index:
+        name = "files"
+        settings = {"number_of_shards": 1, "number_of_replicas": 0}
+
+    class Django:
+        model = File
+        fields = [
+            "name",
+            "description",
+            "content",
+        ]
+
+    def prepare_description(self, instance):
+        # This method is called for every instance before indexing
+        return instance.description or ""
+
+    def prepare_content(self, instance):
+        # This method is called for every instance before indexing
+        return (
+            instance.content.decode("utf-8")
+            if isinstance(instance.content, bytes)
+            else instance.content
+        )
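With the document registered, django-elasticsearch-dsl exposes the "files" index through the Document API. A minimal usage sketch (assumes Django settings are loaded and the index has been built, e.g. with manage.py search_index --rebuild -f; the "report" query string is an invented example):

    from akarpov.files.documents import FileDocument

    # Search the "files" index directly; iterating executes the query
    # and yields elasticsearch-dsl Hit objects.
    results = FileDocument.search().query("match", name="report")
    for hit in results:
        print(hit.meta.id, hit.name)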
Deleted file (17 lines): the old Haystack search index.
@@ -1,17 +0,0 @@
-from haystack import indexes
-
-from .models import File
-
-
-class FileIndex(indexes.SearchIndex, indexes.Indexable):
-    text = indexes.CharField(document=True, use_template=True)
-    name = indexes.CharField(model_attr="name", default="")
-    description = indexes.CharField(model_attr="description", default="")
-    content = indexes.CharField(model_attr="content", default="")
-
-    def get_model(self):
-        return File
-
-    def index_queryset(self, using=None):
-        # Return the default queryset to be used for indexing.
-        return self.get_model().objects.all()
Search service module (BaseSearch / NeuroSearch / SimilaritySearch):
@@ -4,11 +4,13 @@

 from django.conf import settings
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import F, Func, Q, QuerySet
-from haystack.query import SearchQuerySet
+from django.db.models import Case, F, FloatField, Func, Q, QuerySet, Value, When
+from django.db.models.functions import Coalesce
+from elasticsearch_dsl import Q as ES_Q

 from akarpov.files.models import File
+from ..documents import FileDocument
 from .lema import lemmatize_and_remove_stopwords
@@ -16,17 +18,55 @@ class BaseSearch:
     def __init__(self, queryset: QuerySet | None = None):
         self.queryset: QuerySet | None = queryset

-    def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
+    def search(self, query: str) -> QuerySet | list[File]:
         raise NotImplementedError("Subclasses must implement this method")


 class NeuroSearch(BaseSearch):
-    def search(self, query: str) -> SearchQuerySet:
-        # Search across multiple fields
-        sqs: SearchQuerySet = SearchQuerySet().filter(content=query)
-        sqs = sqs.filter_or(name=query)
-        sqs = sqs.filter_or(description=query)
-        return sqs
+    def search(self, query: str):
+        if not self.queryset:
+            raise ValueError("Queryset cannot be None for search")
+
+        # Perform the Elasticsearch query using a combination of match, match_phrase_prefix, and wildcard queries
+        search = FileDocument.search()
+        search_query = ES_Q(
+            "bool",
+            should=[
+                ES_Q(
+                    "multi_match",
+                    query=query,
+                    fields=["name", "description", "content"],
+                    type="best_fields",
+                ),
+                ES_Q("match_phrase_prefix", name=query),
+                ES_Q("wildcard", name=f"*{query}*"),
+                ES_Q("wildcard", description=f"*{query}*"),
+                ES_Q("wildcard", content=f"*{query}*"),
+            ],
+            minimum_should_match=1,
+        )
+
+        search = search.query(search_query)
+
+        # Execute the search to get the results
+        response = search.execute()
+
+        # Check if there are hits, if not return an empty queryset
+        if not response.hits:
+            return self.queryset.none()
+
+        # Collect the IDs of the hits
+        hit_ids = [hit.meta.id for hit in response.hits]
+
+        # Use the hit IDs to filter the queryset and preserve the order
+        preserved_order = Case(
+            *[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)]
+        )
+        relevant_queryset = self.queryset.filter(pk__in=hit_ids).order_by(
+            preserved_order
+        )
+
+        return relevant_queryset


 class CaseSensitiveSearch(BaseSearch):
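The Case/When expression above is what preserves Elasticsearch's relevance ranking once the hits are re-fetched from Postgres: each primary key is mapped to its position in the hit list, and the database orders by that computed value. The same trick in isolation (the id values are invented for illustration):

    from django.db.models import Case, When

    from akarpov.files.models import File

    hit_ids = [17, 3, 42]  # pretend ES returned these pks, best match first
    preserved_order = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)])
    # SQL: ORDER BY CASE WHEN id=17 THEN 0 WHEN id=3 THEN 1 WHEN id=42 THEN 2 END
    files = File.objects.filter(pk__in=hit_ids).order_by(preserved_order)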
@@ -89,28 +129,28 @@ def as_sql(self, compiler, connection):


 class SimilaritySearch(BaseSearch):
-    def __init__(self, queryset: QuerySet[File] | None = None):
-        super().__init__(queryset)
-
     def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")

-        # Detect language and preprocess the query
         language = "russian" if re.search("[а-яА-Я]", query) else "english"
         filtered_query = lemmatize_and_remove_stopwords(query, language=language)

-        # Annotate the queryset with similarity scores for each field
         queryset = (
             self.queryset.annotate(
-                name_similarity=TrigramSimilarity(
-                    UnaccentLower("name"), filtered_query
+                name_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                description_similarity=TrigramSimilarity(
-                    UnaccentLower("description"), filtered_query
+                description_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                content_similarity=TrigramSimilarity(
-                    UnaccentLower("content"), filtered_query
+                content_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
             )
             .annotate(
@@ -119,17 +159,9 @@ def search(self, query: str) -> QuerySet[File]:
                 + F("description_similarity")
                 + F("content_similarity")
                 )
-                / 3
             )
             .filter(combined_similarity__gt=0.1)
             .order_by("-combined_similarity")
         )
-        print(filtered_query)
-        print(queryset.query)
-        print(
-            queryset.values(
-                "name_similarity", "description_similarity", "content_similarity"
-            )
-        )

         return queryset
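The switch to Coalesce is the substantive fix here: in SQL, similarity against a NULL column is NULL, and NULL propagates through the summed combined_similarity, silently excluding those rows from filtering and ordering. Coalescing each term to 0 keeps the score well-defined. A minimal sketch of the pattern on its own (plain column names instead of UnaccentLower, and "report" as a stand-in query):

    from django.contrib.postgres.search import TrigramSimilarity
    from django.db.models import FloatField, Value
    from django.db.models.functions import Coalesce

    from akarpov.files.models import File

    # Rows whose description is NULL now score 0 instead of NULL.
    qs = File.objects.annotate(
        description_similarity=Coalesce(
            TrigramSimilarity("description", "report"),
            Value(0),
            output_field=FloatField(),
        )
    )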
Celery tasks:
@@ -3,9 +3,8 @@

 import structlog
 from celery import shared_task
+from django.core import management
 from django.core.files import File
-from haystack.management.commands import rebuild_index, update_index
-from haystack.query import SearchQuerySet

 from akarpov.files.models import File as FileModel
 from akarpov.files.services.preview import create_preview, get_file_mimetype
@@ -39,25 +38,10 @@ def process_file(pk: int):
     return pk


-@shared_task
-def task_rebuild_index():
-    start_time = time.time()
-    rebuild_index.Command().handle(interactive=False)
-    end_time = time.time()
-    duration = end_time - start_time
-
-    indexed_count = SearchQuerySet().all().count()
-
-    logger.info(
-        "index_rebuild_finished", duration=duration, indexed_count=indexed_count
-    )
-
-
 @shared_task
 def update_index_task():
     start_time = time.time()
-    update_index.Command().handle(interactive=False)
+    management.call_command("search_index", "--rebuild", "-f")
     end_time = time.time()
     duration = end_time - start_time
     logger.info("update_index_completed", duration=duration)
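The rewritten task delegates to django-elasticsearch-dsl's search_index management command: --rebuild drops and recreates the index, and -f skips the interactive confirmation, which is why the separate task_rebuild_index task became redundant. Triggering it ad hoc (a sketch; assumes a running Celery worker):

    from akarpov.files.tasks import update_index_task

    # Same effect as: python manage.py search_index --rebuild -f
    update_index_task.delay()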
Views:
@@ -70,7 +70,7 @@ def filter(self, queryset):


 class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
     template_name = "files/list.html"
-    paginate_by = 18
+    paginate_by = 38
     model = BaseFileItem

     def get_context_data(self, **kwargs):
Settings:
@@ -66,7 +66,7 @@
     "blog.post": {"ops": ("fetch", "get"), "timeout": 20 * 15},
     "themes.theme": {"ops": ("fetch", "get"), "timeout": 60 * 60},
     "gallery.*": {"ops": "all", "timeout": 60 * 15},
-    "files.*": {"ops": "all", "timeout": 60 * 5},
+    "files.*": {"ops": ("fetch", "get"), "timeout": 60 * 5},
     "auth.permission": {"ops": "all", "timeout": 60 * 15},
 }
 CACHEOPS_REDIS = env.str("REDIS_URL")
@@ -133,7 +133,7 @@
     "django_filters",
     "django_tables2",
     "location_field",
-    "haystack",
+    "django_elasticsearch_dsl",
 ]

 HEALTH_CHECKS = [
@@ -434,13 +434,9 @@
 CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
 CELERY_BEAT_SCHEDULE = {
     "update-index-every-hour": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
+        "task": "akarpov.files.tasks.update_index_task",
         "schedule": crontab(minute="0"),
     },
-    "rebuild-index-every-day": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
-        "schedule": crontab(hour="2", minute="0", day_of_week="*"),
-    },
 }
@@ -611,14 +607,8 @@
 )

-# HAYSTACK
+# ELASTICSEARCH
 # ------------------------------------------------------------------------------
-HAYSTACK_CONNECTIONS = {
-    "default": {
-        "ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
-        "URL": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
-        "INDEX_NAME": "haystack",
-        "TIMEOUT": 60 * 5,
-        "BATCH_SIZE": 100,
-    },
-}
+ELASTICSEARCH_DSL = {
+    "default": {"hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/")},
 }
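django-elasticsearch-dsl reads ELASTICSEARCH_DSL["default"]["hosts"] at startup and registers a default elasticsearch-dsl connection under that alias, so the wiring can be verified from a Django shell (a sketch; assumes the node from ELASTIC_SEARCH is reachable):

    from elasticsearch_dsl import connections

    es = connections.get_connection("default")
    print(es.ping())  # True if the configured cluster answers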
docker-compose:
@@ -100,12 +100,12 @@ services:
     command: /start-flower

   elasticsearch:
-    image: elasticsearch:7.17.14
+    image: elasticsearch:8.11.1
     container_name: elasticsearch
     environment:
       - cluster.name=docker-cluster
       - bootstrap.memory_lock=true
-      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+      - "ES_JAVA_OPTS=-Xms6g -Xmx512m"
     ulimits:
       memlock:
         soft: -1
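One caveat with the 7.x to 8.x image jump: Elasticsearch 8 enables security (TLS and authentication) by default, so the plain http:// URL in the settings above only works if security is switched off in the container environment (e.g. xpack.security.enabled=false) or credentials are configured. A quick smoke test of the upgraded node (assumes unauthenticated HTTP on 127.0.0.1:9200):

    import json
    import urllib.request

    # The root endpoint returns cluster metadata as JSON.
    with urllib.request.urlopen("http://127.0.0.1:9200/") as resp:
        info = json.load(resp)
    print(info["version"]["number"])  # expect "8.11.1"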
poetry.lock (generated, 3483 lines changed): diff suppressed because it is too large.
pyproject.toml:
@@ -107,13 +107,15 @@ pgvector = "^0.2.2"
 pycld2 = "^0.41"
 textract = "^1.6.5"
 uuid6 = "^2023.5.2"
-django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
 nltk = "^3.8.1"
 pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "^0.104.1"
 pydantic-settings = "^2.0.3"
+django-elasticsearch-dsl = "^8.0"
+elasticsearch-dsl = "^8.11.0"
+numpy = "1.25.2"


 [build-system]