Mirror of https://github.com/Alexander-D-Karpov/akarpov (synced 2024-11-22 10:56:39 +03:00)
updated neuro and similarity search, moved to Elasticsearch 8
parent 356476217d
commit 5b457b3668

akarpov/files/documents.py (new file, 31 lines)

@@ -0,0 +1,31 @@
+from django_elasticsearch_dsl import Document
+from django_elasticsearch_dsl.registries import registry
+
+from akarpov.files.models import File
+
+
+@registry.register_document
+class FileDocument(Document):
+    class Index:
+        name = "files"
+        settings = {"number_of_shards": 1, "number_of_replicas": 0}
+
+    class Django:
+        model = File
+        fields = [
+            "name",
+            "description",
+            "content",
+        ]
+
+    def prepare_description(self, instance):
+        # This method is called for every instance before indexing
+        return instance.description or ""
+
+    def prepare_content(self, instance):
+        # This method is called for every instance before indexing
+        return (
+            instance.content.decode("utf-8")
+            if isinstance(instance.content, bytes)
+            else instance.content
+        )
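
A minimal usage sketch (not part of the commit), assuming the index has been
built, e.g. with the search_index command used in tasks.py below:

    from akarpov.files.documents import FileDocument

    # Full-text match against the indexed "content" field; hits come back
    # ordered by Elasticsearch relevance score.
    results = FileDocument.search().query("match", content="report").execute()
    for hit in results:
        print(hit.meta.id, hit.name, hit.meta.score)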

Deleted file: the old Haystack search index (17 lines)

@@ -1,17 +0,0 @@
-from haystack import indexes
-
-from .models import File
-
-
-class FileIndex(indexes.SearchIndex, indexes.Indexable):
-    text = indexes.CharField(document=True, use_template=True)
-    name = indexes.CharField(model_attr="name", default="")
-    description = indexes.CharField(model_attr="description", default="")
-    content = indexes.CharField(model_attr="content", default="")
-
-    def get_model(self):
-        return File
-
-    def index_queryset(self, using=None):
-        # Return the default queryset to be used for indexing.
-        return self.get_model().objects.all()
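
For orientation, a rough mapping from the removed Haystack API to its
replacement (a sketch; q is an illustrative query string, and multi_match
only approximates the old filter/filter_or chain):

    # Haystack (removed):
    #     SearchQuerySet().filter(content=q).filter_or(name=q)
    # elasticsearch-dsl (replacement):
    from akarpov.files.documents import FileDocument

    q = "report"
    results = FileDocument.search().query(
        "multi_match", query=q, fields=["name", "description", "content"]
    )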

@@ -4,11 +4,13 @@
 
 from django.conf import settings
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import F, Func, Q, QuerySet
-from haystack.query import SearchQuerySet
+from django.db.models import Case, F, FloatField, Func, Q, QuerySet, Value, When
+from django.db.models.functions import Coalesce
+from elasticsearch_dsl import Q as ES_Q
 
 from akarpov.files.models import File
 
+from ..documents import FileDocument
 from .lema import lemmatize_and_remove_stopwords

@@ -16,17 +18,55 @@ class BaseSearch:
     def __init__(self, queryset: QuerySet | None = None):
         self.queryset: QuerySet | None = queryset
 
-    def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
+    def search(self, query: str) -> QuerySet | list[File]:
         raise NotImplementedError("Subclasses must implement this method")
 
 
 class NeuroSearch(BaseSearch):
-    def search(self, query: str) -> SearchQuerySet:
-        # Search across multiple fields
-        sqs: SearchQuerySet = SearchQuerySet().filter(content=query)
-        sqs = sqs.filter_or(name=query)
-        sqs = sqs.filter_or(description=query)
-        return sqs
+    def search(self, query: str):
+        if not self.queryset:
+            raise ValueError("Queryset cannot be None for search")
+
+        # Perform the Elasticsearch query using a combination of match,
+        # match_phrase_prefix, and wildcard queries
+        search = FileDocument.search()
+        search_query = ES_Q(
+            "bool",
+            should=[
+                ES_Q(
+                    "multi_match",
+                    query=query,
+                    fields=["name", "description", "content"],
+                    type="best_fields",
+                ),
+                ES_Q("match_phrase_prefix", name=query),
+                ES_Q("wildcard", name=f"*{query}*"),
+                ES_Q("wildcard", description=f"*{query}*"),
+                ES_Q("wildcard", content=f"*{query}*"),
+            ],
+            minimum_should_match=1,
+        )
+
+        search = search.query(search_query)
+
+        # Execute the search to get the results
+        response = search.execute()
+
+        # If there are no hits, return an empty queryset
+        if not response.hits:
+            return self.queryset.none()
+
+        # Collect the IDs of the hits
+        hit_ids = [hit.meta.id for hit in response.hits]
+
+        # Filter the queryset by the hit IDs, preserving Elasticsearch's order
+        preserved_order = Case(
+            *[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)]
+        )
+        relevant_queryset = self.queryset.filter(pk__in=hit_ids).order_by(
+            preserved_order
+        )
+
+        return relevant_queryset
 
 
 class CaseSensitiveSearch(BaseSearch):

@@ -89,28 +129,28 @@ def as_sql(self, compiler, connection):
 
 
 class SimilaritySearch(BaseSearch):
     def __init__(self, queryset: QuerySet[File] | None = None):
         super().__init__(queryset)
 
     def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")
 
         # Detect language and preprocess the query
         language = "russian" if re.search("[а-яА-Я]", query) else "english"
         filtered_query = lemmatize_and_remove_stopwords(query, language=language)
 
         # Annotate the queryset with similarity scores for each field
         queryset = (
             self.queryset.annotate(
-                name_similarity=TrigramSimilarity(
-                    UnaccentLower("name"), filtered_query
+                name_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                description_similarity=TrigramSimilarity(
-                    UnaccentLower("description"), filtered_query
+                description_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                content_similarity=TrigramSimilarity(
-                    UnaccentLower("content"), filtered_query
+                content_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
             )
             .annotate(

@@ -119,17 +159,9 @@ def search(self, query: str) -> QuerySet[File]:
                     + F("description_similarity")
                     + F("content_similarity")
                 )
                 / 3
             )
             .filter(combined_similarity__gt=0.1)
             .order_by("-combined_similarity")
         )
-        print(filtered_query)
-        print(queryset.query)
-        print(
-            queryset.values(
-                "name_similarity", "description_similarity", "content_similarity"
-            )
-        )
-
         return queryset
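
Why the Coalesce wrappers matter (a minimal sketch, assuming description can
be NULL): TrigramSimilarity over a NULL column yields NULL, and NULL
propagates through the (a + b + c) / 3 average, silently dropping rows that
match well on the other fields.

    from django.contrib.postgres.search import TrigramSimilarity
    from django.db.models import FloatField, Value
    from django.db.models.functions import Coalesce

    # NULL-safe per-field score: rows without a description score 0.0
    # instead of returning a NULL that would poison the combined average.
    score = Coalesce(
        TrigramSimilarity("description", "report"),
        Value(0),
        output_field=FloatField(),
    )
    # File.objects.annotate(description_similarity=score)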

@@ -3,9 +3,8 @@
 
 import structlog
 from celery import shared_task
+from django.core import management
 from django.core.files import File
-from haystack.management.commands import rebuild_index, update_index
-from haystack.query import SearchQuerySet
 
 from akarpov.files.models import File as FileModel
 from akarpov.files.services.preview import create_preview, get_file_mimetype

@@ -39,25 +38,10 @@ def process_file(pk: int):
     return pk
 
 
-@shared_task
-def task_rebuild_index():
-    start_time = time.time()
-    rebuild_index.Command().handle(interactive=False)
-    end_time = time.time()
-    duration = end_time - start_time
-
-    indexed_count = SearchQuerySet().all().count()
-
-    logger.info(
-        "index_rebuild_finished", duration=duration, indexed_count=indexed_count
-    )
-
-
 @shared_task
 def update_index_task():
     start_time = time.time()
-    update_index.Command().handle(interactive=False)
+
+    management.call_command("search_index", "--rebuild", "-f")
     end_time = time.time()
     duration = end_time - start_time
     logger.info("update_index_completed", duration=duration)
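
search_index is the management command shipped with django-elasticsearch-dsl:
--rebuild drops and recreates every registered index, and -f skips the
confirmation prompt. The same rebuild can be run ad hoc (a sketch):

    from django.core import management

    # Equivalent to: python manage.py search_index --rebuild -f
    management.call_command("search_index", "--rebuild", "-f")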

@@ -70,7 +70,7 @@ def filter(self, queryset):
 
 class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
     template_name = "files/list.html"
-    paginate_by = 18
+    paginate_by = 38
     model = BaseFileItem
 
     def get_context_data(self, **kwargs):

@@ -66,7 +66,7 @@
     "blog.post": {"ops": ("fetch", "get"), "timeout": 20 * 15},
     "themes.theme": {"ops": ("fetch", "get"), "timeout": 60 * 60},
     "gallery.*": {"ops": "all", "timeout": 60 * 15},
-    "files.*": {"ops": "all", "timeout": 60 * 5},
+    "files.*": {"ops": ("fetch", "get"), "timeout": 60 * 5},
     "auth.permission": {"ops": "all", "timeout": 60 * 15},
 }
 CACHEOPS_REDIS = env.str("REDIS_URL")

@@ -133,7 +133,7 @@
     "django_filters",
     "django_tables2",
     "location_field",
-    "haystack",
+    "django_elasticsearch_dsl",
 ]
 
 HEALTH_CHECKS = [

@@ -434,13 +434,9 @@
 CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
 CELERY_BEAT_SCHEDULE = {
     "update-index-every-hour": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
+        "task": "akarpov.files.tasks.update_index_task",
         "schedule": crontab(minute="0"),
     },
-    "rebuild-index-every-day": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
-        "schedule": crontab(hour="2", minute="0", day_of_week="*"),
-    },
 }

@@ -611,14 +607,8 @@
 )
 
 
-# HAYSTACK
+# ELASTICSEARCH
 # ------------------------------------------------------------------------------
-HAYSTACK_CONNECTIONS = {
-    "default": {
-        "ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
-        "URL": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
-        "INDEX_NAME": "haystack",
-        "TIMEOUT": 60 * 5,
-        "BATCH_SIZE": 100,
-    },
-}
+ELASTICSEARCH_DSL = {
+    "default": {"hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/")},
+}
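
A hedged sketch (not in the commit): if the Elasticsearch 8 cluster runs with
security enabled, the same setting can carry the elasticsearch-py connection
options alongside hosts. basic_auth, ELASTIC_PASSWORD, and verify_certs below
are assumptions about the deployment, not project settings:

    ELASTICSEARCH_DSL = {
        "default": {
            "hosts": env("ELASTIC_SEARCH", default="https://127.0.0.1:9200/"),
            "basic_auth": ("elastic", env("ELASTIC_PASSWORD", default="")),
            "verify_certs": False,  # dev only; supply CA certs in production
        },
    }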

@@ -100,12 +100,12 @@ services:
     command: /start-flower
 
   elasticsearch:
-    image: elasticsearch:7.17.14
+    image: elasticsearch:8.11.1
     container_name: elasticsearch
     environment:
       - cluster.name=docker-cluster
       - bootstrap.memory_lock=true
-      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+      - "ES_JAVA_OPTS=-Xms6g -Xmx6g"
     ulimits:
       memlock:
         soft: -1
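
The official 8.x images ship with X-Pack security enabled by default, so a
plain-HTTP local setup like this one usually also needs
xpack.security.enabled=false in the environment block. A quick connectivity
check (a sketch, assuming the cluster is reachable without auth):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://127.0.0.1:9200")
    print(es.info()["version"]["number"])  # expect "8.11.1" with this image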

poetry.lock (generated; 3483-line diff suppressed because it is too large)

@@ -107,13 +107,15 @@ pgvector = "^0.2.2"
 pycld2 = "^0.41"
 textract = "^1.6.5"
 uuid6 = "^2023.5.2"
-django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
 nltk = "^3.8.1"
 pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "^0.104.1"
 pydantic-settings = "^2.0.3"
+django-elasticsearch-dsl = "^8.0"
+elasticsearch-dsl = "^8.11.0"
+numpy = "1.25.2"
 
 
 [build-system]
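
A small sketch for verifying the dependency swap in the running environment
(importlib.metadata is standard library; the names are the PyPI packages
added above):

    from importlib.metadata import PackageNotFoundError, version

    for pkg in ("django-elasticsearch-dsl", "elasticsearch-dsl", "numpy"):
        try:
            print(pkg, version(pkg))
        except PackageNotFoundError:
            print(pkg, "not installed")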