updated neuro and similarity search, moved to elastic 8

This commit is contained in:
Alexander Karpov 2023-11-20 22:22:19 +03:00
parent 356476217d
commit 5b457b3668
9 changed files with 1723 additions and 1950 deletions

View File

@ -0,0 +1,31 @@
from django_elasticsearch_dsl import Document
from django_elasticsearch_dsl.registries import registry
from akarpov.files.models import File
@registry.register_document
class FileDocument(Document):
    """Elasticsearch document for ``File`` objects.

    Indexes the ``name``, ``description`` and ``content`` model fields into
    the ``files`` index.
    """

    class Index:
        name = "files"
        # One primary shard and no replicas.
        settings = {"number_of_shards": 1, "number_of_replicas": 0}

    class Django:
        model = File
        fields = [
            "name",
            "description",
            "content",
        ]

    def prepare_description(self, instance):
        """Return the description to index, using "" when it is missing."""
        return instance.description or ""

    def prepare_content(self, instance):
        """Return the content to index as text.

        A missing content is indexed as "" (mirroring ``prepare_description``)
        and ``bytes`` content is decoded as UTF-8.  Undecodable bytes are
        replaced instead of raising, so a single malformed file cannot abort
        an indexing run.
        """
        content = instance.content
        if content is None:
            return ""
        if isinstance(content, bytes):
            return content.decode("utf-8", errors="replace")
        return content

View File

@ -1,17 +0,0 @@
from haystack import indexes
from .models import File
class FileIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over ``File`` objects."""

    # Primary document field; its value is rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Plain model attributes, each falling back to an empty string.
    name = indexes.CharField(model_attr="name", default="")
    description = indexes.CharField(model_attr="description", default="")
    content = indexes.CharField(model_attr="content", default="")

    def get_model(self):
        """Return the model class covered by this index."""
        return File

    def index_queryset(self, using=None):
        """Return the queryset used for indexing: every object of the model."""
        model = self.get_model()
        return model.objects.all()

View File

@ -4,11 +4,13 @@
from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import F, Func, Q, QuerySet
from haystack.query import SearchQuerySet
from django.db.models import Case, F, FloatField, Func, Q, QuerySet, Value, When
from django.db.models.functions import Coalesce
from elasticsearch_dsl import Q as ES_Q
from akarpov.files.models import File
from ..documents import FileDocument
from .lema import lemmatize_and_remove_stopwords
@ -16,17 +18,55 @@ class BaseSearch:
def __init__(self, queryset: QuerySet | None = None):
self.queryset: QuerySet | None = queryset
def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
def search(self, query: str) -> QuerySet | list[File]:
raise NotImplementedError("Subclasses must implement this method")
class NeuroSearch(BaseSearch):
def search(self, query: str) -> SearchQuerySet:
    """Return haystack results whose content, name or description match ``query``."""
    # Start from the content filter, then OR in the other two fields.
    return (
        SearchQuerySet()
        .filter(content=query)
        .filter_or(name=query)
        .filter_or(description=query)
    )
def search(self, query: str) -> QuerySet:
    """Search files in Elasticsearch and return them as an ordered queryset.

    ``query`` is matched against ``name``, ``description`` and ``content``
    with a ``multi_match`` clause, a phrase-prefix clause on ``name`` and
    substring wildcards on all three fields; matching any single clause is
    enough.  The ids of the hits are then used to filter ``self.queryset``,
    ordered by each id's position in the Elasticsearch response.

    Returns:
        The filtered, ordered queryset, or ``self.queryset.none()`` when
        Elasticsearch returns no hits.

    Raises:
        ValueError: if the searcher was created without a queryset.
    """
    # Compare with None explicitly: truth-testing a QuerySet evaluates it and
    # would wrongly reject a legitimately empty queryset.
    if self.queryset is None:
        raise ValueError("Queryset cannot be None for search")

    # "*", "?" and backslash are operators inside a wildcard pattern; escape
    # them so user input is matched literally.
    escaped = query.translate(
        str.maketrans({"\\": "\\\\", "*": "\\*", "?": "\\?"})
    )
    pattern = f"*{escaped}*"

    search_query = ES_Q(
        "bool",
        should=[
            ES_Q(
                "multi_match",
                query=query,
                fields=["name", "description", "content"],
                type="best_fields",
            ),
            ES_Q("match_phrase_prefix", name=query),
            ES_Q("wildcard", name=pattern),
            ES_Q("wildcard", description=pattern),
            ES_Q("wildcard", content=pattern),
        ],
        minimum_should_match=1,
    )

    # NOTE(review): no slice is applied to the search, so the result is
    # bounded by Elasticsearch's default page size — confirm this is intended.
    response = FileDocument.search().query(search_query).execute()

    # No hits: hand back an empty queryset of the same model.
    if not response.hits:
        return self.queryset.none()

    hit_ids = [hit.meta.id for hit in response.hits]

    # Map every id to its rank in the response so the database ordering
    # follows the Elasticsearch ordering.
    preserved_order = Case(
        *[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)]
    )
    return self.queryset.filter(pk__in=hit_ids).order_by(preserved_order)
class CaseSensitiveSearch(BaseSearch):
@ -89,28 +129,28 @@ def as_sql(self, compiler, connection):
class SimilaritySearch(BaseSearch):
def __init__(self, queryset: QuerySet[File] | None = None):
super().__init__(queryset)
def search(self, query: str) -> QuerySet[File]:
if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search")
# Detect language and preprocess the query
language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
# Annotate the queryset with similarity scores for each field
queryset = (
self.queryset.annotate(
name_similarity=TrigramSimilarity(
UnaccentLower("name"), filtered_query
name_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("name"), filtered_query),
Value(0),
output_field=FloatField(),
),
description_similarity=TrigramSimilarity(
UnaccentLower("description"), filtered_query
description_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("description"), filtered_query),
Value(0),
output_field=FloatField(),
),
content_similarity=TrigramSimilarity(
UnaccentLower("content"), filtered_query
content_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("content"), filtered_query),
Value(0),
output_field=FloatField(),
),
)
.annotate(
@ -119,17 +159,9 @@ def search(self, query: str) -> QuerySet[File]:
+ F("description_similarity")
+ F("content_similarity")
)
/ 3
)
.filter(combined_similarity__gt=0.1)
.order_by("-combined_similarity")
)
print(filtered_query)
print(queryset.query)
print(
queryset.values(
"name_similarity", "description_similarity", "content_similarity"
)
)
return queryset

View File

@ -3,9 +3,8 @@
import structlog
from celery import shared_task
from django.core import management
from django.core.files import File
from haystack.management.commands import rebuild_index, update_index
from haystack.query import SearchQuerySet
from akarpov.files.models import File as FileModel
from akarpov.files.services.preview import create_preview, get_file_mimetype
@ -39,25 +38,10 @@ def process_file(pk: int):
return pk
@shared_task
def task_rebuild_index():
    """Rebuild the haystack index, then log the elapsed time and indexed count."""
    started_at = time.time()
    rebuild_index.Command().handle(interactive=False)
    # Take the duration before counting so the count query is not timed.
    elapsed = time.time() - started_at
    logger.info(
        "index_rebuild_finished",
        duration=elapsed,
        indexed_count=SearchQuerySet().all().count(),
    )
@shared_task
def update_index_task():
    """Refresh the search index and log how long the run took.

    NOTE(review): both the haystack ``update_index`` call and the
    ``search_index --rebuild`` management command appear in this body —
    confirm that only the intended one remains in the file.
    """
    start_time = time.time()
    # Haystack index update, run non-interactively.
    update_index.Command().handle(interactive=False)
    # Rebuild through the ``search_index`` management command; ``-f`` is
    # presumably the "force / no prompt" flag — TODO confirm.
    management.call_command("search_index", "--rebuild", "-f")
    end_time = time.time()
    duration = end_time - start_time
    logger.info("update_index_completed", duration=duration)

View File

@ -70,7 +70,7 @@ def filter(self, queryset):
class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
template_name = "files/list.html"
paginate_by = 18
paginate_by = 38
model = BaseFileItem
def get_context_data(self, **kwargs):

View File

@ -66,7 +66,7 @@
"blog.post": {"ops": ("fetch", "get"), "timeout": 20 * 15},
"themes.theme": {"ops": ("fetch", "get"), "timeout": 60 * 60},
"gallery.*": {"ops": "all", "timeout": 60 * 15},
"files.*": {"ops": "all", "timeout": 60 * 5},
"files.*": {"ops": ("fetch", "get"), "timeout": 60 * 5},
"auth.permission": {"ops": "all", "timeout": 60 * 15},
}
CACHEOPS_REDIS = env.str("REDIS_URL")
@ -133,7 +133,7 @@
"django_filters",
"django_tables2",
"location_field",
"haystack",
"django_elasticsearch_dsl",
]
HEALTH_CHECKS = [
@ -434,13 +434,9 @@
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
CELERY_BEAT_SCHEDULE = {
"update-index-every-hour": {
"task": "akarpov.files.tasks.task_rebuild_index",
"task": "akarpov.files.tasks.update_index_task",
"schedule": crontab(minute="0"),
},
"rebuild-index-every-day": {
"task": "akarpov.files.tasks.task_rebuild_index",
"schedule": crontab(hour="2", minute="0", day_of_week="*"),
},
}
@ -611,14 +607,8 @@
)
# HAYSTACK
# ELASTICSEARCH
# ------------------------------------------------------------------------------
HAYSTACK_CONNECTIONS = {
"default": {
"ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
"URL": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
"INDEX_NAME": "haystack",
"TIMEOUT": 60 * 5,
"BATCH_SIZE": 100,
},
ELASTICSEARCH_DSL = {
"default": {"hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/")},
}

View File

@ -100,12 +100,12 @@ services:
command: /start-flower
elasticsearch:
image: elasticsearch:7.17.14
image: elasticsearch:8.11.1
container_name: elasticsearch
environment:
- cluster.name=docker-cluster
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
# Initial (-Xms) and maximum (-Xmx) heap must agree: the JVM refuses to start when -Xms exceeds -Xmx.
- "ES_JAVA_OPTS=-Xms6g -Xmx6g"
ulimits:
memlock:
soft: -1

3483
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -107,13 +107,15 @@ pgvector = "^0.2.2"
pycld2 = "^0.41"
textract = "^1.6.5"
uuid6 = "^2023.5.2"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
uvicorn = "^0.24.0.post1"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
fastapi = "^0.104.1"
pydantic-settings = "^2.0.3"
django-elasticsearch-dsl = "^8.0"
elasticsearch-dsl = "^8.11.0"
numpy = "1.25.2"
[build-system]