Compare commits

..

No commits in common. "6fb08d6569a1294dab194b3d860dff8d4c9ddfb4" and "356476217d47c17018a4e75afa1ac8531a186f10" have entirely different histories.

9 changed files with 1958 additions and 1728 deletions

View File

@@ -1,31 +0,0 @@
from django_elasticsearch_dsl import Document
from django_elasticsearch_dsl.registries import registry
from akarpov.files.models import File
@registry.register_document
class FileDocument(Document):
    """Elasticsearch document mapping for the ``File`` model."""

    class Index:
        # Single-node friendly settings: one shard, no replicas.
        name = "files"
        settings = {"number_of_shards": 1, "number_of_replicas": 0}

    class Django:
        model = File
        fields = [
            "name",
            "description",
            "content",
        ]

    def prepare_description(self, instance):
        """Value indexed for ``description``; falsy values become ''."""
        # Called for every instance before indexing.
        description = instance.description
        if description:
            return description
        return ""

    def prepare_content(self, instance):
        """Value indexed for ``content``; bytes are decoded as UTF-8."""
        # Called for every instance before indexing.
        content = instance.content
        if isinstance(content, bytes):
            return content.decode("utf-8")
        return content

View File

@@ -0,0 +1,17 @@
from haystack import indexes
from .models import File
class FileIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index over ``File`` objects."""

    # Primary document field; its content is rendered from the index template.
    text = indexes.CharField(document=True, use_template=True)
    name = indexes.CharField(model_attr="name", default="")
    description = indexes.CharField(model_attr="description", default="")
    content = indexes.CharField(model_attr="content", default="")

    def get_model(self):
        """Model class this index covers."""
        return File

    def index_queryset(self, using=None):
        """Queryset used when (re)building the index: every File row."""
        model = self.get_model()
        return model.objects.all()

View File

@@ -4,13 +4,11 @@
from django.conf import settings
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Case, F, FloatField, Func, Q, QuerySet, Value, When
from django.db.models.functions import Coalesce
from elasticsearch_dsl import Q as ES_Q
from django.db.models import F, Func, Q, QuerySet
from haystack.query import SearchQuerySet
from akarpov.files.models import File
from ..documents import FileDocument
from .lema import lemmatize_and_remove_stopwords
@@ -18,55 +16,17 @@ class BaseSearch:
def __init__(self, queryset: QuerySet | None = None):
self.queryset: QuerySet | None = queryset
def search(self, query: str) -> QuerySet | list[File]:
def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
raise NotImplementedError("Subclasses must implement this method")
class NeuroSearch(BaseSearch):
def search(self, query: str):
if not self.queryset:
raise ValueError("Queryset cannot be None for search")
# Perform the Elasticsearch query using a combination of match, match_phrase_prefix, and wildcard queries
search = FileDocument.search()
search_query = ES_Q(
"bool",
should=[
ES_Q(
"multi_match",
query=query,
fields=["name", "description", "content"],
type="best_fields",
),
ES_Q("match_phrase_prefix", name=query),
ES_Q("wildcard", name=f"*{query}*"),
ES_Q("wildcard", description=f"*{query}*"),
ES_Q("wildcard", content=f"*{query}*"),
],
minimum_should_match=1,
)
search = search.query(search_query)
# Execute the search to get the results
response = search.execute()
# Check if there are hits, if not return an empty queryset
if not response.hits:
return self.queryset.none()
# Collect the IDs of the hits
hit_ids = [hit.meta.id for hit in response.hits]
# Use the hit IDs to filter the queryset and preserve the order
preserved_order = Case(
*[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)]
)
relevant_queryset = self.queryset.filter(pk__in=hit_ids).order_by(
preserved_order
)
return relevant_queryset
def search(self, query: str) -> SearchQuerySet:
# Search across multiple fields
sqs: SearchQuerySet = SearchQuerySet().filter(content=query)
sqs = sqs.filter_or(name=query)
sqs = sqs.filter_or(description=query)
return sqs
class CaseSensitiveSearch(BaseSearch):
@@ -129,28 +89,28 @@ def as_sql(self, compiler, connection):
class SimilaritySearch(BaseSearch):
def __init__(self, queryset: QuerySet[File] | None = None):
super().__init__(queryset)
def search(self, query: str) -> QuerySet[File]:
if self.queryset is None:
raise ValueError("Queryset cannot be None for similarity search")
# Detect language and preprocess the query
language = "russian" if re.search("[а-яА-Я]", query) else "english"
filtered_query = lemmatize_and_remove_stopwords(query, language=language)
# Annotate the queryset with similarity scores for each field
queryset = (
self.queryset.annotate(
name_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("name"), filtered_query),
Value(0),
output_field=FloatField(),
name_similarity=TrigramSimilarity(
UnaccentLower("name"), filtered_query
),
description_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("description"), filtered_query),
Value(0),
output_field=FloatField(),
description_similarity=TrigramSimilarity(
UnaccentLower("description"), filtered_query
),
content_similarity=Coalesce(
TrigramSimilarity(UnaccentLower("content"), filtered_query),
Value(0),
output_field=FloatField(),
content_similarity=TrigramSimilarity(
UnaccentLower("content"), filtered_query
),
)
.annotate(
@@ -159,9 +119,17 @@ def search(self, query: str) -> QuerySet[File]:
+ F("description_similarity")
+ F("content_similarity")
)
/ 3
)
.filter(combined_similarity__gt=0.1)
.order_by("-combined_similarity")
)
print(filtered_query)
print(queryset.query)
print(
queryset.values(
"name_similarity", "description_similarity", "content_similarity"
)
)
return queryset

View File

@@ -3,8 +3,9 @@
import structlog
from celery import shared_task
from django.core import management
from django.core.files import File
from haystack.management.commands import rebuild_index, update_index
from haystack.query import SearchQuerySet
from akarpov.files.models import File as FileModel
from akarpov.files.services.preview import create_preview, get_file_mimetype
@@ -38,10 +39,25 @@ def process_file(pk: int):
return pk
@shared_task
def task_rebuild_index():
    """Rebuild the Haystack search index, logging duration and result size."""
    started = time.time()
    # Non-interactive so the Celery worker never blocks on a confirmation prompt.
    rebuild_index.Command().handle(interactive=False)
    elapsed = time.time() - started
    total_indexed = SearchQuerySet().all().count()
    logger.info(
        "index_rebuild_finished", duration=elapsed, indexed_count=total_indexed
    )
@shared_task
def update_index_task():
start_time = time.time()
management.call_command("search_index", "--rebuild", "-f")
update_index.Command().handle(interactive=False)
end_time = time.time()
duration = end_time - start_time
logger.info("update_index_completed", duration=duration)

View File

@@ -70,7 +70,7 @@ def filter(self, queryset):
class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
template_name = "files/list.html"
paginate_by = 38
paginate_by = 18
model = BaseFileItem
def get_context_data(self, **kwargs):

View File

@@ -66,7 +66,7 @@
"blog.post": {"ops": ("fetch", "get"), "timeout": 20 * 15},
"themes.theme": {"ops": ("fetch", "get"), "timeout": 60 * 60},
"gallery.*": {"ops": "all", "timeout": 60 * 15},
"files.*": {"ops": ("fetch", "get"), "timeout": 60 * 5},
"files.*": {"ops": "all", "timeout": 60 * 5},
"auth.permission": {"ops": "all", "timeout": 60 * 15},
}
CACHEOPS_REDIS = env.str("REDIS_URL")
@@ -133,7 +133,7 @@
"django_filters",
"django_tables2",
"location_field",
"django_elasticsearch_dsl",
"haystack",
]
HEALTH_CHECKS = [
@@ -434,9 +434,13 @@
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
CELERY_BEAT_SCHEDULE = {
"update-index-every-hour": {
"task": "akarpov.files.tasks.update_index_task",
"task": "akarpov.files.tasks.task_rebuild_index",
"schedule": crontab(minute="0"),
},
"rebuild-index-every-day": {
"task": "akarpov.files.tasks.task_rebuild_index",
"schedule": crontab(hour="2", minute="0", day_of_week="*"),
},
}
@@ -607,8 +611,14 @@
)
# ELASTICSEARCH
# HAYSTACK
# ------------------------------------------------------------------------------
ELASTICSEARCH_DSL = {
"default": {"hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/")},
HAYSTACK_CONNECTIONS = {
"default": {
"ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
"URL": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
"INDEX_NAME": "haystack",
"TIMEOUT": 60 * 5,
"BATCH_SIZE": 100,
},
}

View File

@@ -100,15 +100,18 @@ services:
command: /start-flower
elasticsearch:
image: elasticsearch:8.8.0
image: elasticsearch:7.17.14
container_name: elasticsearch
environment:
- cluster.name=docker-cluster
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- akarpov_local_elasticsearch_data:/usr/share/elasticsearch/data
ports:
- "9200:9200"
- "9300:9300"
environment:
- node.name=activity
- discovery.type=single-node
- cluster.name=ws-es-data-cluster
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms4g -Xmx4g"
volumes:
- akarpov_local_elasticsearch_data:/usr/share/elasticsearch/data

3479
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -107,15 +100,13 @@ pgvector = "^0.2.2"
pycld2 = "^0.41"
textract = "^1.6.5"
uuid6 = "^2023.5.2"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
uvicorn = "^0.24.0.post1"
nltk = "^3.8.1"
pymorphy3 = "^1.2.1"
pymorphy3-dicts-ru = "^2.4.417150.4580142"
fastapi = "^0.104.1"
pydantic-settings = "^2.0.3"
django-elasticsearch-dsl = "^8.0"
elasticsearch-dsl = "^8.11.0"
numpy = "1.25.2"
[build-system]