Compare commits

...

2 Commits

9 changed files with 1730 additions and 1960 deletions

View File

@@ -0,0 +1,31 @@
+from django_elasticsearch_dsl import Document
+from django_elasticsearch_dsl.registries import registry
+
+from akarpov.files.models import File
+
+
+@registry.register_document
+class FileDocument(Document):
+    class Index:
+        name = "files"
+        settings = {"number_of_shards": 1, "number_of_replicas": 0}
+
+    class Django:
+        model = File
+        fields = [
+            "name",
+            "description",
+            "content",
+        ]
+
+    def prepare_description(self, instance):
+        # This method is called for every instance before indexing
+        return instance.description or ""
+
+    def prepare_content(self, instance):
+        # This method is called for every instance before indexing
+        return (
+            instance.content.decode("utf-8")
+            if isinstance(instance.content, bytes)
+            else instance.content
+        )
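
A quick usage sketch for the document above (illustrative only; it assumes the index has already been built with the search_index --rebuild management command):

from akarpov.files.documents import FileDocument

# Full-text match against the indexed "name" field
results = FileDocument.search().query("match", name="report").execute()
for hit in results:
    print(hit.meta.id, hit.name)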

View File

@@ -1,17 +0,0 @@
-from haystack import indexes
-
-from .models import File
-
-
-class FileIndex(indexes.SearchIndex, indexes.Indexable):
-    text = indexes.CharField(document=True, use_template=True)
-    name = indexes.CharField(model_attr="name", default="")
-    description = indexes.CharField(model_attr="description", default="")
-    content = indexes.CharField(model_attr="content", default="")
-
-    def get_model(self):
-        return File
-
-    def index_queryset(self, using=None):
-        # Return the default queryset to be used for indexing.
-        return self.get_model().objects.all()

View File

@@ -4,11 +4,13 @@
 from django.conf import settings
 from django.contrib.postgres.search import TrigramSimilarity
-from django.db.models import F, Func, Q, QuerySet
-from haystack.query import SearchQuerySet
+from django.db.models import Case, F, FloatField, Func, Q, QuerySet, Value, When
+from django.db.models.functions import Coalesce
+from elasticsearch_dsl import Q as ES_Q

 from akarpov.files.models import File
+from ..documents import FileDocument
 from .lema import lemmatize_and_remove_stopwords
@@ -16,17 +18,55 @@ class BaseSearch:
     def __init__(self, queryset: QuerySet | None = None):
         self.queryset: QuerySet | None = queryset

-    def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
+    def search(self, query: str) -> QuerySet | list[File]:
         raise NotImplementedError("Subclasses must implement this method")


 class NeuroSearch(BaseSearch):
-    def search(self, query: str) -> SearchQuerySet:
-        # Search across multiple fields
-        sqs: SearchQuerySet = SearchQuerySet().filter(content=query)
-        sqs = sqs.filter_or(name=query)
-        sqs = sqs.filter_or(description=query)
-        return sqs
+    def search(self, query: str):
+        if not self.queryset:
+            raise ValueError("Queryset cannot be None for search")
+
+        # Perform the Elasticsearch query using a combination of match, match_phrase_prefix, and wildcard queries
+        search = FileDocument.search()
+        search_query = ES_Q(
+            "bool",
+            should=[
+                ES_Q(
+                    "multi_match",
+                    query=query,
+                    fields=["name", "description", "content"],
+                    type="best_fields",
+                ),
+                ES_Q("match_phrase_prefix", name=query),
+                ES_Q("wildcard", name=f"*{query}*"),
+                ES_Q("wildcard", description=f"*{query}*"),
+                ES_Q("wildcard", content=f"*{query}*"),
+            ],
+            minimum_should_match=1,
+        )
+        search = search.query(search_query)
+
+        # Execute the search to get the results
+        response = search.execute()
+
+        # Check if there are hits, if not return an empty queryset
+        if not response.hits:
+            return self.queryset.none()
+
+        # Collect the IDs of the hits
+        hit_ids = [hit.meta.id for hit in response.hits]
+
+        # Use the hit IDs to filter the queryset and preserve the order
+        preserved_order = Case(
+            *[When(pk=pk, then=pos) for pos, pk in enumerate(hit_ids)]
+        )
+        relevant_queryset = self.queryset.filter(pk__in=hit_ids).order_by(
+            preserved_order
+        )
+
+        return relevant_queryset


 class CaseSensitiveSearch(BaseSearch):
@@ -89,28 +129,28 @@ def as_sql(self, compiler, connection):

 class SimilaritySearch(BaseSearch):
+    def __init__(self, queryset: QuerySet[File] | None = None):
+        super().__init__(queryset)
+
     def search(self, query: str) -> QuerySet[File]:
         if self.queryset is None:
             raise ValueError("Queryset cannot be None for similarity search")

+        # Detect language and preprocess the query
         language = "russian" if re.search("[а-яА-Я]", query) else "english"
         filtered_query = lemmatize_and_remove_stopwords(query, language=language)

+        # Annotate the queryset with similarity scores for each field
         queryset = (
             self.queryset.annotate(
-                name_similarity=TrigramSimilarity(
-                    UnaccentLower("name"), filtered_query
+                name_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("name"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                description_similarity=TrigramSimilarity(
-                    UnaccentLower("description"), filtered_query
+                description_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("description"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
-                content_similarity=TrigramSimilarity(
-                    UnaccentLower("content"), filtered_query
+                content_similarity=Coalesce(
+                    TrigramSimilarity(UnaccentLower("content"), filtered_query),
+                    Value(0),
+                    output_field=FloatField(),
                 ),
             )
             .annotate(
@@ -119,17 +159,9 @@ def search(self, query: str) -> QuerySet[File]:
                     + F("description_similarity")
                     + F("content_similarity")
                 )
-                / 3
             )
             .filter(combined_similarity__gt=0.1)
             .order_by("-combined_similarity")
         )
-        print(filtered_query)
-        print(queryset.query)
-        print(
-            queryset.values(
-                "name_similarity", "description_similarity", "content_similarity"
-            )
-        )
         return queryset
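
The Coalesce wrapping above is what keeps rows with empty fields rankable: in Postgres, TrigramSimilarity against a NULL column yields NULL, the NULL propagates through the combined sum, and the final filter would then silently drop otherwise-relevant rows. A minimal standalone sketch of the same pattern, using a hypothetical Book model purely for illustration:

from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import FloatField, Value
from django.db.models.functions import Coalesce

# Book is a hypothetical model with a nullable "title" column
qs = Book.objects.annotate(
    title_sim=Coalesce(
        TrigramSimilarity("title", "dune"),  # NULL when title is NULL
        Value(0),
        output_field=FloatField(),
    )
).order_by("-title_sim")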

View File

@@ -3,9 +3,8 @@
 import structlog
 from celery import shared_task
+from django.core import management
 from django.core.files import File
-from haystack.management.commands import rebuild_index, update_index
-from haystack.query import SearchQuerySet

 from akarpov.files.models import File as FileModel
 from akarpov.files.services.preview import create_preview, get_file_mimetype
@@ -39,25 +38,10 @@ def process_file(pk: int):
     return pk


-@shared_task
-def task_rebuild_index():
-    start_time = time.time()
-    rebuild_index.Command().handle(interactive=False)
-    end_time = time.time()
-    duration = end_time - start_time
-    indexed_count = SearchQuerySet().all().count()
-    logger.info(
-        "index_rebuild_finished", duration=duration, indexed_count=indexed_count
-    )
-
-
 @shared_task
 def update_index_task():
     start_time = time.time()
-    update_index.Command().handle(interactive=False)
+    management.call_command("search_index", "--rebuild", "-f")
     end_time = time.time()
     duration = end_time - start_time
     logger.info("update_index_completed", duration=duration)

View File

@@ -70,7 +70,7 @@ def filter(self, queryset):
 class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
     template_name = "files/list.html"
-    paginate_by = 18
+    paginate_by = 38
     model = BaseFileItem

     def get_context_data(self, **kwargs):

View File

@@ -66,7 +66,7 @@
     "blog.post": {"ops": ("fetch", "get"), "timeout": 20 * 15},
     "themes.theme": {"ops": ("fetch", "get"), "timeout": 60 * 60},
     "gallery.*": {"ops": "all", "timeout": 60 * 15},
-    "files.*": {"ops": "all", "timeout": 60 * 5},
+    "files.*": {"ops": ("fetch", "get"), "timeout": 60 * 5},
     "auth.permission": {"ops": "all", "timeout": 60 * 15},
 }
 CACHEOPS_REDIS = env.str("REDIS_URL")
@@ -133,7 +133,7 @@
     "django_filters",
     "django_tables2",
     "location_field",
-    "haystack",
+    "django_elasticsearch_dsl",
 ]

 HEALTH_CHECKS = [
@@ -434,13 +434,9 @@
 CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
 CELERY_BEAT_SCHEDULE = {
     "update-index-every-hour": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
+        "task": "akarpov.files.tasks.update_index_task",
         "schedule": crontab(minute="0"),
     },
-    "rebuild-index-every-day": {
-        "task": "akarpov.files.tasks.task_rebuild_index",
-        "schedule": crontab(hour="2", minute="0", day_of_week="*"),
-    },
 }
@@ -611,14 +607,8 @@
 )

-# HAYSTACK
+# ELASTICSEARCH
 # ------------------------------------------------------------------------------
-HAYSTACK_CONNECTIONS = {
-    "default": {
-        "ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
-        "URL": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
-        "INDEX_NAME": "haystack",
-        "TIMEOUT": 60 * 5,
-        "BATCH_SIZE": 100,
-    },
-}
+ELASTICSEARCH_DSL = {
+    "default": {"hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/")},
+}
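
ELASTICSEARCH_DSL is the settings dict django_elasticsearch_dsl reads to set up its connections; extra keys in a connection's dict are passed through to the underlying elasticsearch-py client. A hedged sketch of a variant with an explicit timeout (request_timeout is the elasticsearch-py 8.x parameter name; verify against the client version pinned in poetry.lock):

ELASTICSEARCH_DSL = {
    "default": {
        "hosts": env("ELASTIC_SEARCH", default="http://127.0.0.1:9200/"),
        "request_timeout": 30,  # forwarded to elasticsearch.Elasticsearch(...)
    },
}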

View File

@@ -100,18 +100,15 @@ services:
     command: /start-flower

   elasticsearch:
-    image: elasticsearch:7.17.14
-    container_name: elasticsearch
-    environment:
-      - cluster.name=docker-cluster
-      - bootstrap.memory_lock=true
-      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
-    ulimits:
-      memlock:
-        soft: -1
-        hard: -1
-    volumes:
-      - akarpov_local_elasticsearch_data:/usr/share/elasticsearch/data
+    image: elasticsearch:8.8.0
     ports:
       - "9200:9200"
       - "9300:9300"
+    environment:
+      - node.name=activity
+      - discovery.type=single-node
+      - cluster.name=ws-es-data-cluster
+      - bootstrap.memory_lock=true
+      - "ES_JAVA_OPTS=-Xms4g -Xmx4g"
+    volumes:
+      - akarpov_local_elasticsearch_data:/usr/share/elasticsearch/data
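
One caveat with the image bump: Elasticsearch 8.x containers enable TLS and authentication by default on first start, so the plain-http setup above may also need xpack.security.enabled=false added to the environment list. A minimal connectivity check once the service is up, assuming the port mapping above:

from elasticsearch_dsl import connections

# Create the default connection and ask the cluster for its version
conn = connections.create_connection(hosts=["http://127.0.0.1:9200"])
print(conn.info()["version"]["number"])  # expect "8.8.0" with the image above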

poetry.lock (generated, 3483 changed lines)

File diff suppressed because it is too large

View File

@@ -107,13 +107,15 @@ pgvector = "^0.2.2"
 pycld2 = "^0.41"
 textract = "^1.6.5"
 uuid6 = "^2023.5.2"
-django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
 uvicorn = "^0.24.0.post1"
 nltk = "^3.8.1"
 pymorphy3 = "^1.2.1"
 pymorphy3-dicts-ru = "^2.4.417150.4580142"
 fastapi = "^0.104.1"
 pydantic-settings = "^2.0.3"
+django-elasticsearch-dsl = "^8.0"
+elasticsearch-dsl = "^8.11.0"
+numpy = "1.25.2"

 [build-system]