major file updates, added search engine

This commit is contained in:
Alexander Karpov 2023-11-07 03:43:05 +03:00
parent 3f844bbca1
commit 4df9bfb2ec
21 changed files with 489 additions and 338 deletions

View File

View File

@ -0,0 +1,5 @@
from rest_framework import serializers
class StatusSerializer(serializers.Serializer):
    """Schema of the ping response: a single ``status`` string."""

    # Falls back to "pong" when no value is supplied.
    status = serializers.CharField(default="pong")

View File

@ -0,0 +1,7 @@
from django.urls import path
from akarpov.about.api.views import PingAPIView
# Application namespace used when reversing these routes.
app_name = "about"

# The only route in this module: the ping endpoint.
urlpatterns = [
    path("ping", PingAPIView.as_view(), name="ping"),
]

View File

@ -0,0 +1,11 @@
from rest_framework import generics, permissions, response
from akarpov.about.api.serializers import StatusSerializer
class PingAPIView(generics.GenericAPIView):
    """Ping endpoint that answers every GET with ``{"status": "pong"}``.

    ``AllowAny`` is set, so no permission check restricts access.
    """

    permission_classes = [permissions.AllowAny]
    serializer_class = StatusSerializer

    def get(self, request):
        """Return the fixed "pong" payload."""
        payload = {"status": "pong"}
        return response.Response(data=payload)

View File

@ -0,0 +1,25 @@
# Generated by Django 4.2.6 on 2023-11-06 21:23
from django.db import migrations, models
class Migration(migrations.Migration):
    """Remove ``embeddings`` and ``lang`` from ``file`` and redefine ``content``."""

    dependencies = [("files", "0028_file_content_file_lang")]

    operations = [
        # Drop the two columns that are no longer part of the model.
        migrations.RemoveField(model_name="file", name="embeddings"),
        migrations.RemoveField(model_name="file", name="lang"),
        # ``content`` becomes a plain TextField with no extra options.
        migrations.AlterField(
            model_name="file",
            name="content",
            field=models.TextField(),
        ),
    ]

View File

@ -0,0 +1,19 @@
# Generated by Django 4.2.6 on 2023-11-06 21:23
from django.db import migrations
class Migration(migrations.Migration):
    """Enable the ``unaccent`` and ``pg_trgm`` PostgreSQL extensions.

    Reversing the migration drops both extensions again.
    """

    dependencies = [
        ("files", "0029_remove_file_embeddings_remove_file_lang_and_more"),
    ]

    operations = [
        # Accent-stripping function.
        migrations.RunSQL(
            sql="CREATE EXTENSION IF NOT EXISTS unaccent;",
            reverse_sql="DROP EXTENSION unaccent;",
        ),
        # Trigram similarity functions and operators.
        migrations.RunSQL(
            sql="CREATE EXTENSION IF NOT EXISTS pg_trgm;",
            reverse_sql="DROP EXTENSION pg_trgm;",
        ),
    ]

View File

@ -17,7 +17,6 @@
from django.urls import reverse
from model_utils.fields import AutoCreatedField, AutoLastModifiedField
from model_utils.models import TimeStampedModel
from pgvector.django import VectorField
from polymorphic.models import PolymorphicModel
from akarpov.files.services.files import trash_file_upload, user_unique_file_upload
@ -70,9 +69,8 @@ class File(BaseFileItem, TimeStampedModel, ShortLinkModel, UserHistoryModel):
preview = FileField(blank=True, upload_to="file/previews/")
file_obj = FileField(blank=False, upload_to=user_unique_file_upload)
embeddings = VectorField(dimensions=768, null=True)
content = TextField(max_length=10000)
lang = CharField(max_length=2, choices=[("ru", "ru"), ("en", "en")])
content = TextField()
# lang = CharField(max_length=2, choices=[("ru", "ru"), ("en", "en")])
# meta
name = CharField(max_length=255, null=True, blank=True)

View File

@ -0,0 +1,17 @@
from haystack import indexes
from .models import File
class FileIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index describing how ``File`` objects are indexed."""

    # Main document field; its content is rendered from a template.
    text = indexes.CharField(document=True, use_template=True)

    # Model attributes indexed individually, each defaulting to "".
    name = indexes.CharField(model_attr="name", default="")
    description = indexes.CharField(model_attr="description", default="")
    content = indexes.CharField(model_attr="content", default="")

    def get_model(self):
        """Return the model class covered by this index."""
        return File

    def index_queryset(self, using=None):
        """Return all objects of the indexed model as the set to index."""
        model = self.get_model()
        return model.objects.all()

View File

@ -0,0 +1,101 @@
import os
import re
from typing import BinaryIO
from django.conf import settings
from django.contrib.postgres.lookups import Unaccent
from django.contrib.postgres.search import TrigramSimilarity
from django.db.models import Q, QuerySet
from haystack.query import SearchQuerySet
from akarpov.files.models import File
class BaseSearch:
    """Common interface shared by the file search strategies."""

    def __init__(self, queryset: QuerySet | None = None):
        """Remember the optional queryset a strategy may search within."""
        self.queryset: QuerySet | None = queryset

    def search(self, query: str) -> QuerySet | SearchQuerySet | list[File]:
        """Run the search for ``query``; every concrete strategy overrides this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement this method")
class NeuroSearch(BaseSearch):
    """Search the Haystack index across content, name and description."""

    def search(self, query: str) -> SearchQuerySet:
        """Return index hits where content, name or description matches ``query``.

        NOTE(review): ``self.queryset`` is not consulted here, so the hits are
        not restricted to it -- confirm that this is intended.
        """
        return (
            SearchQuerySet()
            .filter(content=query)
            .filter_or(name=query)
            .filter_or(description=query)
        )
class CaseSensitiveSearch(BaseSearch):
    """Case-sensitive literal search over name, description and content."""

    def search(self, query: str) -> QuerySet[File]:
        """Return rows of ``self.queryset`` containing ``query`` verbatim.

        Raises:
            ValueError: if no queryset was supplied.
        """
        if self.queryset is None:
            raise ValueError("Queryset cannot be None for text search")

        # Escaping turns the user input into a literal pattern, so regex
        # metacharacters in the query are matched as plain characters.
        pattern = re.escape(query)

        condition = Q(name__regex=pattern)
        condition |= Q(description__regex=pattern)
        condition |= Q(content__regex=pattern)
        return self.queryset.filter(condition)
class ByteSearch(BaseSearch):
    """Search the raw bytes of stored files for a hex-encoded byte sequence."""

    def search(self, hex_query: str) -> list[File]:
        """Return the files of ``self.queryset`` whose bytes contain the query.

        Args:
            hex_query: The byte sequence to look for, written as hex digits.

        Returns:
            The matching file objects. The list is empty when ``hex_query`` is
            not valid hex, decodes to zero bytes, or no queryset was supplied.
        """
        try:
            byte_query: bytes = bytes.fromhex(hex_query)
        except ValueError:
            # Not valid hex: nothing can match.
            return []

        # ``bytes.fromhex`` accepts whitespace-only input and yields b"", which
        # is "contained" in every file; treat that as an empty search instead
        # of returning every file.
        if not byte_query or self.queryset is None:
            return []

        matching_files: list[File] = []
        for file_item in self.queryset:
            file_path: str = file_item.file.path
            full_path: str = os.path.join(settings.MEDIA_ROOT, file_path)
            # Rows whose file is missing on disk are skipped silently.
            if os.path.exists(full_path):
                with open(full_path, "rb") as file:
                    if self._byte_search_in_file(file, byte_query):
                        matching_files.append(file_item)
        return matching_files

    @staticmethod
    def _byte_search_in_file(file: BinaryIO, byte_sequence: bytes) -> bool:
        """Report whether ``byte_sequence`` occurs anywhere in ``file``.

        The file is read in fixed-size chunks so large files are never loaded
        whole. The last ``len(byte_sequence) - 1`` bytes of each window are
        carried over to the next read, so a match that straddles a chunk
        boundary is still found.
        """
        if not byte_sequence:
            # An empty sequence is trivially contained in any file.
            return True

        chunk_size: int = 4096
        overlap: int = len(byte_sequence) - 1
        tail: bytes = b""
        while chunk := file.read(chunk_size):
            window = tail + chunk
            if byte_sequence in window:
                return True
            # The tail is shorter than the sequence, so it can never produce a
            # match on its own -- only together with the following chunk.
            tail = window[-overlap:] if overlap else b""
        return False
class SimilaritySearch(BaseSearch):
    """Rank files by trigram similarity between their text fields and a query."""

    def __init__(self, queryset: QuerySet[File] | None = None):
        """Store the queryset of files to search within."""
        super().__init__(queryset)

    def search(self, query: str) -> QuerySet[File]:
        """Return files ordered by descending similarity to ``query``.

        The score is the sum of the trigram similarities of the unaccented
        ``name``, ``description`` and ``content``; only rows scoring above 0.1
        are kept.

        Raises:
            ValueError: if no queryset was supplied.
        """
        # Imported locally so this block does not rely on the module's
        # top-level import list.
        from django.db.models import FloatField, Value
        from django.db.models.functions import Coalesce

        if self.queryset is None:
            raise ValueError("Queryset cannot be None for similarity search")

        def score(alias: str) -> Coalesce:
            # The similarity of a NULL column is NULL, and NULL + x is NULL in
            # SQL. ``name`` is nullable, so without the fallback to 0 a file
            # with no name would be dropped even when its other fields match.
            return Coalesce(
                TrigramSimilarity(alias, query),
                Value(0.0),
                output_field=FloatField(),
            )

        return (
            self.queryset.annotate(
                name_unaccent=Unaccent("name"),
                description_unaccent=Unaccent("description"),
                content_unaccent=Unaccent("content"),
            )
            .annotate(
                similarity=score("name_unaccent")
                + score("description_unaccent")
                + score("content_unaccent")
            )
            .filter(similarity__gt=0.1)
            .order_by("-similarity")
        )

View File

@ -1,7 +1,18 @@
import chardet
import textract
from textract.exceptions import ExtensionNotSupported
def extract_file_text(file: str) -> str:
    """Extract the textual content of the file at path ``file``.

    ``textract`` is tried first. When it does not support the file's
    extension, the file is read as plain text using the encoding guessed by
    ``chardet``.

    Args:
        file: Path of the file to read.

    Returns:
        The extracted text as ``str``, or ``""`` when the plain-text fallback
        fails for any reason.

    Raises:
        Any ``textract`` error other than ``ExtensionNotSupported`` is not
        handled here and propagates to the caller.
    """
    try:
        extracted = textract.process(file)
    except ExtensionNotSupported:
        try:
            # Read the raw bytes once for encoding detection; the context
            # manager guarantees the handle is closed.
            with open(file, "rb") as raw_handle:
                rawdata = raw_handle.read()
            enc = chardet.detect(rawdata)
            with open(file, encoding=enc["encoding"]) as text_handle:
                return text_handle.read()
        except Exception:
            # Deliberate best effort: an unreadable or undecodable file simply
            # contributes no text.
            return ""

    # textract hands back encoded bytes (UTF-8 unless another output encoding
    # is requested); decode so callers really get the ``str`` promised by the
    # signature.
    if isinstance(extracted, bytes):
        return extracted.decode("utf-8", errors="replace")
    return extracted

View File

@ -1,11 +1,15 @@
import os
import time
import structlog
from celery import shared_task
from django.core.files import File
from haystack.management.commands import rebuild_index, update_index
from haystack.query import SearchQuerySet
from akarpov.files.models import File as FileModel
from akarpov.files.services.preview import create_preview, get_file_mimetype
from akarpov.files.services.text import extract_file_text
logger = structlog.get_logger(__name__)
@ -28,7 +32,32 @@ def process_file(pk: int):
except Exception as e:
logger.error(e)
file.file_type = get_file_mimetype(file.file.path)
file.save(update_fields=["preview", "name", "file_type"])
file.content = extract_file_text(file.file.path)
file.save(update_fields=["preview", "name", "file_type", "content"])
if pth and os.path.isfile(pth):
os.remove(pth)
return pk
@shared_task
def task_rebuild_index():
    """Rebuild the whole Haystack search index and log duration and size."""
    # Imported locally so this block does not rely on the module's top-level
    # import list.
    from django.core.management import call_command

    # A monotonic clock is unaffected by wall-clock adjustments while the
    # rebuild runs.
    start_time = time.monotonic()

    # ``call_command`` runs the command through its argument parser, so every
    # option that is not passed here receives its declared default. Calling
    # ``Command().handle(...)`` directly skips that step and hands the command
    # only the keys given explicitly.
    call_command("rebuild_index", interactive=False)

    duration = time.monotonic() - start_time
    indexed_count = SearchQuerySet().all().count()
    logger.info(
        "index_rebuild_finished", duration=duration, indexed_count=indexed_count
    )
@shared_task
def update_index_task():
    """Run Haystack's ``update_index`` command and log how long it took."""
    started = time.time()
    command = update_index.Command()
    command.handle(interactive=False)
    elapsed = time.time() - started
    logger.info("update_index_completed", duration=elapsed)

View File

@ -33,13 +33,43 @@
from akarpov.files.previews import extensions, meta, meta_extensions, previews
from akarpov.files.services.folders import delete_folder
from akarpov.files.services.preview import get_base_meta
from akarpov.files.services.search import (
ByteSearch,
CaseSensitiveSearch,
NeuroSearch,
SimilaritySearch,
)
from akarpov.files.tables import FileTable
from akarpov.notifications.services import send_notification
logger = structlog.get_logger(__name__)
# Maps the ``search_type`` request parameter to the strategy class handling it.
search_classes = {
    "neuro": NeuroSearch,
    "case_sensitive": CaseSensitiveSearch,
    "byte_search": ByteSearch,
    "similarity": SimilaritySearch,
}
class TopFolderView(LoginRequiredMixin, ListView):
class FileFilterView(View):
    """View base class adding search driven by ``query``/``search_type``."""

    def filter(self, queryset):
        """Return search results for the request, or ``queryset`` unchanged.

        A search runs only when the request carries a non-empty ``query``, a
        ``search_type`` that names a known strategy, and an authenticated
        user. The strategy searches the requesting user's ``File`` objects,
        not the ``queryset`` passed in. In every other case ``queryset`` is
        returned as is.
        """
        params = self.request.GET
        query = params.get("query")
        # Unknown or missing search types resolve to None.
        search_class = search_classes.get(params.get("search_type"))

        if not query or search_class is None:
            return queryset
        if not self.request.user.is_authenticated:
            return queryset

        user_files = File.objects.filter(user=self.request.user)
        return search_class(queryset=user_files).search(query)
class TopFolderView(LoginRequiredMixin, ListView, FileFilterView):
template_name = "files/list.html"
paginate_by = 18
model = BaseFileItem
@ -55,10 +85,18 @@ def get_context_data(self, **kwargs):
return context
def get_queryset(self):
return BaseFileItem.objects.filter(user=self.request.user, parent__isnull=True)
if (
"query" in self.request.GET
and "search_type" in self.request.GET
and self.request.GET["query"]
):
return self.filter(BaseFileItem.objects.none())
return self.filter(
BaseFileItem.objects.filter(user=self.request.user, parent__isnull=True)
)
class FileFolderView(ListView):
class FileFolderView(ListView, FileFilterView):
template_name = "files/folder.html"
model = BaseFileItem
paginate_by = 38
@ -94,6 +132,13 @@ def get_object(self, *args):
def get_queryset(self):
folder = self.get_object()
if (
"query" in self.request.GET
and "search_type" in self.request.GET
and self.request.GET["query"]
):
return self.filter(BaseFileItem.objects.none())
return BaseFileItem.objects.filter(parent=folder)

View File

@ -15,6 +15,7 @@
{% endblock %}
{% block css %}
<link href="https://cdn.jsdelivr.net/npm/bootstrap-select@1.13.18/dist/css/bootstrap-select.min.css" rel="stylesheet">
<style>
.row {
display: -webkit-box;
@ -62,6 +63,29 @@
</div>
{% endif %}
</div>
<form id="searchForm" class="row" method="get">
<div class="col-lg-9 col-md-8 col-sm-7">
<input type="text" class="form-control" placeholder="Search..." name="query" aria-label="Search" value="{{ request.GET.query|default_if_none:'' }}">
</div>
<div class="col-lg-2 col-md-3 col-sm-4">
{# <select class="selectpicker form-select" name="search_type" title="Choose...">#}
{# <option data-icon="bi bi-brain" value="neuro" {% if request.GET.search_type == "neuro" %}selected{% endif %}>Neuro Search</option>#}
{# <option data-icon="bi bi-textarea-t" value="case_sensitive" {% if request.GET.search_type == "case_sensitive" %}selected{% endif %}>Case Sensitive</option>#}
{# <option data-icon="bi bi-file-earmark-binary" value="byte_search" {% if request.GET.search_type == "byte_search" %}selected{% endif %}>Byte Search</option>#}
{# <option data-icon="bi bi-stars" value="similarity" {% if request.GET.search_type == "similarity" %}selected{% endif %}>Similarity Search</option>#}
{# </select>#}
<select name="search_type" class="form-select" id="inlineFormSelectPref">
<option data-icon="bi bi-brain" value="neuro" {% if request.GET.search_type == "neuro" %}selected{% endif %}>Neuro Search</option>
<option data-icon="bi bi-textarea-t" value="case_sensitive" {% if request.GET.search_type == "case_sensitive" %}selected{% endif %}>Case Sensitive</option>
<option data-icon="bi bi-file-earmark-binary" value="byte_search" {% if request.GET.search_type == "byte_search" %}selected{% endif %}>Byte Search</option>
<option data-icon="bi bi-stars" value="similarity" {% if request.GET.search_type == "similarity" %}selected{% endif %}>Similarity Search</option>
</select>
</div>
<div class="col-lg-1 col-md-1 col-sm-2">
<button type="submit" class="btn btn-primary w-100"><i class="bi bi-search"></i> Search</button>
</div>
</form>
<div class="row">
{% if request.user.is_authenticated and is_folder_owner %}
<div class="col-lg-2 col-xxl-2 col-md-4 col-sm-6 col-xs-12 mb-3 m-3 d-flex align-items-stretch card">
@ -108,7 +132,7 @@
</div>
</div>
{% endif %}
{% for file in basefileitem_list %}
{% for file in object_list %}
<div class="col-lg-2 col-xxl-2 col-md-4 col-sm-6 col-xs-12 mb-3 m-3 d-flex align-items-stretch card justify-content-center">
{% if file.is_file %}
<div class="card-body d-flex flex-column">
@ -172,7 +196,12 @@
{% endblock %}
{% block inline_javascript %}
<script src="https://cdn.jsdelivr.net/npm/bootstrap-select@1.13.18/js/bootstrap-select.min.js"></script>
<script type="text/javascript">
// Once the DOM is ready, initialise bootstrap-select on every element that
// carries the "selectpicker" class (a class selector needs the leading dot).
$(function () {
    $('.selectpicker').selectpicker();
});
$.notify.defaults(
{
// whether to hide the notification on click
@ -225,7 +254,7 @@
} else {
md5 = spark.end();
}
};
}
function read_next_chunk() {
var reader = new FileReader();

View File

@ -0,0 +1,22 @@
{% extends "base.html" %}
{% load crispy_forms_filters %}
{% load static %}
{% block title %}creating playlist on akarpov{% endblock %}
{% block content %}
<form class="pt-2" enctype="multipart/form-data" method="POST" id="designer-form">
{% csrf_token %}
{{ form.media }}
{% for field in form %}
{{ field|as_crispy_field }}
{% endfor %}
<div class="mt-4 flex justify-end space-x-4">
<button class="btn btn-success" type="submit" id="submit">
<span class="spinner-border spinner-border-sm" id="spinner" role="status" aria-hidden="true" style="display: none"></span>
Create
</button>
</div>
<!-- TODO: add song select via API here -->
</form>
{% endblock %}

View File

@ -0,0 +1,59 @@
{% extends 'base.html' %}
{% block css %}
<style>
.music-container {
width: 300px;
height: 300px;
display: flex;
flex-wrap: wrap;
}
.music-container > div {
flex: 1 0 50%;
height: 50%;
}
.music-container > div:nth-child(1):last-child, .music-container > div:nth-child(2):last-child, .music-container > div:nth-child(3):last-child {
flex-basis: 100%;
}
.music-container > div:nth-child(1):last-child:nth-child(1):last-child, .music-container > div:nth-child(2):last-child:nth-child(1):last-child, .music-container > div:nth-child(3):last-child:nth-child(1):last-child {
height: 100%;
}
.music-container > div img {
width: 100%;
height: 100%;
object-fit: cover;
object-position: center;
}
</style>
{% endblock %}
{% block content %}
<div class="ms-3 row">
<div class="col-lg-2 col-xxl-2 col-md-4 col-sm-6 col-xs-12 mb-3 m-3 d-flex align-items-stretch card">
<div class="card-body d-flex flex-column justify-content-center align-items-center">
<h5 class="card-title">Create Playlist</h5>
<p class="card-text">Create your own playlist</p>
<a href="{% url 'music:create_playlist' %}" class="btn btn-primary"><i class="bi bi-plus"></i></a>
</div>
</div>
{% for playlist in playlist_list %}
<div>
<div class="music-container">
<div>
<img src="https://img.freepik.com/free-photo/people-making-hands-heart-shape-silhouette-sunset_53876-15987.jpg" alt="">
</div>
<div>
<img src="https://thumbs.dreamstime.com/b/bee-flower-27533578.jpg" alt="">
</div>
<div>
<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTszhmO2dRnPW3Co-zQF_rqipldQM77r2Ut6Q&usqp=CAU" alt="">
</div>
<div>
<img src="https://img.freepik.com/free-photo/wide-angle-shot-single-tree-growing-clouded-sky-during-sunset-surrounded-by-grass_181624-22807.jpg" alt="">
</div>
</div>
</div>
{{ playlist }}
{% endfor %}
</div>
{% endblock %}

View File

@ -0,0 +1,3 @@
{{ object.name }}
{{ object.description }}
{{ object.content }}

View File

@ -3,5 +3,6 @@
set -o errexit
set -o nounset
/install_preview_dependencies
celery -A config.celery_app worker --loglevel=info -c 5

View File

@ -5,6 +5,7 @@
import environ
import structlog
from celery.schedules import crontab
from sentry_sdk.integrations.celery import CeleryIntegration
ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
@ -128,6 +129,7 @@
"django_filters",
"django_tables2",
"location_field",
"haystack",
]
HEALTH_CHECKS = [
@ -398,6 +400,7 @@
structlog.processors.UnicodeDecoder(),
structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
],
context_class=dict,
logger_factory=structlog.stdlib.LoggerFactory(),
cache_logger_on_first_use=True,
)
@ -425,6 +428,17 @@
CELERY_TASK_SOFT_TIME_LIMIT = 10 * 60
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-scheduler
CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
# Periodic search-index maintenance.
CELERY_BEAT_SCHEDULE = {
    # Incremental update at minute 0 of every hour. This entry previously
    # pointed at the full-rebuild task, so the index was rebuilt hourly and
    # the update task was never scheduled.
    "update-index-every-hour": {
        "task": "akarpov.files.tasks.update_index_task",
        "schedule": crontab(minute="0"),
    },
    # Full rebuild once a day at 02:00.
    "rebuild-index-every-day": {
        "task": "akarpov.files.tasks.task_rebuild_index",
        "schedule": crontab(hour="2", minute="0", day_of_week="*"),
    },
}
# django-allauth
# ------------------------------------------------------------------------------
@ -591,3 +605,16 @@
CeleryIntegration(monitor_beat_tasks=True, propagate_traces=True),
],
)
# HAYSTACK
# ------------------------------------------------------------------------------
# Single Haystack connection backed by Elasticsearch 7.
# NOTE(review): the URL assumes Elasticsearch is reachable on localhost. The
# local compose file defines the service under the name "elasticsearch", so a
# containerised Django process may need that hostname instead -- confirm.
HAYSTACK_CONNECTIONS = {
    "default": {
        "ENGINE": "haystack.backends.elasticsearch7_backend.Elasticsearch7SearchEngine",
        "URL": "http://127.0.0.1:9200/",
        "INDEX_NAME": "haystack",
        "TIMEOUT": 5 * 60,
        "BATCH_SIZE": 100,
    },
}

View File

@ -3,6 +3,7 @@ version: '3'
volumes:
akarpov_local_postgres_data: {}
akarpov_local_postgres_data_backups: {}
akarpov_local_elasticsearch_data: {}
services:
django: &django
@ -16,6 +17,7 @@ services:
- postgres
- redis
- mailhog
- elasticsearch
volumes:
- .:/app:z
env_file:
@ -96,3 +98,15 @@ services:
ports:
- "5555:5555"
command: /start-flower
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:7.9.3 # Use the desired version
container_name: akarpov_local_elasticsearch
environment:
- discovery.type=single-node # This avoids bootstrap checks, suitable for development.
- ES_JAVA_OPTS=-Xms512m -Xmx512m # Set the JVM heap size
volumes:
- akarpov_local_elasticsearch_data:/usr/share/elasticsearch/data
ports:
- "9200:9200"
- "9300:9300"

381
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -108,6 +108,7 @@ pycld2 = "^0.41"
textract = "^1.6.5"
uuid6 = "^2023.5.2"
fastapi = "0.86.0"
django-haystack = {extras = ["elasticsearch"], version = "^3.2.1"}
[build-system]