Merge branch 'main' into speller-translate

This commit is contained in:
Alexandr Karpov 2022-10-22 18:26:28 +03:00 committed by GitHub
commit 84216870eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 413 additions and 61 deletions

View File

@ -1,9 +1,15 @@
from django.urls import path from django.urls import path
from search.api.views import SearchApi, HintApi, AutoCompleteApi from search.api.views import (
SearchApi,
HintApi,
AutoCompleteApi,
IncreaseProductScoreApi,
)
urlpatterns = [ urlpatterns = [
path("search", SearchApi.as_view(), name="search_api"), path("search", SearchApi.as_view(), name="search_api"),
path("hint", HintApi.as_view(), name="hint api"), path("hint", HintApi.as_view(), name="hint_api"),
path('autocomplete_schema', AutoCompleteApi.as_view(), name='autocomplete api') path("autocomplete_schema", AutoCompleteApi.as_view(), name="autocomplete_api"),
path("score/<int:pk>", IncreaseProductScoreApi.as_view(), name="score_api"),
] ]

View File

@ -63,6 +63,7 @@ DJANGO_APPS = [
"django.contrib.humanize", "django.contrib.humanize",
"django.contrib.admin", "django.contrib.admin",
"django.forms", "django.forms",
"django.contrib.postgres",
] ]
THIRD_PARTY_APPS = ["rest_framework", "corsheaders", "drf_yasg"] THIRD_PARTY_APPS = ["rest_framework", "corsheaders", "drf_yasg"]
@ -216,3 +217,5 @@ REST_FRAMEWORK = {
# django-cors-headers # django-cors-headers
CORS_ALLOW_ALL_ORIGINS = True CORS_ALLOW_ALL_ORIGINS = True
# The Yandex Dictionary API key is a credential, so it is read from the
# environment instead of being committed with the settings. The key that was
# previously hard-coded here is exposed in version control and should be
# revoked and reissued.
import os  # local import: this file's import header is outside the visible hunk

YANDEX_DICT_API_KEY = os.environ.get("YANDEX_DICT_API_KEY", "")

View File

@ -2,9 +2,21 @@ from rest_framework import serializers
from django.core.validators import MinLengthValidator, MinValueValidator from django.core.validators import MinLengthValidator, MinValueValidator
class QueryFilterSerializer(serializers.Serializer):
    """Validate one query filter: a ``value`` string tagged with a ``type`` string.

    The serializer is validation-only; both persistence hooks are disabled.
    """

    # Both fields are free-form text capped at 100 characters.
    value = serializers.CharField(max_length=100)
    type = serializers.CharField(max_length=100)

    def create(self, validated_data):
        """Not supported: filters are never stored."""
        raise NotImplementedError

    def update(self, instance, validated_data):
        """Not supported: filters are never stored."""
        raise NotImplementedError
class SearchSerializer(serializers.Serializer): class SearchSerializer(serializers.Serializer):
body = serializers.CharField(max_length=200) body = serializers.ListSerializer(child=QueryFilterSerializer())
limit = serializers.IntegerField(default=5, min_value=1)
offset = serializers.IntegerField(default=0, min_value=0)
def create(self, validated_data): def create(self, validated_data):
raise NotImplementedError raise NotImplementedError
@ -27,22 +39,50 @@ class HintRequestSerializer(serializers.Serializer):
content = serializers.CharField() content = serializers.CharField()
def create(self, validated_data): def create(self, validated_data):
raise NotImplemented raise NotImplementedError
def update(self, instance, validated_data):
raise NotImplementedError
class HintResponseSerializer(serializers.Serializer): class HintResponseSerializer(serializers.Serializer):
type = serializers.CharField() type = serializers.CharField()
content = serializers.CharField() content = serializers.CharField()
def create(self, validated_data):
raise NotImplementedError
def update(self, instance, validated_data):
raise NotImplementedError
class AutoCompleteRequestSerializer(serializers.Serializer): class AutoCompleteRequestSerializer(serializers.Serializer):
content = serializers.CharField(validators=[MinLengthValidator(3)]) content = serializers.CharField(validators=[MinLengthValidator(3)])
exclude = serializers.ListSerializer(child=QueryFilterSerializer(), default=[])
def create(self, validated_data):
raise NotImplementedError
def update(self, instance, validated_data):
raise NotImplementedError
class AutoCompleteSerializerNode(serializers.Serializer): class AutoCompleteSerializerNode(serializers.Serializer):
coordinate = serializers.IntegerField(validators=[MinValueValidator(0)]) coordinate = serializers.IntegerField(validators=[MinValueValidator(0)])
value = HintResponseSerializer() value = HintResponseSerializer()
def create(self, validated_data):
raise NotImplementedError
def update(self, instance, validated_data):
raise NotImplementedError
class AutoCompleteResponseSerializer(serializers.Serializer): class AutoCompleteResponseSerializer(serializers.Serializer):
nodes = serializers.ListField(child=AutoCompleteSerializerNode()) nodes = serializers.ListField(child=AutoCompleteSerializerNode())
def create(self, validated_data):
raise NotImplementedError
def update(self, instance, validated_data):
raise NotImplementedError

View File

@ -1,51 +1,93 @@
from drf_yasg import openapi from drf_yasg import openapi
from drf_yasg.utils import swagger_auto_schema from drf_yasg.utils import swagger_auto_schema
from rest_framework import status from rest_framework import status
from rest_framework.generics import get_object_or_404
from rest_framework.response import Response from rest_framework.response import Response
from rest_framework.views import APIView from rest_framework.views import APIView
from search.api.serializers import HintRequestSerializer from search.api.serializers import HintRequestSerializer
from search.api.serializers import SearchSerializer, ResponseSerializer, HintResponseSerializer, AutoCompleteRequestSerializer, AutoCompleteResponseSerializer from search.api.serializers import (
from search.services.search import process_string SearchSerializer,
ResponseSerializer,
HintResponseSerializer,
AutoCompleteRequestSerializer,
AutoCompleteResponseSerializer,
)
from search.models import Product
from search.services.search import process_search
from search.services.autocomplete_schema import autocomplete_schema from search.services.autocomplete_schema import autocomplete_schema
from search.services.hints import get_hints from search.services.hints import get_hints
user_response = openapi.Response("search results", ResponseSerializer) user_response = openapi.Response("search results", ResponseSerializer)
hint_response = openapi.Response("hints", HintResponseSerializer) hint_response = openapi.Response("hints", HintResponseSerializer)
autocomplete_response = openapi.Response("autocomplete schema", AutoCompleteResponseSerializer) autocomplete_response = openapi.Response(
"autocomplete schema", AutoCompleteResponseSerializer
)
class SearchApi(APIView): class SearchApi(APIView):
@swagger_auto_schema(request_body=SearchSerializer, responses={200: user_response}) @swagger_auto_schema(request_body=SearchSerializer, responses={200: user_response})
def post(self, request, format=None): def post(self, request):
serializer = SearchSerializer(data=request.data) serializer = SearchSerializer(data=request.data)
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
return Response( return Response(
process_string(serializer.data["body"]), status=status.HTTP_200_OK process_search(
serializer.data["body"],
serializer.data["limit"],
serializer.data["offset"],
),
status=status.HTTP_200_OK,
) )
class HintApi(APIView): class HintApi(APIView):
@swagger_auto_schema(request_body=HintRequestSerializer, responses={200: hint_response}) @swagger_auto_schema(
def post(self, request, format=None): request_body=HintRequestSerializer, responses={200: hint_response}
)
def post(self, request):
serializer = HintRequestSerializer(data=request.data) serializer = HintRequestSerializer(data=request.data)
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
return Response( return Response(
{ {
'type': get_hints(serializer.data['content']), "type": get_hints(serializer.data["content"]),
'value': serializer.data['content'] "value": serializer.data["content"],
}, },
status=status.HTTP_200_OK status=status.HTTP_200_OK,
) )
class AutoCompleteApi(APIView): class AutoCompleteApi(APIView):
@swagger_auto_schema(request_body=AutoCompleteRequestSerializer, responses={200: autocomplete_response}) @swagger_auto_schema(
def post(self, request, format=None): request_body=AutoCompleteRequestSerializer,
responses={200: autocomplete_response},
)
def post(self, request):
serializer = AutoCompleteRequestSerializer(data=request.data) serializer = AutoCompleteRequestSerializer(data=request.data)
serializer.is_valid(raise_exception=True) serializer.is_valid(raise_exception=True)
return Response( return Response(
{ {
'nodes': autocomplete_schema(serializer.data['content']) "nodes": autocomplete_schema(
}, status=status.HTTP_200_OK serializer.data["content"], serializer.data["exclude"]
) )
},
status=status.HTTP_200_OK,
)
class IncreaseProductScoreApi(APIView):
    """Increment the score of a single product and return the new value."""

    @swagger_auto_schema(
        manual_parameters=[
            openapi.Parameter(
                # Must match the URL keyword argument (``score/<int:pk>``) so
                # the schema documents the real path parameter, not an extra one.
                "pk",
                openapi.IN_PATH,
                description="Product id",
                type=openapi.TYPE_INTEGER,
            )
        ]
    )
    def post(self, request, pk):
        """Add one to the product's score.

        Args:
            request: The incoming DRF request (its body is not read).
            pk: Primary key of the product taken from the URL.

        Returns:
            A 200 response of the form ``{"score": <new score>}``.

        Raises:
            Http404: If no product with this id exists.
        """
        # Local import: the file-level import block is not part of this block.
        from django.db.models import F

        product = get_object_or_404(Product, id=pk)
        # Let the database do the increment. A Python-side ``score += 1``
        # followed by ``save()`` loses updates when two requests overlap.
        product.score = F("score") + 1
        product.save(update_fields=["score"])
        # Replace the F() expression on the instance with the stored integer.
        product.refresh_from_db(fields=["score"])
        return Response({"score": product.score}, status=status.HTTP_200_OK)

View File

@ -18,13 +18,21 @@ class Characteristic(models.Model):
class UnitCharacteristic(models.Model): class UnitCharacteristic(models.Model):
name = models.TextField("Имя", blank=False) name = models.TextField("Имя", blank=False)
value = models.TextField("Значение", blank=False) value = models.TextField("Значение", blank=False)
numeric_value_min = models.IntegerField(default=0)
numeric_value_max = models.IntegerField(default=0)
unit = models.TextField("Размерность", blank=False) unit = models.TextField("Размерность", blank=False)
def __str__(self): def __str__(self):
return str(self.name) return str(self.name)
def serialize_self(self): def serialize_self(self):
return {"name": self.name, "value": self.value, "unit": self.unit} return {
"name": self.name,
"value": self.numeric_value_min
if self.numeric_value_min == self.numeric_value_max
else f"{self.numeric_value_min}:{self.numeric_value_max}",
"unit": self.unit,
}
class Meta: class Meta:
db_table = "unit_characteristic" db_table = "unit_characteristic"
@ -49,20 +57,28 @@ class Product(models.Model):
Category, related_name="products", on_delete=models.CASCADE Category, related_name="products", on_delete=models.CASCADE
) )
score = models.IntegerField(default=0)
def __str__(self): def __str__(self):
return str(self.name) return str(self.name)
def serialize_self(self) -> dict: def serialize_self(self) -> dict:
return { return {
"id": self.id,
"name": self.name, "name": self.name,
"score": self.score,
"characteristic": [ "characteristic": [
x.serialize_self() for x in self.characteristics.objects.all() x.characteristic.serialize_self() for x in self.characteristics.all()
] ]
+ [x.serialize_self() for x in self.unit_characteristics.objects.all()], + [
x.characteristic.serialize_self()
for x in self.unit_characteristics.all()
],
} }
class Meta: class Meta:
db_table = "product" db_table = "product"
ordering = ["-score"]
class ProductCharacteristic(models.Model): class ProductCharacteristic(models.Model):

View File

@ -1,37 +1,49 @@
from typing import List, Dict
from search.models import Product, Category, Characteristic from search.models import Product, Category, Characteristic
def autocomplete_schema(val: str):
def autocomplete_schema(val: str, exclude: List[Dict]):
exclude = [dict(x) for x in exclude]
name_exclude = [x["value"] for x in exclude if x["type"] == "Name"]
category_exclude = [x["value"] for x in exclude if x["type"] == "Category"]
schema = [] schema = []
if not name_exclude:
schema.extend( schema.extend(
[ [
{ {
'coordinate': product['name'].index(val), "coordinate": product["name"].lower().index(val.lower()),
'value': { "value": {
'type': 'Name', "type": "Name",
'value': product['name'], "value": product["name"],
},
} }
} for product in Product.objects.filter(name__contains=val).values('name')] for product in Product.objects.filter(name__unaccent__icontains=val)[
:20
].values("name")
]
) )
if not category_exclude:
schema.extend( schema.extend(
[ [
{ {
'coordinate': cat['name'].index(val), "coordinate": cat["name"].lower().index(val.lower()),
'value': { "value": {"type": "Category", "value": cat["name"]},
'type': 'Category',
'value': cat['name']
} }
} for cat in Category.objects.filter(name__contains=val).values('name') for cat in Category.objects.filter(name__unaccent__icontains=val)[
:20
].values("name")
] ]
) )
schema.extend( schema.extend(
[ [
{ {
'coordinate': char.name.index(val), "coordinate": char["value"].lower().index(val.lower()),
'value': { "value": {"type": char["name"], "value": char["value"]},
'type': char.name,
'value': char.value
} }
} for char in Characteristic.objects.filter(name__contains=val).values('name', 'value') for char in Characteristic.objects.filter(value__unaccent__icontains=val)[
:20
].values("name", "value")
] ]
) )
return schema return schema

View File

@ -2,11 +2,11 @@ from search.models import Product, Category, Characteristic
def get_hints(content: str) -> str: def get_hints(content: str) -> str:
category = 'Unknown' category = "All"
if content in list(map(lambda product: product.name, Product.objects.all())): if content in list(map(lambda product: product.name, Product.objects.all())):
category = 'Name' category = "Name"
elif content in list(map(lambda category: category.name, Category.objects.all())): elif content in list(map(lambda category: category.name, Category.objects.all())):
category = 'Category' category = "Category"
elif content in list(map(lambda char: char.value, Characteristic.objects.all())): elif content in list(map(lambda char: char.value, Characteristic.objects.all())):
category = Characteristic.objects.get(value=content).name category = Characteristic.objects.filter(value=content).first().name
return category return category

View File

@ -1,3 +1,4 @@
import re
from ast import literal_eval from ast import literal_eval
import pandas as pd import pandas as pd
@ -59,3 +60,27 @@ def load_excel():
# malformed node or string: nan \ duplicate key # malformed node or string: nan \ duplicate key
print("СКОРОСШИВАТЕЛЬ") print("СКОРОСШИВАТЕЛЬ")
continue continue
# Compiled once; a raw string so that ``\d`` and ``\.`` reach the regex engine
# as written instead of relying on Python keeping unknown string escapes.
_UNIT_NUMBER_RE = re.compile(
    r"[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?"
)


def _numeric_bounds(raw_value):
    """Return ``(min, max)`` of the numbers found in *raw_value*, or ``None``.

    Every match is converted with ``int(float(...))`` after replacing commas
    with dots. ``None`` means the text contains no number at all.

    Raises:
        ValueError: If a matched token is not a valid float after the comma
            replacement (e.g. several comma-separated groups).
        OverflowError: If a matched token is outside the range ``int`` can
            take from a float (it parses to infinity).
    """
    numbers = [
        int(float(token.replace(",", ".")))
        for token in _UNIT_NUMBER_RE.findall(raw_value)
    ]
    if not numbers:
        return None
    return min(numbers), max(numbers)


def process_unit_character():
    """Fill ``numeric_value_min``/``numeric_value_max`` of every unit characteristic.

    The bounds are derived from the numbers embedded in the textual ``value``:
    a single number gives ``min == max``; several numbers give their minimum
    and maximum. Rows without any number are left untouched. Rows whose
    numbers cannot be converted are deleted, as before.
    """
    for el in UnitCharacteristic.objects.all():
        try:
            bounds = _numeric_bounds(el.value)
        except (ValueError, OverflowError):
            # Same policy as the original code: unparseable rows are dropped.
            el.delete()
            continue
        if bounds is None:
            continue
        el.numeric_value_min, el.numeric_value_max = bounds
        el.save()

View File

@ -1,6 +1,206 @@
from search.models import Product import string
from django.db.models import QuerySet
from search.models import (
Product,
Characteristic,
ProductCharacteristic,
ProductUnitCharacteristic,
UnitCharacteristic,
Category,
)
from typing import List from typing import List
from search.services.hints import get_hints
from search.services.spell_check import spell_check
from search.services.translate import translate_en_ru, translate_ru_en
def process_string(text: str) -> List[dict]:
return [x.serialize_self() for x in Product.objects.filter(name__contains=text)[5:]] def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
if operation.startswith("<=") or operation.startswith("=<"):
return unit.filter(
characteristic__numeric_value_max__lte=int(float(operation[2:]))
)
elif operation.startswith("=>") or operation.startswith(">="):
return unit.filter(
characteristic__numeric_value_min__gte=int(float(operation[2:]))
)
elif operation.startswith(">"):
return unit.filter(
characteristic__numeric_value_min__gt=int(float(operation[1:]))
)
elif operation.startswith("<"):
return unit.filter(
characteristic__numeric_value_max__lt=int(float(operation[1:]))
)
elif operation.startswith("="):
return unit.filter(
characteristic__numeric_value_min__gte=int(float(operation[1:])),
characteristic__numeric_value_max__lte=int(float(operation[1:])),
)
return unit
def _clean_text(text: str) -> List[str]:
for st in [".", ",", "!", "?"]:
text = text.replace(st, " ")
text = text.split()
return text
def apply_qs_search(qs: QuerySet, text: str):
    """Return *qs* united with every product whose name matches a word of *text*.

    *text* is split into words by ``_clean_text``. A product matches a word
    when its unaccented name is trigram-similar to the word or contains it
    case-insensitively.

    Args:
        qs: Product queryset the matches are OR-ed into. Pass
            ``Product.objects.none()`` to get only the matches.
        text: Free-form search text.

    Returns:
        The union (``|``) of *qs* and the matching products. The result is
        never narrower than *qs*.
    """
    matches = Product.objects.none()
    for word in _clean_text(text):
        matches = (
            matches
            | Product.objects.filter(name__unaccent__trigram_similar=word)
            | Product.objects.filter(name__unaccent__icontains=word)
        )
    # The debug print() calls were removed: printing a queryset evaluates it,
    # which ran two extra database queries on every call.
    return qs | matches
def apply_all_qs_search(orig_qs, text: str):
    """Restrict *orig_qs* to products matching *text* by name, category or characteristic.

    A product is kept when any word of *text* matches its name (see
    ``apply_qs_search``), is contained in its category name, or is contained
    in the value of one of its characteristics (both case-insensitive).

    Args:
        orig_qs: Product queryset to narrow.
        text: Free-form search text.

    Returns:
        The intersection (``&``) of *orig_qs* with the union of all matches.
    """
    words = _clean_text(text)

    # Name matches.
    qs = apply_qs_search(Product.objects.none(), text)

    # Category matches. Each word is looked up on the full Category table.
    # The original filtered the accumulator itself, which starts as ``none()``
    # and therefore never matched anything.
    cats = Category.objects.none()
    for word in words:
        cats = cats | Category.objects.filter(name__icontains=word)
    qs = qs | Product.objects.filter(category__in=cats)

    # Characteristic matches, with the same fix as for categories.
    chars = Characteristic.objects.none()
    for word in words:
        chars = chars | Characteristic.objects.filter(value__icontains=word)
    qs = qs | Product.objects.filter(characteristics__characteristic__in=chars)

    return qs & orig_qs
def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
    """Run a filtered product search and return one page of serialized products.

    Each item of *data* is a mapping with a ``"type"`` and a ``"value"`` key.
    Supported types:

    * ``"Name"``, ``"Category"``, ``"Characteristic"``, ``"All"`` -- the
      spell-checked value is matched against the corresponding field(s).
    * ``"Unknown"`` -- the type is resolved with ``get_hints(value)`` and the
      item is then handled as that type.
    * a type starting with ``"*"`` -- a unit characteristic; the value is a
      comparison expression handled by ``process_unit_operation``.
    * any other string -- the name of a characteristic whose value must match.

    Several items with the same characteristic type are OR-ed together;
    different filters are applied one after another to the same queryset.

    Args:
        data: Filter items as described above.
        limit: Maximum number of products to return.
        offset: Number of leading products to skip.

    Returns:
        ``serialize_self()`` of the distinct matching products ordered by
        descending score, sliced to ``[offset : offset + limit]``.
    """
    plain_types = ("Name", "Category", "Characteristic", "All")
    prep_data = []
    prep_dict = {}  # characteristic type -> matching product-characteristic rows
    prep_dict_char_type = {}  # characteristic type -> candidate characteristics

    # ------------------------------ prepare filters ------------------------------ #
    for raw_item in data:
        item = dict(raw_item)
        kind = item["type"]
        value = item["value"]

        if kind == "Unknown":
            # ``hinted`` avoids shadowing the builtin ``type``.
            hinted = get_hints(value)
            # A hint that is itself "Unknown" keeps going to the translate
            # branch below with a plain string value, as before.
            passthrough = hinted in plain_types or hinted == "Unknown"
            kind = hinted
        else:
            passthrough = kind in plain_types

        if passthrough:
            prep_data.append({"type": kind, "value": spell_check(value)})
            continue

        # Anything else names a characteristic. This now includes a
        # characteristic name returned by get_hints(), which used to be queued
        # with a string value and later passed to ``characteristics__in``.
        is_unit = kind.startswith("*")
        if kind not in prep_dict_char_type:
            model = UnitCharacteristic if is_unit else Characteristic
            prep_dict_char_type[kind] = model.objects.filter(
                name__unaccent__trigram_similar=kind
            ) | model.objects.filter(name__icontains=kind)
        candidates = prep_dict_char_type[kind]

        if is_unit:
            # Unit values are comparison expressions, so they are not spell-checked.
            matches = process_unit_operation(
                ProductUnitCharacteristic.objects.filter(
                    characteristic__in=candidates,
                ),
                value,
            )
        else:
            checked = spell_check(value)
            matches = ProductCharacteristic.objects.filter(
                characteristic__in=candidates,
                characteristic__value__unaccent__trigram_similar=checked,
            ) | ProductCharacteristic.objects.filter(
                characteristic__in=candidates,
                characteristic__value__icontains=checked,
            )

        # Repeated filters on the same characteristic type are alternatives.
        prep_dict[kind] = (prep_dict[kind] | matches) if kind in prep_dict else matches

    for el, val in prep_dict.items():
        prep_data.append({"type": el, "value": val})

    # ------------------------- apply filters on QuerySet ------------------------- #
    qs = Product.objects.filter()
    for flt in prep_data:
        typ = flt["type"]
        val = flt["value"]
        if typ == "Name":
            # apply_qs_search() returns a union, so calling it on ``qs`` can
            # never narrow the result. Intersect with the name matches instead,
            # the same way apply_all_qs_search() does.
            qs = qs & apply_qs_search(Product.objects.none(), val)
        elif typ == "All":
            qs = apply_all_qs_search(qs, val)
        elif typ == "Category":
            qs = qs.filter(category__name__unaccent__trigram_similar=val) | qs.filter(
                category__name__icontains=val
            )
        elif typ == "Characteristic":
            char = ProductCharacteristic.objects.filter(product__in=qs)
            char = char.filter(characteristic__value__icontains=val) | char.filter(
                characteristic__value__unaccent__trigram_similar=val
            )
            qs = qs.filter(characteristics__in=char)
        elif typ == "Unknown":
            if not val:
                # Nothing to translate; the original raised IndexError on ``val[0]``.
                continue
            # A printable first character is treated as English text, anything
            # else as Russian. The value is translated to the other language
            # before asking for a hint again.
            if val[0] in string.printable:
                val = "".join(translate_en_ru(val))
            else:
                val = "".join(translate_ru_en(val))
            hinted = get_hints(val)
            if hinted == "Name":
                qs = qs & apply_qs_search(Product.objects.none(), val)
            elif hinted == "Category":
                qs = qs.filter(category__name__unaccent__trigram_similar=val)
            elif hinted == "Unknown":
                continue
            else:
                qs = qs.filter(
                    characteristics__characteristic__name__unaccent__trigram_similar=val
                )
        elif typ.startswith("*"):
            qs = qs.filter(unit_characteristics__in=val)
        else:
            qs = qs.filter(characteristics__in=val)

    page = qs.distinct().order_by("-score")[offset : offset + limit]
    return [product.serialize_self() for product in page]

View File

@ -4,6 +4,7 @@ from spellchecker import SpellChecker
speller_ru = SpellChecker(language='ru') speller_ru = SpellChecker(language='ru')
speller_eng = SpellChecker(language='en') speller_eng = SpellChecker(language='en')
def spell_check_ru(word: str) -> str: def spell_check_ru(word: str) -> str:
res = speller_ru.correction(word) res = speller_ru.correction(word)
if not len(res): if not len(res):

View File

@ -5,9 +5,14 @@ from typing import List
def translate_ru_en(word: str) -> List[str]: def translate_ru_en(word: str) -> List[str]:
res = r.get(f"https://dictionary.yandex.net/api/v1/dicservice.json/lookup?key={YANDEX_DICT_API_KEY}&lang=ru-en&text={word}") res = r.get(
return [i['text'] for i in chain(*[j['tr']for j in res.json()['def']])] f"https://dictionary.yandex.net/api/v1/dicservice.json/lookup?key={YANDEX_DICT_API_KEY}&lang=ru-en&text={word}"
)
return [i["text"] for i in chain(*[j["tr"] for j in res.json()["def"]])]
def translate_en_ru(word: str) -> List[str]: def translate_en_ru(word: str) -> List[str]:
res = r.get(f"https://dictionary.yandex.net/api/v1/dicservice.json/lookup?key={YANDEX_DICT_API_KEY}&lang=en-ru&text={word}") res = r.get(
return [i['text'] for i in chain(*[j['tr']for j in res.json()['def']])] f"https://dictionary.yandex.net/api/v1/dicservice.json/lookup?key={YANDEX_DICT_API_KEY}&lang=en-ru&text={word}"
)
return [i["text"] for i in chain(*[j["tr"] for j in res.json()["def"]])]

2
pg.sql Normal file
View File

@ -0,0 +1,2 @@
-- PostgreSQL extensions used by the search queries: ``unaccent`` lookups and
-- trigram similarity. IF NOT EXISTS makes the script safe to run repeatedly.
CREATE EXTENSION IF NOT EXISTS unaccent;
CREATE EXTENSION IF NOT EXISTS pg_trgm;