mirror of
https://github.com/magnum-opus-tender-hack/backend.git
synced 2024-11-13 04:56:33 +03:00
added russian transliteration
This commit is contained in:
parent
08a442d8f3
commit
fc0049d816
|
@ -73,7 +73,7 @@ $ ./app/manage.py runserver
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"value": "каучук",
|
"value": "каучук",
|
||||||
"type": "Матерьял"
|
"type": "Материал"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"value": "синий",
|
"value": "синий",
|
||||||
|
|
|
@ -216,3 +216,4 @@ REST_FRAMEWORK = {
|
||||||
|
|
||||||
# django-cors-headers
|
# django-cors-headers
|
||||||
CORS_ALLOW_ALL_ORIGINS = True
|
CORS_ALLOW_ALL_ORIGINS = True
|
||||||
|
YANDEX_DICT_API_KEY = env.str('YANDEX_DICT')
|
||||||
|
|
25
app/search/services/search/bert.py
Normal file
25
app/search/services/search/bert.py
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
from transformers import BertTokenizer, BertModel
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from scipy.spatial import distance
|
||||||
|
|
||||||
|
# Load the "DeepPavlov/rubert-base-cased-sentence" tokenizer and model once,
# at import time; get_embedding() reads both as module-level globals.
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
|
||||||
|
model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding(word):
    """Return the pooled BERT embedding of ``word`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` and passed
    through the module-level ``model``; the model's ``pooler_output`` is
    returned.

    Args:
        word: Text to embed (a single word or a short phrase).

    Returns:
        ``outputs.pooler_output`` converted to a NumPy array.
        NOTE(review): presumably shaped (1, hidden_size) for a single
        input string -- confirm against the model configuration.
    """
    inputs = tokenizer(word, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built
    # (and then thrown away) for every embedded word.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.detach().numpy()
|
||||||
|
|
||||||
|
|
||||||
|
def get_distance(first_word, second_word):
    """Return the cosine similarity of two texts' embeddings, to 2 decimals.

    Despite the name, the value returned is ``1 - cosine_distance``, i.e. a
    similarity: larger values mean the two embeddings point in closer
    directions.

    Args:
        first_word: First text, embedded via ``get_embedding``.
        second_word: Second text, embedded via ``get_embedding``.

    Returns:
        ``1 - scipy.spatial.distance.cosine(e1, e2)`` rounded to two
        decimal places.
    """
    # scipy's cosine() is documented for 1-D vectors, so flatten whatever
    # array shape get_embedding() hands back before comparing.
    w1 = np.ravel(get_embedding(first_word))
    w2 = np.ravel(get_embedding(second_word))
    # Round after the subtraction: rounding the distance first and then
    # computing 1 - x can reintroduce float noise (1 - 0.7 != 0.3 exactly).
    return np.round(1 - distance.cosine(w1, w2), 2)
|
||||||
|
|
||||||
|
|
||||||
|
# Manual smoke check. Guarded so that merely importing this module does not
# run two BERT inferences whose result is discarded.
if __name__ == "__main__":
    get_distance("электрогитара", "электрическая гитара")
|
|
@ -21,7 +21,7 @@ def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
|
||||||
qs = qs & apply_qs_search(val)
|
qs = qs & apply_qs_search(val)
|
||||||
qs = qs.order_by("-score")
|
qs = qs.order_by("-score")
|
||||||
elif typ == "All":
|
elif typ == "All":
|
||||||
qs = apply_all_qs_search(qs, val) & qs
|
qs = apply_all_qs_search(val) & qs
|
||||||
elif typ == "Category":
|
elif typ == "Category":
|
||||||
qs = apply_qs_category(qs, val)
|
qs = apply_qs_category(qs, val)
|
||||||
qs = qs.order_by("-score")
|
qs = qs.order_by("-score")
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
|
from functools import cache
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from django.utils.text import slugify
|
||||||
|
|
||||||
from search.models import (
|
from search.models import (
|
||||||
Product,
|
Product,
|
||||||
ProductCharacteristic,
|
ProductCharacteristic,
|
||||||
ProductUnitCharacteristic,
|
ProductUnitCharacteristic,
|
||||||
)
|
)
|
||||||
from search.services.spell_check import pos
|
from search.services.spell_check import pos, spell_check
|
||||||
|
|
||||||
|
|
||||||
def _clean_text(text: str) -> List[str]:
|
def _clean_text(text: str) -> List[str]:
|
||||||
|
@ -13,9 +16,11 @@ def _clean_text(text: str) -> List[str]:
|
||||||
text = text.replace(st, " ")
|
text = text.replace(st, " ")
|
||||||
text = text.split()
|
text = text.split()
|
||||||
functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"} # function words
|
functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"} # function words
|
||||||
return [word for word in text if pos(word) not in functors_pos]
|
text = [word for word in text if pos(word) not in functors_pos]
|
||||||
|
return [spell_check(x) for x in text]
|
||||||
|
|
||||||
|
|
||||||
|
@cache
|
||||||
def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
|
def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
|
||||||
if operation.startswith("<=") or operation.startswith("=<"):
|
if operation.startswith("<=") or operation.startswith("=<"):
|
||||||
return unit.filter(
|
return unit.filter(
|
||||||
|
@ -41,20 +46,20 @@ def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: s
|
||||||
return unit
|
return unit
|
||||||
|
|
||||||
|
|
||||||
|
@cache
|
||||||
def apply_qs_search(text: str):
|
def apply_qs_search(text: str):
|
||||||
text = _clean_text(text)
|
text = _clean_text(text)
|
||||||
products = Product.objects.none()
|
qs = Product.objects.filter()
|
||||||
for word in text:
|
for word in text:
|
||||||
products = (
|
qs = qs.filter(name__unaccent__trigram_similar=word) | qs.filter(
|
||||||
products
|
name__unaccent__icontains=word
|
||||||
| Product.objects.filter(name__unaccent__icontains=word)
|
|
||||||
| Product.objects.filter(name__unaccent__trigram_similar=word)
|
|
||||||
)
|
)
|
||||||
products = products.order_by("-score")
|
products = qs.order_by("-score")
|
||||||
return products
|
return products
|
||||||
|
|
||||||
|
|
||||||
def apply_all_qs_search(orig_qs, text: str):
|
@cache
|
||||||
|
def apply_all_qs_search(text: str):
|
||||||
# words
|
# words
|
||||||
text = _clean_text(text)
|
text = _clean_text(text)
|
||||||
|
|
||||||
|
@ -105,9 +110,23 @@ def apply_all_qs_search(orig_qs, text: str):
|
||||||
)
|
)
|
||||||
qs = (
|
qs = (
|
||||||
Product.objects.filter(name__icontains=word)
|
Product.objects.filter(name__icontains=word)
|
||||||
|
| Product.objects.filter(name__trigram_similar=word)
|
||||||
| Product.objects.filter(category__name__icontains=word)
|
| Product.objects.filter(category__name__icontains=word)
|
||||||
| Product.objects.filter(characteristics__in=car)
|
| Product.objects.filter(characteristics__in=car)
|
||||||
)
|
)
|
||||||
|
if any(
|
||||||
|
x in word
|
||||||
|
for x in "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
|
||||||
|
):
|
||||||
|
qs = qs | Product.objects.filter(
|
||||||
|
name__icontains=word.translate(
|
||||||
|
str.maketrans(
|
||||||
|
"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
|
||||||
|
"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(qs)
|
||||||
prod = prod & qs
|
prod = prod & qs
|
||||||
|
|
||||||
if u_qs:
|
if u_qs:
|
||||||
|
@ -116,11 +135,13 @@ def apply_all_qs_search(orig_qs, text: str):
|
||||||
return prod
|
return prod
|
||||||
|
|
||||||
|
|
||||||
|
@cache
|
||||||
def apply_qs_category(qs, name: str):
|
def apply_qs_category(qs, name: str):
|
||||||
qs = qs.filter(category__name__icontains=name)
|
qs = qs.filter(category__name__icontains=name)
|
||||||
return qs
|
return qs
|
||||||
|
|
||||||
|
|
||||||
|
@cache
|
||||||
def appy_qs_characteristic(qs, name: str):
|
def appy_qs_characteristic(qs, name: str):
|
||||||
char = ProductCharacteristic.objects.filter(product__in=qs)
|
char = ProductCharacteristic.objects.filter(product__in=qs)
|
||||||
char = char.filter(characteristic__value__icontains=name) | char.filter(
|
char = char.filter(characteristic__value__icontains=name) | char.filter(
|
||||||
|
|
|
@ -2,11 +2,15 @@ from typing import List, Dict
|
||||||
|
|
||||||
from rest_framework.exceptions import ValidationError
|
from rest_framework.exceptions import ValidationError
|
||||||
|
|
||||||
from search.models import Characteristic, ProductCharacteristic, ProductUnitCharacteristic, UnitCharacteristic
|
from search.models import (
|
||||||
|
Characteristic,
|
||||||
|
ProductCharacteristic,
|
||||||
|
ProductUnitCharacteristic,
|
||||||
|
UnitCharacteristic,
|
||||||
|
)
|
||||||
from search.services.hints import get_hints
|
from search.services.hints import get_hints
|
||||||
from search.services.search.methods import process_unit_operation
|
from search.services.search.methods import process_unit_operation
|
||||||
)
|
from search.services.spell_check import spell_check
|
||||||
from search.services.spell_check import spell_check_ru as spell_check
|
|
||||||
|
|
||||||
|
|
||||||
def apply_union(data: List[Dict]) -> List[Dict]:
|
def apply_union(data: List[Dict]) -> List[Dict]:
|
||||||
|
|
|
@ -13,3 +13,8 @@ celery==5.2.7
|
||||||
|
|
||||||
pyspellchecker==0.7.0
|
pyspellchecker==0.7.0
|
||||||
pymorphy2
|
pymorphy2
|
||||||
|
|
||||||
|
transformers
|
||||||
|
torch
|
||||||
|
scipy
|
||||||
|
numpy
|
||||||
|
|
Loading…
Reference in New Issue
Block a user