Added Russian transliteration

This commit is contained in:
Alexander Karpov 2022-10-23 14:30:37 +03:00
parent 08a442d8f3
commit fc0049d816
7 changed files with 71 additions and 15 deletions

View File

@ -73,7 +73,7 @@ $ ./app/manage.py runserver
}, },
{ {
"value": "каучук", "value": "каучук",
"type": "Матерьял" "type": "Материал"
}, },
{ {
"value": "синий", "value": "синий",

View File

@ -216,3 +216,4 @@ REST_FRAMEWORK = {
# django-cors-headers # django-cors-headers
CORS_ALLOW_ALL_ORIGINS = True CORS_ALLOW_ALL_ORIGINS = True
YANDEX_DICT_API_KEY = env.str('YANDEX_DICT')

View File

@ -0,0 +1,25 @@
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from scipy.spatial import distance
# Module-level tokenizer and model for the
# "DeepPavlov/rubert-base-cased-sentence" checkpoint. They are created once
# when this module is imported and reused by get_embedding() below.
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
def get_embedding(word):
    """Return the model's pooled output for ``word`` as a NumPy array.

    Args:
        word: Text to embed; passed unchanged to the module-level
            ``tokenizer``.

    Returns:
        ``outputs.pooler_output`` converted to a NumPy array
        (presumably shaped ``(1, hidden_size)`` for a single input string;
        TODO confirm against the transformers documentation).
    """
    inputs = tokenizer(word, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for a forward pass whose gradients are never used.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.detach().numpy()
def get_distance(first_word, second_word):
    """Return a cosine-based similarity score between two texts.

    Despite the name, the returned value is ``1 - cosine_distance``, i.e. a
    similarity: higher means the two embeddings point in closer directions.
    The cosine distance is rounded to two decimals before the subtraction.

    Args:
        first_word: First text to compare.
        second_word: Second text to compare.

    Returns:
        ``1 - round(cosine_distance, 2)`` for the two embeddings.
    """
    # scipy's cosine() is defined for 1-D vectors, while get_embedding()
    # yields a 2-D array; flatten so the call does not depend on scipy
    # implicitly squeezing its inputs.
    w1 = np.ravel(get_embedding(first_word))
    w2 = np.ravel(get_embedding(second_word))
    cos_distance = np.round(distance.cosine(w1, w2), 2)
    return 1 - cos_distance
# Module-level call: executed every time this module is imported, and its
# result is discarded.
# NOTE(review): looks like a leftover manual smoke test — consider moving it
# under an `if __name__ == "__main__":` guard.
get_distance("электрогитара", "электрическая гитара")

View File

@ -21,7 +21,7 @@ def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
qs = qs & apply_qs_search(val) qs = qs & apply_qs_search(val)
qs = qs.order_by("-score") qs = qs.order_by("-score")
elif typ == "All": elif typ == "All":
qs = apply_all_qs_search(qs, val) & qs qs = apply_all_qs_search(val) & qs
elif typ == "Category": elif typ == "Category":
qs = apply_qs_category(qs, val) qs = apply_qs_category(qs, val)
qs = qs.order_by("-score") qs = qs.order_by("-score")
@ -35,4 +35,4 @@ def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
qs = qs.filter(unit_characteristics__in=val) qs = qs.filter(unit_characteristics__in=val)
else: else:
qs = qs.filter(characteristics__in=val) qs = qs.filter(characteristics__in=val)
return [x.serialize_self() for x in qs.distinct()[offset: offset + limit]] return [x.serialize_self() for x in qs.distinct()[offset : offset + limit]]

View File

@ -1,11 +1,14 @@
from functools import cache
from typing import List from typing import List
from django.utils.text import slugify
from search.models import ( from search.models import (
Product, Product,
ProductCharacteristic, ProductCharacteristic,
ProductUnitCharacteristic, ProductUnitCharacteristic,
) )
from search.services.spell_check import pos from search.services.spell_check import pos, spell_check
def _clean_text(text: str) -> List[str]: def _clean_text(text: str) -> List[str]:
@ -13,9 +16,11 @@ def _clean_text(text: str) -> List[str]:
text = text.replace(st, " ") text = text.replace(st, " ")
text = text.split() text = text.split()
functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"} # function words functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"} # function words
return [word for word in text if pos(word) not in functors_pos] text = [word for word in text if pos(word) not in functors_pos]
return [spell_check(x) for x in text]
@cache
def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str): def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
if operation.startswith("<=") or operation.startswith("=<"): if operation.startswith("<=") or operation.startswith("=<"):
return unit.filter( return unit.filter(
@ -41,20 +46,20 @@ def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: s
return unit return unit
@cache
def apply_qs_search(text: str): def apply_qs_search(text: str):
text = _clean_text(text) text = _clean_text(text)
products = Product.objects.none() qs = Product.objects.filter()
for word in text: for word in text:
products = ( qs = qs.filter(name__unaccent__trigram_similar=word) | qs.filter(
products name__unaccent__icontains=word
| Product.objects.filter(name__unaccent__icontains=word)
| Product.objects.filter(name__unaccent__trigram_similar=word)
) )
products = products.order_by("-score") products = qs.order_by("-score")
return products return products
def apply_all_qs_search(orig_qs, text: str): @cache
def apply_all_qs_search(text: str):
# words # words
text = _clean_text(text) text = _clean_text(text)
@ -105,9 +110,23 @@ def apply_all_qs_search(orig_qs, text: str):
) )
qs = ( qs = (
Product.objects.filter(name__icontains=word) Product.objects.filter(name__icontains=word)
| Product.objects.filter(name__trigram_similar=word)
| Product.objects.filter(category__name__icontains=word) | Product.objects.filter(category__name__icontains=word)
| Product.objects.filter(characteristics__in=car) | Product.objects.filter(characteristics__in=car)
) )
if any(
x in word
for x in "абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
):
qs = qs | Product.objects.filter(
name__icontains=word.translate(
str.maketrans(
"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA",
)
)
)
print(qs)
prod = prod & qs prod = prod & qs
if u_qs: if u_qs:
@ -116,11 +135,13 @@ def apply_all_qs_search(orig_qs, text: str):
return prod return prod
@cache
def apply_qs_category(qs, name: str): def apply_qs_category(qs, name: str):
qs = qs.filter(category__name__icontains=name) qs = qs.filter(category__name__icontains=name)
return qs return qs
@cache
def appy_qs_characteristic(qs, name: str): def appy_qs_characteristic(qs, name: str):
char = ProductCharacteristic.objects.filter(product__in=qs) char = ProductCharacteristic.objects.filter(product__in=qs)
char = char.filter(characteristic__value__icontains=name) | char.filter( char = char.filter(characteristic__value__icontains=name) | char.filter(

View File

@ -2,11 +2,15 @@ from typing import List, Dict
from rest_framework.exceptions import ValidationError from rest_framework.exceptions import ValidationError
from search.models import Characteristic, ProductCharacteristic, ProductUnitCharacteristic, UnitCharacteristic from search.models import (
Characteristic,
ProductCharacteristic,
ProductUnitCharacteristic,
UnitCharacteristic,
)
from search.services.hints import get_hints from search.services.hints import get_hints
from search.services.search.methods import process_unit_operation from search.services.search.methods import process_unit_operation
) from search.services.spell_check import spell_check
from search.services.spell_check import spell_check_ru as spell_check
def apply_union(data: List[Dict]) -> List[Dict]: def apply_union(data: List[Dict]) -> List[Dict]:

View File

@ -13,3 +13,8 @@ celery==5.2.7
pyspellchecker==0.7.0 pyspellchecker==0.7.0
pymorphy2 pymorphy2
transformers
torch
scipy
numpy