From 08a442d8f33e89e7d45692fcc911e47d6c79f9d3 Mon Sep 17 00:00:00 2001 From: Alexandr Karpov Date: Sun, 23 Oct 2022 12:11:53 +0300 Subject: [PATCH] Reformed project, improved text search --- app/search/api/views.py | 2 +- app/search/services/autocomplete_schema.py | 22 +- app/search/services/search.py | 236 --------------------- app/search/services/search/__init__.py | 0 app/search/services/search/main.py | 38 ++++ app/search/services/search/methods.py | 130 ++++++++++++ app/search/services/search/prepare.py | 87 ++++++++ app/search/services/spell_check.py | 7 +- requirements/base.txt | 3 +- 9 files changed, 274 insertions(+), 251 deletions(-) delete mode 100644 app/search/services/search.py create mode 100644 app/search/services/search/__init__.py create mode 100644 app/search/services/search/main.py create mode 100644 app/search/services/search/methods.py create mode 100644 app/search/services/search/prepare.py diff --git a/app/search/api/views.py b/app/search/api/views.py index 734a775..bf8481c 100644 --- a/app/search/api/views.py +++ b/app/search/api/views.py @@ -15,7 +15,7 @@ from search.api.serializers import ( ) from search.models import Product from search.services.colors import group -from search.services.search import process_search +from search.services.search.main import process_search from search.services.autocomplete_schema import autocomplete_schema from search.services.hints import get_hints diff --git a/app/search/services/autocomplete_schema.py b/app/search/services/autocomplete_schema.py index 96ae0f3..8681dcb 100644 --- a/app/search/services/autocomplete_schema.py +++ b/app/search/services/autocomplete_schema.py @@ -8,6 +8,17 @@ def autocomplete_schema(val: str, exclude: List[Dict]): name_exclude = [x["value"] for x in exclude if x["type"] == "Name"] category_exclude = [x["value"] for x in exclude if x["type"] == "Category"] schema = [] + schema.extend( + [ + { + "coordinate": char["name"].lower().replace("ё", "е").index(val.lower()), + "value": {"type": char["name"] + "_numeric", "value": char["name"]}, + } + for char in UnitCharacteristic.objects.filter( + name__unaccent__icontains=val + )[:20].values("name", "value") + ] + ) if not category_exclude: schema.extend( [ @@ -64,15 +75,4 @@ def autocomplete_schema(val: str, exclude: List[Dict]): .values("name", "value") ] ) - schema.extend( - [ - { - "coordinate": char["name"].lower().replace("ё", "е").index(val.lower()), - "value": {"type": char["name"] + "_numeric", "value": char["name"]}, - } - for char in UnitCharacteristic.objects.filter( - name__unaccent__icontains=val - )[:20].values("name", "value") - ] - ) return schema diff --git a/app/search/services/search.py b/app/search/services/search.py deleted file mode 100644 index cbd4e70..0000000 --- a/app/search/services/search.py +++ /dev/null @@ -1,236 +0,0 @@ -from search.models import ( - Product, - Characteristic, - ProductCharacteristic, - ProductUnitCharacteristic, - UnitCharacteristic, - Category, -) -from typing import List - -from search.services.hints import get_hints -from search.services.spell_check import spell_check_ru as spell_check, lemmatize - - -def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str): - if operation.startswith("<=") or operation.startswith("=<"): - return unit.filter( - characteristic__numeric_value_max__lte=int(float(operation[2:])) - ) - elif operation.startswith("=>") or operation.startswith(">="): - return unit.filter( - characteristic__numeric_value_min__gte=int(float(operation[2:])) - ) - elif operation.startswith(">"): - return unit.filter( - characteristic__numeric_value_min__gt=int(float(operation[1:])) - ) - elif operation.startswith("<"): - return unit.filter( - characteristic__numeric_value_max__lt=int(float(operation[1:])) - ) - elif operation.startswith("="): - return unit.filter( - characteristic__numeric_value_min__gte=int(float(operation[1:])), - characteristic__numeric_value_max__lte=int(float(operation[1:])), - ) - return unit - - -def _clean_text(text: str) -> List[str]: - for st in [".", ",", "!", "?"]: - text = text.replace(st, " ") - text = text.split() - re = [] - for word in text: - re.append(word) - return re - - -def apply_qs_search(text: str): - text = _clean_text(text) - products = Product.objects.none() - for word in text: - products = ( - products - | Product.objects.filter(name__unaccent__trigram_similar=word) - | Product.objects.filter(name__unaccent__icontains=word) - ) - products = products.order_by("-score") - return products - - -def apply_all_qs_search(orig_qs, text: str): - # words - text = _clean_text(text) - - u_qs = None - - # try to find Unit characteristics - if any(x.isnumeric() for x in text): - u_qs = ProductUnitCharacteristic.objects.filter() - for i in range(len(text)): - el = text[i] - if el.isnumeric(): - if i == len(text) - 1: - if ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i - 1] - ).exists(): - unit = ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i - 1] - ) - u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") - del text[i] - del text[i - 1] - break - elif len(text) - 1 > i >= 1: - if ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i - 1] - ).exists(): - unit = ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i - 1] - )[0] - u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") - del text[i] - del text[i - 1] - break - elif ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i + 1] - ).exists(): - unit = UnitCharacteristic.objects.filter( - ProductUnitCharacteristic=text[i + 1] - )[0] - u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") - del text[i] - del text[i + 1] - break - else: - if ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i + 1] - ).exists(): - unit = ProductUnitCharacteristic.objects.filter( - characteristic__name__icontains=text[i + 1] - )[0] - u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") - del text[i] - del text[i + 1] - break - - prod = Product.objects.filter() - for word in text: - car = ProductCharacteristic.objects.filter( - characteristic__value__icontains=word, - ) - qs = ( - Product.objects.filter(name__icontains=word) - | Product.objects.filter(name__unaccent__trigram_similar=word) - | Product.objects.filter(category__name__icontains=word) - | Product.objects.filter(characteristics__in=car) - ) - prod = prod & qs - if u_qs: - prod = prod & Product.objects.filter(unit_characteristics__in=u_qs) - - return prod - - -def process_search(data: List[dict], limit=5, offset=0) -> List[dict]: - prep_data = [] - prep_dict = {} - prep_dict_char_type = {} - # --------------------------------------- prepare filters -------------------------------------------------------- # - for x in data: - dat = dict(x) - if x["type"] in ["Name", "Category", "Characteristic", "All"]: - prep_data.append( - { - "type": dat["type"], - "value": spell_check( - dat["value"], - ), - } - ) - elif x["type"] == "Unknown": - type = get_hints(dat["value"]) - prep_data.append( - { - "type": type, - "value": spell_check( - dat["value"], - ), - } - ) - else: - val = spell_check( - dat["value"], - ) - if x["type"] in list(prep_dict.keys()): - if x["type"].startswith("*"): - unit = ProductUnitCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - ) - prep_dict[x["type"]] = prep_dict[ - x["type"] - ] | process_unit_operation(unit, x["value"]) - else: - prep_dict[x["type"]] = ( - prep_dict[x["type"]] - | ProductCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - characteristic__value__unaccent__trigram_similar=val, - ) - | ProductCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - characteristic__value__icontains=val, - ) - ) - else: - if x["type"].startswith("*"): - prep_dict_char_type[x["type"]] = UnitCharacteristic.objects.filter( - name__unaccent__trigram_similar=x["type"] - ) | UnitCharacteristic.objects.filter(name__icontains=x["type"]) - unit = ProductUnitCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - ) - prep_dict[x["type"]] = process_unit_operation(unit, x["value"]) - else: - prep_dict_char_type[x["type"]] = Characteristic.objects.filter( - name__unaccent__trigram_similar=x["type"] - ) | Characteristic.objects.filter(name__icontains=x["type"]) - prep_dict[x["type"]] = ProductCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - characteristic__value__unaccent__trigram_similar=val, - ) | ProductCharacteristic.objects.filter( - characteristic__in=prep_dict_char_type[x["type"]], - characteristic__value__icontains=val, - ) - for el, val in prep_dict.items(): - prep_data.append({"type": el, "value": val}) - # ----------------------------------- apply filters on QuerySet -------------------------------------------------- # - qs = Product.objects.filter() - for x in prep_data: - typ = x["type"] - val = x["value"] - if typ == "Name": - qs = qs & apply_qs_search(val) - qs = qs.order_by("-score") - elif typ == "All": - qs = apply_all_qs_search(qs, val) & qs - elif typ == "Category": - qs = qs.filter(category__name__icontains=val) - qs = qs.order_by("-score") - elif typ == "Characteristic": - char = ProductCharacteristic.objects.filter(product__in=qs) - char = char.filter(characteristic__value__icontains=val) | char.filter( - characteristic__value__unaccent__trigram_similar=val - ) - qs = qs.filter(characteristics__in=char) - qs = qs.order_by("-score") - elif typ == "Unknown": - continue - else: - if typ.startswith("*"): - qs = qs.filter(unit_characteristics__in=val) - else: - qs = qs.filter(characteristics__in=val) - return [x.serialize_self() for x in qs.distinct()[offset : offset + limit]] diff --git a/app/search/services/search/__init__.py b/app/search/services/search/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/search/services/search/main.py b/app/search/services/search/main.py new file mode 100644 index 0000000..a0450f6 --- /dev/null +++ b/app/search/services/search/main.py @@ -0,0 +1,38 @@ +from typing import List + +from search.services.search.methods import ( + apply_qs_search, + apply_all_qs_search, + apply_qs_category, + appy_qs_characteristic, +) +from search.services.search.prepare import apply_union +from search.models import Product + + +def process_search(data: List[dict], limit=5, offset=0) -> List[dict]: + prep_data = apply_union(data) + # ----------------------------------- apply filters on QuerySet -------------------------------------------------- # + qs = Product.objects.filter() + for x in prep_data: + typ = x["type"] + val = x["value"] + if typ == "Name": + qs = qs & apply_qs_search(val) + qs = qs.order_by("-score") + elif typ == "All": + qs = apply_all_qs_search(qs, val) & qs + elif typ == "Category": + qs = apply_qs_category(qs, val) + qs = qs.order_by("-score") + elif typ == "Characteristic": + qs = appy_qs_characteristic(qs, val) + qs = qs.order_by("-score") + elif typ == "Unknown": + continue + else: + if typ.startswith("*"): + qs = qs.filter(unit_characteristics__in=val) + else: + qs = qs.filter(characteristics__in=val) + return [x.serialize_self() for x in qs.distinct()[offset: offset + limit]] diff --git a/app/search/services/search/methods.py b/app/search/services/search/methods.py new file mode 100644 index 0000000..e25e74a --- /dev/null +++ b/app/search/services/search/methods.py @@ -0,0 +1,130 @@ +from typing import List + +from search.models import ( + Product, + ProductCharacteristic, + ProductUnitCharacteristic, +) +from search.services.spell_check import pos + + +def _clean_text(text: str) -> List[str]: + for st in [".", ",", "!", "?"]: + text = text.replace(st, " ") + text = text.split() + functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"} # function words + return [word for word in text if pos(word) not in functors_pos] + + +def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str): + if operation.startswith("<=") or operation.startswith("=<"): + return unit.filter( + characteristic__numeric_value_max__lte=int(float(operation[2:])) + ) + elif operation.startswith("=>") or operation.startswith(">="): + return unit.filter( + characteristic__numeric_value_min__gte=int(float(operation[2:])) + ) + elif operation.startswith(">"): + return unit.filter( + characteristic__numeric_value_min__gt=int(float(operation[1:])) + ) + elif operation.startswith("<"): + return unit.filter( + characteristic__numeric_value_max__lt=int(float(operation[1:])) + ) + elif operation.startswith("="): + return unit.filter( + characteristic__numeric_value_min__gte=int(float(operation[1:])), + characteristic__numeric_value_max__lte=int(float(operation[1:])), + ) + return unit + + +def apply_qs_search(text: str): + text = _clean_text(text) + products = Product.objects.none() + for word in text: + products = ( + products + | Product.objects.filter(name__unaccent__icontains=word) + | Product.objects.filter(name__unaccent__trigram_similar=word) + ) + products = products.order_by("-score") + return products + + +def apply_all_qs_search(orig_qs, text: str): + # words + text = _clean_text(text) + + u_qs = None + + # try to find Unit characteristics + if any(x.isnumeric() for x in text): + u_qs = ProductUnitCharacteristic.objects.filter() + for i in range(len(text)): + el = text[i] + if el.isnumeric(): + if i == len(text) - 1: + if unit := ProductUnitCharacteristic.objects.filter( + characteristic__name__icontains=text[i - 1] + ): + u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") + del text[i - 1] + del text[i - 1] + break + elif len(text) - 1 > i >= 1: + if unit := ProductUnitCharacteristic.objects.filter( + characteristic__name__icontains=text[i + 1] + ): + u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") + del text[i] + del text[i] + break + elif unit := ProductUnitCharacteristic.objects.filter( + characteristic__name__icontains=text[i - 1] + ): + u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") + del text[i - 1] + del text[i - 1] + break + else: + if unit := ProductUnitCharacteristic.objects.filter( + characteristic__name__icontains=text[i + 1] + ): + u_qs = u_qs & process_unit_operation(unit, f"={text[i]}") + del text[i] + del text[i] + break + + prod = Product.objects.filter() + for word in text: + car = ProductCharacteristic.objects.filter( + characteristic__value__icontains=word, + ) + qs = ( + Product.objects.filter(name__icontains=word) + | Product.objects.filter(category__name__icontains=word) + | Product.objects.filter(characteristics__in=car) + ) + prod = prod & qs + + if u_qs: + prod = prod & Product.objects.filter(unit_characteristics__in=u_qs) + + return prod + + +def apply_qs_category(qs, name: str): + qs = qs.filter(category__name__icontains=name) + return qs + + +def appy_qs_characteristic(qs, name: str): + char = ProductCharacteristic.objects.filter(product__in=qs) + char = char.filter(characteristic__value__icontains=name) | char.filter( + characteristic__value__unaccent__trigram_similar=name + ) + qs = qs.filter(characteristics__in=char) + return qs diff --git a/app/search/services/search/prepare.py b/app/search/services/search/prepare.py new file mode 100644 index 0000000..6e19e80 --- /dev/null +++ b/app/search/services/search/prepare.py @@ -0,0 +1,87 @@ +from typing import List, Dict + +from rest_framework.exceptions import ValidationError + +from search.models import Characteristic, ProductCharacteristic, ProductUnitCharacteristic, UnitCharacteristic +from search.services.hints import get_hints +from search.services.search.methods import process_unit_operation +) +from search.services.spell_check import spell_check_ru as spell_check + + +def apply_union(data: List[Dict]) -> List[Dict]: + prep_data = [] + prep_dict = {} + prep_dict_char_type = {} + # --------------------------------------- prepare filters -------------------------------------------------------- # + for x in data: + dat = dict(x) + if "type" not in dat or "value" not in dat: + raise ValidationError("Improper body structure") + + if x["type"] in ["Name", "Category", "Characteristic", "All"]: + prep_data.append( + { + "type": dat["type"], + "value": spell_check( + dat["value"], + ), + } + ) + elif x["type"] == "Unknown": + type = get_hints(dat["value"]) + prep_data.append( + { + "type": type, + "value": spell_check( + dat["value"], + ), + } + ) + else: + val = spell_check( + dat["value"], + ) + if x["type"] in list(prep_dict.keys()): + if x["type"].startswith("*"): + unit = ProductUnitCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + ) + prep_dict[x["type"]] = prep_dict[ + x["type"] + ] | process_unit_operation(unit, x["value"]) + else: + prep_dict[x["type"]] = ( + prep_dict[x["type"]] + | ProductCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + characteristic__value__unaccent__trigram_similar=val, + ) + | ProductCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + characteristic__value__icontains=val, + ) + ) + else: + if x["type"].startswith("*"): + prep_dict_char_type[x["type"]] = UnitCharacteristic.objects.filter( + name__unaccent__trigram_similar=x["type"] + ) | UnitCharacteristic.objects.filter(name__icontains=x["type"]) + unit = ProductUnitCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + ) + prep_dict[x["type"]] = process_unit_operation(unit, x["value"]) + else: + prep_dict_char_type[x["type"]] = Characteristic.objects.filter( + name__unaccent__trigram_similar=x["type"] + ) | Characteristic.objects.filter(name__icontains=x["type"]) + prep_dict[x["type"]] = ProductCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + characteristic__value__unaccent__trigram_similar=val, + ) | ProductCharacteristic.objects.filter( + characteristic__in=prep_dict_char_type[x["type"]], + characteristic__value__icontains=val, + ) + for el, val in prep_dict.items(): + prep_data.append({"type": el, "value": val}) + return prep_data diff --git a/app/search/services/spell_check.py b/app/search/services/spell_check.py index 9995cac..00697b3 100644 --- a/app/search/services/spell_check.py +++ b/app/search/services/spell_check.py @@ -23,5 +23,8 @@ morph = pymorphy2.MorphAnalyzer() def lemmatize(word): - p = morph.parse(word)[0] - return p.normal_form + return morph.parse(word)[0].normal_form + + +def pos(word): + return morph.parse(word)[0].tag.POS diff --git a/requirements/base.txt b/requirements/base.txt index aae357c..bab147f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -11,4 +11,5 @@ psycopg2-binary==2.9.4 celery==5.2.7 -pyspellchecker==0.7.0 \ No newline at end of file +pyspellchecker==0.7.0 +pymorphy2