From 08a442d8f33e89e7d45692fcc911e47d6c79f9d3 Mon Sep 17 00:00:00 2001
From: Alexandr Karpov <alexandr.d.karpov@gmail.com>
Date: Sun, 23 Oct 2022 12:11:53 +0300
Subject: [PATCH] Reformed project, improved text search

---
 app/search/api/views.py                    |   2 +-
 app/search/services/autocomplete_schema.py |  22 +-
 app/search/services/search.py              | 236 ---------------------
 app/search/services/search/__init__.py     |   0
 app/search/services/search/main.py         |  38 ++++
 app/search/services/search/methods.py      | 130 ++++++++++++
 app/search/services/search/prepare.py      |  87 ++++++++
 app/search/services/spell_check.py         |   7 +-
 requirements/base.txt                      |   3 +-
 9 files changed, 274 insertions(+), 251 deletions(-)
 delete mode 100644 app/search/services/search.py
 create mode 100644 app/search/services/search/__init__.py
 create mode 100644 app/search/services/search/main.py
 create mode 100644 app/search/services/search/methods.py
 create mode 100644 app/search/services/search/prepare.py

diff --git a/app/search/api/views.py b/app/search/api/views.py
index 734a775..bf8481c 100644
--- a/app/search/api/views.py
+++ b/app/search/api/views.py
@@ -15,7 +15,7 @@ from search.api.serializers import (
 )
 from search.models import Product
 from search.services.colors import group
-from search.services.search import process_search
+from search.services.search.main import process_search
 from search.services.autocomplete_schema import autocomplete_schema
 
 from search.services.hints import get_hints
diff --git a/app/search/services/autocomplete_schema.py b/app/search/services/autocomplete_schema.py
index 96ae0f3..8681dcb 100644
--- a/app/search/services/autocomplete_schema.py
+++ b/app/search/services/autocomplete_schema.py
@@ -8,6 +8,17 @@ def autocomplete_schema(val: str, exclude: List[Dict]):
     name_exclude = [x["value"] for x in exclude if x["type"] == "Name"]
     category_exclude = [x["value"] for x in exclude if x["type"] == "Category"]
     schema = []
+    schema.extend(
+        [
+            {
+                "coordinate": char["name"].lower().replace("ё", "е").index(val.lower()),
+                "value": {"type": char["name"] + "_numeric", "value": char["name"]},
+            }
+            for char in UnitCharacteristic.objects.filter(
+                name__unaccent__icontains=val
+            )[:20].values("name", "value")
+        ]
+    )
     if not category_exclude:
         schema.extend(
             [
@@ -64,15 +75,4 @@ def autocomplete_schema(val: str, exclude: List[Dict]):
             .values("name", "value")
         ]
     )
-    schema.extend(
-        [
-            {
-                "coordinate": char["name"].lower().replace("ё", "е").index(val.lower()),
-                "value": {"type": char["name"] + "_numeric", "value": char["name"]},
-            }
-            for char in UnitCharacteristic.objects.filter(
-                name__unaccent__icontains=val
-            )[:20].values("name", "value")
-        ]
-    )
     return schema
diff --git a/app/search/services/search.py b/app/search/services/search.py
deleted file mode 100644
index cbd4e70..0000000
--- a/app/search/services/search.py
+++ /dev/null
@@ -1,236 +0,0 @@
-from search.models import (
-    Product,
-    Characteristic,
-    ProductCharacteristic,
-    ProductUnitCharacteristic,
-    UnitCharacteristic,
-    Category,
-)
-from typing import List
-
-from search.services.hints import get_hints
-from search.services.spell_check import spell_check_ru as spell_check, lemmatize
-
-
-def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
-    if operation.startswith("<=") or operation.startswith("=<"):
-        return unit.filter(
-            characteristic__numeric_value_max__lte=int(float(operation[2:]))
-        )
-    elif operation.startswith("=>") or operation.startswith(">="):
-        return unit.filter(
-            characteristic__numeric_value_min__gte=int(float(operation[2:]))
-        )
-    elif operation.startswith(">"):
-        return unit.filter(
-            characteristic__numeric_value_min__gt=int(float(operation[1:]))
-        )
-    elif operation.startswith("<"):
-        return unit.filter(
-            characteristic__numeric_value_max__lt=int(float(operation[1:]))
-        )
-    elif operation.startswith("="):
-        return unit.filter(
-            characteristic__numeric_value_min__gte=int(float(operation[1:])),
-            characteristic__numeric_value_max__lte=int(float(operation[1:])),
-        )
-    return unit
-
-
-def _clean_text(text: str) -> List[str]:
-    for st in [".", ",", "!", "?"]:
-        text = text.replace(st, " ")
-    text = text.split()
-    re = []
-    for word in text:
-        re.append(word)
-    return re
-
-
-def apply_qs_search(text: str):
-    text = _clean_text(text)
-    products = Product.objects.none()
-    for word in text:
-        products = (
-            products
-            | Product.objects.filter(name__unaccent__trigram_similar=word)
-            | Product.objects.filter(name__unaccent__icontains=word)
-        )
-    products = products.order_by("-score")
-    return products
-
-
-def apply_all_qs_search(orig_qs, text: str):
-    # words
-    text = _clean_text(text)
-
-    u_qs = None
-
-    # try to find Unit characteristics
-    if any(x.isnumeric() for x in text):
-        u_qs = ProductUnitCharacteristic.objects.filter()
-        for i in range(len(text)):
-            el = text[i]
-            if el.isnumeric():
-                if i == len(text) - 1:
-                    if ProductUnitCharacteristic.objects.filter(
-                        characteristic__name__icontains=text[i - 1]
-                    ).exists():
-                        unit = ProductUnitCharacteristic.objects.filter(
-                            characteristic__name__icontains=text[i - 1]
-                        )
-                        u_qs = u_qs & process_unit_operation(unit, f"={text[i]}")
-                        del text[i]
-                        del text[i - 1]
-                        break
-                elif len(text) - 1 > i >= 1:
-                    if ProductUnitCharacteristic.objects.filter(
-                        characteristic__name__icontains=text[i - 1]
-                    ).exists():
-                        unit = ProductUnitCharacteristic.objects.filter(
-                            characteristic__name__icontains=text[i - 1]
-                        )[0]
-                        u_qs = u_qs & process_unit_operation(unit, f"={text[i]}")
-                        del text[i]
-                        del text[i - 1]
-                        break
-                    elif ProductUnitCharacteristic.objects.filter(
-                        characteristic__name__icontains=text[i + 1]
-                    ).exists():
-                        unit = UnitCharacteristic.objects.filter(
-                            ProductUnitCharacteristic=text[i + 1]
-                        )[0]
-                        u_qs = u_qs & process_unit_operation(unit, f"={text[i]}")
-                        del text[i]
-                        del text[i + 1]
-                        break
-                else:
-                    if ProductUnitCharacteristic.objects.filter(
-                        characteristic__name__icontains=text[i + 1]
-                    ).exists():
-                        unit = ProductUnitCharacteristic.objects.filter(
-                            characteristic__name__icontains=text[i + 1]
-                        )[0]
-                        u_qs = u_qs & process_unit_operation(unit, f"={text[i]}")
-                        del text[i]
-                        del text[i + 1]
-                        break
-
-    prod = Product.objects.filter()
-    for word in text:
-        car = ProductCharacteristic.objects.filter(
-            characteristic__value__icontains=word,
-        )
-        qs = (
-            Product.objects.filter(name__icontains=word)
-            | Product.objects.filter(name__unaccent__trigram_similar=word)
-            | Product.objects.filter(category__name__icontains=word)
-            | Product.objects.filter(characteristics__in=car)
-        )
-        prod = prod & qs
-        if u_qs:
-            prod = prod & Product.objects.filter(unit_characteristics__in=u_qs)
-
-    return prod
-
-
-def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
-    prep_data = []
-    prep_dict = {}
-    prep_dict_char_type = {}
-    # --------------------------------------- prepare filters -------------------------------------------------------- #
-    for x in data:
-        dat = dict(x)
-        if x["type"] in ["Name", "Category", "Characteristic", "All"]:
-            prep_data.append(
-                {
-                    "type": dat["type"],
-                    "value": spell_check(
-                        dat["value"],
-                    ),
-                }
-            )
-        elif x["type"] == "Unknown":
-            type = get_hints(dat["value"])
-            prep_data.append(
-                {
-                    "type": type,
-                    "value": spell_check(
-                        dat["value"],
-                    ),
-                }
-            )
-        else:
-            val = spell_check(
-                dat["value"],
-            )
-            if x["type"] in list(prep_dict.keys()):
-                if x["type"].startswith("*"):
-                    unit = ProductUnitCharacteristic.objects.filter(
-                        characteristic__in=prep_dict_char_type[x["type"]],
-                    )
-                    prep_dict[x["type"]] = prep_dict[
-                        x["type"]
-                    ] | process_unit_operation(unit, x["value"])
-                else:
-                    prep_dict[x["type"]] = (
-                        prep_dict[x["type"]]
-                        | ProductCharacteristic.objects.filter(
-                            characteristic__in=prep_dict_char_type[x["type"]],
-                            characteristic__value__unaccent__trigram_similar=val,
-                        )
-                        | ProductCharacteristic.objects.filter(
-                            characteristic__in=prep_dict_char_type[x["type"]],
-                            characteristic__value__icontains=val,
-                        )
-                    )
-            else:
-                if x["type"].startswith("*"):
-                    prep_dict_char_type[x["type"]] = UnitCharacteristic.objects.filter(
-                        name__unaccent__trigram_similar=x["type"]
-                    ) | UnitCharacteristic.objects.filter(name__icontains=x["type"])
-                    unit = ProductUnitCharacteristic.objects.filter(
-                        characteristic__in=prep_dict_char_type[x["type"]],
-                    )
-                    prep_dict[x["type"]] = process_unit_operation(unit, x["value"])
-                else:
-                    prep_dict_char_type[x["type"]] = Characteristic.objects.filter(
-                        name__unaccent__trigram_similar=x["type"]
-                    ) | Characteristic.objects.filter(name__icontains=x["type"])
-                    prep_dict[x["type"]] = ProductCharacteristic.objects.filter(
-                        characteristic__in=prep_dict_char_type[x["type"]],
-                        characteristic__value__unaccent__trigram_similar=val,
-                    ) | ProductCharacteristic.objects.filter(
-                        characteristic__in=prep_dict_char_type[x["type"]],
-                        characteristic__value__icontains=val,
-                    )
-    for el, val in prep_dict.items():
-        prep_data.append({"type": el, "value": val})
-    # ----------------------------------- apply filters on QuerySet -------------------------------------------------- #
-    qs = Product.objects.filter()
-    for x in prep_data:
-        typ = x["type"]
-        val = x["value"]
-        if typ == "Name":
-            qs = qs & apply_qs_search(val)
-            qs = qs.order_by("-score")
-        elif typ == "All":
-            qs = apply_all_qs_search(qs, val) & qs
-        elif typ == "Category":
-            qs = qs.filter(category__name__icontains=val)
-            qs = qs.order_by("-score")
-        elif typ == "Characteristic":
-            char = ProductCharacteristic.objects.filter(product__in=qs)
-            char = char.filter(characteristic__value__icontains=val) | char.filter(
-                characteristic__value__unaccent__trigram_similar=val
-            )
-            qs = qs.filter(characteristics__in=char)
-            qs = qs.order_by("-score")
-        elif typ == "Unknown":
-            continue
-        else:
-            if typ.startswith("*"):
-                qs = qs.filter(unit_characteristics__in=val)
-            else:
-                qs = qs.filter(characteristics__in=val)
-    return [x.serialize_self() for x in qs.distinct()[offset : offset + limit]]
diff --git a/app/search/services/search/__init__.py b/app/search/services/search/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/search/services/search/main.py b/app/search/services/search/main.py
new file mode 100644
index 0000000..a0450f6
--- /dev/null
+++ b/app/search/services/search/main.py
@@ -0,0 +1,38 @@
+from typing import List
+
+from search.services.search.methods import (
+    apply_qs_search,
+    apply_all_qs_search,
+    apply_qs_category,
+    appy_qs_characteristic,
+)
+from search.services.search.prepare import apply_union
+from search.models import Product
+
+
+def process_search(data: List[dict], limit=5, offset=0) -> List[dict]:
+    """Apply each prepared filter to ``Product`` and serialize one page.
+
+    ``data`` is normalised by ``apply_union``; the result is the
+    ``serialize_self()`` of the distinct rows in ``[offset, offset + limit)``.
+    """
+    prepared = apply_union(data)
+    matched = Product.objects.filter()
+    for entry in prepared:
+        kind, value = entry["type"], entry["value"]
+        if kind == "Unknown":
+            continue  # entries still typed "Unknown" add no filter
+        if kind == "Name":
+            matched = (matched & apply_qs_search(value)).order_by("-score")
+        elif kind == "All":
+            matched = apply_all_qs_search(matched, value) & matched
+        elif kind == "Category":
+            matched = apply_qs_category(matched, value).order_by("-score")
+        elif kind == "Characteristic":
+            matched = appy_qs_characteristic(matched, value).order_by("-score")
+        elif kind.startswith("*"):
+            matched = matched.filter(unit_characteristics__in=value)
+        else:
+            matched = matched.filter(characteristics__in=value)
+    page = matched.distinct()[offset: offset + limit]
+    return [product.serialize_self() for product in page]
diff --git a/app/search/services/search/methods.py b/app/search/services/search/methods.py
new file mode 100644
index 0000000..e25e74a
--- /dev/null
+++ b/app/search/services/search/methods.py
@@ -0,0 +1,130 @@
+from typing import List
+
+from search.models import (
+    Product,
+    ProductCharacteristic,
+    ProductUnitCharacteristic,
+)
+from search.services.spell_check import pos
+
+
+def _clean_text(text: str) -> List[str]:
+    """Split ``text`` on whitespace and ``.,!?``, dropping function words."""
+    functors_pos = {"INTJ", "PRCL", "CONJ", "PREP"}  # function words
+    # One translate pass turns each of the four punctuation marks into a space.
+    words = text.translate(str.maketrans(".,!?", "    ")).split()
+    return [word for word in words if pos(word) not in functors_pos]
+
+
+def process_unit_operation(unit: ProductUnitCharacteristic.objects, operation: str):
+    """Narrow ``unit`` by a comparison such as ``">=5"`` or ``"=2.5"``.
+
+    Supported operators: ``<=``/``=<``, ``>=``/``=>``, ``>``, ``<`` and ``=``
+    (``=`` requires min >= value and max <= value).  The number after the
+    operator is truncated with ``int(float(...))``; a string that starts with
+    no known operator returns ``unit`` unchanged.
+    """
+    min_field = "characteristic__numeric_value_min__"
+    max_field = "characteristic__numeric_value_max__"
+    # Two-character operators come first so ">=" is not consumed as ">".
+    rules = (
+        (("<=", "=<"), (max_field + "lte",)),
+        (("=>", ">="), (min_field + "gte",)),
+        ((">",), (min_field + "gt",)),
+        (("<",), (max_field + "lt",)),
+        (("=",), (min_field + "gte", max_field + "lte")),
+    )
+    for prefixes, lookups in rules:
+        if operation.startswith(prefixes):
+            bound = int(float(operation[len(prefixes[0]):]))
+            return unit.filter(**{lookup: bound for lookup in lookups})
+    return unit
+
+
+def apply_qs_search(text: str):
+    """Return products whose name matches any cleaned word of ``text``.
+
+    A word matches by unaccented substring or by trigram similarity; the
+    union over all words is ordered by descending ``score``.
+    """
+    found = Product.objects.none()
+    for term in _clean_text(text):
+        found = found | Product.objects.filter(name__unaccent__icontains=term)
+        found = found | Product.objects.filter(name__unaccent__trigram_similar=term)
+    return found.order_by("-score")
+
+
+def _unit_rows_for(name: str, number: str):
+    """Return unit rows named like ``name`` filtered by ``=number``, else None."""
+    rows = ProductUnitCharacteristic.objects.filter(
+        characteristic__name__icontains=name
+    )
+    # exists() asks the database for one row instead of loading them all.
+    if not rows.exists():
+        return None
+    return process_unit_operation(rows, f"={number}")
+
+
+def apply_all_qs_search(orig_qs, text: str):
+    """Search products by free text across name, category and characteristics.
+
+    The first numeric word that sits next to a unit-characteristic name (for
+    example ``"weight 5"``) becomes a unit filter and both words are removed
+    from the text search.  Every remaining word must match the product name,
+    its category name or one of its characteristic values.
+
+    ``orig_qs`` is kept for interface compatibility and is not used here.
+    A number with no matching unit name next to it stays an ordinary search
+    word and adds no unit filter.
+    """
+    words = _clean_text(text)
+
+    u_qs = None
+    for i, word in enumerate(words):
+        if not word.isnumeric():
+            continue
+        # Following word first, then the preceding one.  The bounds check
+        # stops index -1 from wrapping around to the number itself, which
+        # raised IndexError for a query made of a single number.
+        for j in (i + 1, i - 1):
+            if 0 <= j < len(words):
+                u_qs = _unit_rows_for(words[j], word)
+                if u_qs is not None:
+                    break
+        if u_qs is not None:
+            # Drop the higher index first so the lower one stays valid.
+            del words[max(i, j)]
+            del words[min(i, j)]
+            break
+
+    prod = Product.objects.filter()
+    for word in words:
+        car = ProductCharacteristic.objects.filter(
+            characteristic__value__icontains=word,
+        )
+        prod = prod & (
+            Product.objects.filter(name__icontains=word)
+            | Product.objects.filter(category__name__icontains=word)
+            | Product.objects.filter(characteristics__in=car)
+        )
+
+    # Applied once, outside the word loop, so a query made only of the unit
+    # pair is still filtered.  A unit filter that matches no rows is skipped.
+    if u_qs is not None and u_qs.exists():
+        prod = prod & Product.objects.filter(unit_characteristics__in=u_qs)
+
+    return prod
+
+
+def apply_qs_category(qs, name: str):
+    """Keep only rows of ``qs`` whose category name contains ``name`` (case-insensitive)."""
+    return qs.filter(category__name__icontains=name)
+
+
+def appy_qs_characteristic(qs, name: str):
+    """Keep rows of ``qs`` having a characteristic value that resembles ``name``."""
+    # Only characteristic links of products already in ``qs`` are considered.
+    linked = ProductCharacteristic.objects.filter(product__in=qs)
+    by_substring = linked.filter(characteristic__value__icontains=name)
+    by_trigram = linked.filter(characteristic__value__unaccent__trigram_similar=name)
+    return qs.filter(characteristics__in=by_substring | by_trigram)
diff --git a/app/search/services/search/prepare.py b/app/search/services/search/prepare.py
new file mode 100644
index 0000000..6e19e80
--- /dev/null
+++ b/app/search/services/search/prepare.py
@@ -0,0 +1,87 @@
+from typing import List, Dict
+
+from rest_framework.exceptions import ValidationError
+
+from search.models import Characteristic, ProductCharacteristic
+from search.models import ProductUnitCharacteristic, UnitCharacteristic
+from search.services.hints import get_hints
+from search.services.search.methods import process_unit_operation
+from search.services.spell_check import spell_check_ru as spell_check
+
+
+def apply_union(data: List[Dict]) -> List[Dict]:
+    """Normalise raw search filters into a list of ``{"type", "value"}`` dicts.
+
+    * ``Name``/``Category``/``Characteristic``/``All`` entries keep their type
+      and get a spell-checked value.
+    * ``Unknown`` entries take their type from ``get_hints``.
+    * Any other type is treated as a characteristic name: entries sharing a
+      type are OR-ed into one queryset, appended after the entries above.  A
+      type starting with ``*`` is a unit characteristic whose value is a
+      comparison handled by ``process_unit_operation``.
+
+    Raises ``ValidationError`` when an entry lacks ``type`` or ``value``.
+    """
+    plain_types = ["Name", "Category", "Characteristic", "All"]
+    prepared = []
+    unions = {}  # type -> queryset of product-characteristic rows so far
+    candidates = {}  # type -> characteristics whose name resembles the type
+    # ---- prepare filters ---- #
+    for x in data:
+        dat = dict(x)
+        if "type" not in dat or "value" not in dat:
+            raise ValidationError("Improper body structure")
+
+        key = x["type"]
+        if key in plain_types:
+            prepared.append(
+                {"type": dat["type"], "value": spell_check(dat["value"])}
+            )
+            continue
+        if key == "Unknown":
+            hinted = get_hints(dat["value"])
+            prepared.append(
+                {"type": hinted, "value": spell_check(dat["value"])}
+            )
+            continue
+
+        # Anything else names a characteristic; "*" marks a unit (numeric) one.
+        val = spell_check(dat["value"])
+        is_unit = key.startswith("*")
+        seen = key in unions
+        if not seen:
+            # First entry of this type: resolve the candidate characteristics.
+            model = UnitCharacteristic if is_unit else Characteristic
+            candidates[key] = model.objects.filter(
+                name__unaccent__trigram_similar=key
+            ) | model.objects.filter(name__icontains=key)
+
+        # Unit types compare numbers; other types match the value as text.
+        if is_unit:
+            unit = ProductUnitCharacteristic.objects.filter(
+                characteristic__in=candidates[key],
+            )
+            parts = [process_unit_operation(unit, x["value"])]
+        else:
+            parts = [
+                ProductCharacteristic.objects.filter(
+                    characteristic__in=candidates[key],
+                    characteristic__value__unaccent__trigram_similar=val,
+                ),
+                ProductCharacteristic.objects.filter(
+                    characteristic__in=candidates[key],
+                    characteristic__value__icontains=val,
+                ),
+            ]
+        # A repeated type widens the queryset collected so far.
+        if seen:
+            parts.insert(0, unions[key])
+        # Left fold keeps the grouping ((previous | a) | b).
+        combined = parts[0]
+        for part in parts[1:]:
+            combined = combined | part
+        unions[key] = combined
+
+    # Merged characteristic filters go after the directly typed entries.
+    prepared.extend({"type": el, "value": val} for el, val in unions.items())
+    return prepared
diff --git a/app/search/services/spell_check.py b/app/search/services/spell_check.py
index 9995cac..00697b3 100644
--- a/app/search/services/spell_check.py
+++ b/app/search/services/spell_check.py
@@ -23,5 +23,8 @@ morph = pymorphy2.MorphAnalyzer()
 
 
 def lemmatize(word):
-    p = morph.parse(word)[0]
-    return p.normal_form
+    return morph.parse(word)[0].normal_form
+
+
+def pos(word):  # -> ``tag.POS`` of the first parse ``morph`` returns for ``word``
+    return morph.parse(word)[0].tag.POS
diff --git a/requirements/base.txt b/requirements/base.txt
index aae357c..bab147f 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -11,4 +11,5 @@ psycopg2-binary==2.9.4
 
 celery==5.2.7
 
-pyspellchecker==0.7.0
\ No newline at end of file
+pyspellchecker==0.7.0
+pymorphy2