Support list values and INTERSECTS in Matcher (#8784)

* Support list values and IS_INTERSECT in Matcher
* Support list values as token attributes for set operators, not just as pattern values.
* Add `IS_INTERSECT` operator.
* Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs.
* Rename IS_INTERSECT to INTERSECTS
2025-07-15 10:42:34 +03:00 · 2021-08-02 19:39:26 +02:00 · 2021-08-02 19:39:26 +02:00 · 175847f92c
commit 175847f92c
parent fbbbda1954
5 changed files with 106 additions and 21 deletions
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -845,7 +845,7 @@ class _RegexPredicate:
 class _SetPredicate:
-    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
        self.i = i
@ -868,14 +868,16 @@ class _SetPredicate:
        else:
            value = get_token_attr_for_matcher(token.c, self.attr)
-        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
            if self.attr == MORPH:
                # break up MORPH into individual Feat=Val values
                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
            else:
-                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # treat a single value as a list
-                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                if isinstance(value, (str, int)):
-                value = set([value])
+                    value = set([get_string_id(value)])
                else:
                    value = set(get_string_id(v) for v in value)
        if self.predicate == "IN":
            return value in self.value
        elif self.predicate == "NOT_IN":
@ -884,6 +886,8 @@ class _SetPredicate:
            return value <= self.value
        elif self.predicate == "IS_SUPERSET":
            return value >= self.value
        elif self.predicate == "INTERSECTS":
            return bool(value & self.value)
    def __repr__(self):
        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
        "NOT_IN": _SetPredicate,
        "IS_SUBSET": _SetPredicate,
        "IS_SUPERSET": _SetPredicate,
        "INTERSECTS": _SetPredicate,
        "==": _ComparisonPredicate,
        "!=": _ComparisonPredicate,
        ">=": _ComparisonPredicate,
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -159,6 +159,7 @@ class TokenPatternString(BaseModel):
    NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
    INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
    class Config:
        extra = "forbid"
@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel):
    REGEX: Optional[StrictStr] = Field(None, alias="regex")
    IN: Optional[List[StrictInt]] = Field(None, alias="in")
    NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
-    ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
-    ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
    INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
    EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
    NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
    GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0
    # IS_SUBSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A"]
    doc[1]._.ext = ["C", "D"]
    assert len(matcher(doc)) == 2
 def test_matcher_superset_value_operator(en_vocab):
    matcher = Matcher(en_vocab)
@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 3
    # IS_SUPERSET with a list value
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1
 def test_matcher_intersect_value_operator(en_vocab):
    """Exercise the INTERSECTS set operator of the Matcher.

    Covers MORPH (compared as individual Feat=Val values), a single-valued
    attribute (TAG), and a custom list-valued extension attribute, including
    the empty-pattern and empty-value edge cases.
    """
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # no token has any morph features yet, so nothing intersects
    assert len(matcher(doc)) == 0
    # one shared feature on the first token is enough for a match
    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    # extra features outside the pattern list do not prevent a match
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # INTERSECTS on a single-valued attribute (TAG) is the same as IN
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing (single-valued attribute)
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # INTERSECTS with a list value stored on a custom extension attribute
    # NOTE(review): "ext" is also registered by the subset/superset tests in
    # this file without force=True -- confirm repeated registration does not
    # raise when the tests run in the same process.
    Token.set_extension("ext", default=[])
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # only the first token shares a value ("A") with the pattern list
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing (list-valued extension)
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 0

    # INTERSECTS with an empty token value matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = []
    assert len(matcher(doc)) == 0
 def test_matcher_morph_handling(en_vocab):
    # order of features in pattern doesn't matter
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@ -77,13 +77,14 @@ it compares to another value.
 > ]
 > ```
-| Attribute                  | Description                                                                                             |
+| Attribute                  | Description                                                                                              |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
+| -------------------------- | -------------------------------------------------------------------------------------------------------- |
-| `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
+| `IN`                       | Attribute value is member of a list. ~~Any~~                                                             |
-| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                       |
-| `ISSUBSET`                 | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                   |
-| `ISSUPERSET`               | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
+| `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                 |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~  |
 ## Matcher.\_\_init\_\_ {#init tag="method"}
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -240,13 +240,14 @@ following rich comparison attributes are available:
 > # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
 > ```
-| Attribute                  | Description                                                                                             |
+| Attribute                  | Description                                                                                               |
-| -------------------------- | ------------------------------------------------------------------------------------------------------- |
+| -------------------------- | --------------------------------------------------------------------------------------------------------- |
-| `IN`                       | Attribute value is member of a list. ~~Any~~                                                            |
+| `IN`                       | Attribute value is member of a list. ~~Any~~                                                              |
-| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      |
+| `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                        |
-| `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          |
+| `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                    |
-| `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        |
+| `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                  |
-| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
+| `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~   |
 #### Regular expressions {#regex new="2.1"}