Support list values and INTERSECTS in Matcher (#8784)

* Support list values and IS_INTERSECT in Matcher

* Support list values as token attributes for set operators, not just as
pattern values.

* Add `IS_INTERSECT` operator.

* Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs.

* Rename IS_INTERSECT to INTERSECTS
This commit is contained in:
Adriane Boyd 2021-08-02 19:39:26 +02:00 committed by svlandeg
parent d79dbd0624
commit c1caa47aa7
5 changed files with 106 additions and 21 deletions

View File

@ -845,7 +845,7 @@ class _RegexPredicate:
class _SetPredicate:
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
self.i = i
@ -868,14 +868,16 @@ class _SetPredicate:
else:
value = get_token_attr_for_matcher(token.c, self.attr)
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
if self.attr == MORPH:
# break up MORPH into individual Feat=Val values
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
else:
# IS_SUBSET for other attrs will be equivalent to "IN"
# IS_SUPERSET will only match for other attrs with 0 or 1 values
value = set([value])
# treat a single value as a list
if isinstance(value, (str, int)):
value = set([get_string_id(value)])
else:
value = set(get_string_id(v) for v in value)
if self.predicate == "IN":
return value in self.value
elif self.predicate == "NOT_IN":
@ -884,6 +886,8 @@ class _SetPredicate:
return value <= self.value
elif self.predicate == "IS_SUPERSET":
return value >= self.value
elif self.predicate == "INTERSECTS":
return bool(value & self.value)
def __repr__(self):
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
"NOT_IN": _SetPredicate,
"IS_SUBSET": _SetPredicate,
"IS_SUPERSET": _SetPredicate,
"INTERSECTS": _SetPredicate,
"==": _ComparisonPredicate,
"!=": _ComparisonPredicate,
">=": _ComparisonPredicate,

View File

@ -159,6 +159,7 @@ class TokenPatternString(BaseModel):
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
class Config:
extra = "forbid"
@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel):
REGEX: Optional[StrictStr] = Field(None, alias="regex")
IN: Optional[List[StrictInt]] = Field(None, alias="in")
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")

View File

@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
doc[0].tag_ = "A"
assert len(matcher(doc)) == 0
# IS_SUBSET with a list value
Token.set_extension("ext", default=[])
matcher = Matcher(en_vocab)
pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
doc[0]._.ext = ["A"]
doc[1]._.ext = ["C", "D"]
assert len(matcher(doc)) == 2
def test_matcher_superset_value_operator(en_vocab):
matcher = Matcher(en_vocab)
@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
doc[0].tag_ = "A"
assert len(matcher(doc)) == 3
# IS_SUPERSET with a list value
Token.set_extension("ext", default=[])
matcher = Matcher(en_vocab)
pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
matcher.add("M", [pattern])
doc = Doc(en_vocab, words=["a", "b", "c"])
doc[0]._.ext = ["A", "B"]
assert len(matcher(doc)) == 1
def test_matcher_intersect_value_operator(en_vocab):
    """Exercise the ``INTERSECTS`` set operator of the ``Matcher``.

    A token matches when the set built from its attribute value shares at
    least one element with the pattern list. Covered cases: the multi-valued
    ``MORPH`` attribute, a single-valued attribute (``TAG``), a custom
    list-valued extension attribute, and empty lists on either side.
    """
    # MORPH: the token's Feat=Val pairs are compared against the pattern list
    matcher = Matcher(en_vocab)
    pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    # no morphological features set yet -> nothing to intersect with
    assert len(matcher(doc)) == 0
    # one, two or all three listed features present: one shared element is enough
    doc[0].set_morph("Feat=Val")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2")
    assert len(matcher(doc)) == 1
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
    assert len(matcher(doc)) == 1
    # a feature that is not in the pattern list does not prevent the match
    doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
    assert len(matcher(doc)) == 1

    # INTERSECTS on a single-valued attribute behaves the same as IN
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"TAG": {"INTERSECTS": []}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0].tag_ = "A"
    assert len(matcher(doc)) == 0

    # INTERSECTS with a list value stored in a custom extension attribute.
    # force=True: other tests register an extension with the same name, and
    # spaCy rejects re-registering an existing extension unless forced, so
    # without it this test would depend on which test runs first.
    Token.set_extension("ext", default=[], force=True)
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 1

    # INTERSECTS with an empty pattern list matches nothing (list attribute)
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = ["A", "B"]
    assert len(matcher(doc)) == 0

    # INTERSECTS with an empty attribute value matches nothing
    matcher = Matcher(en_vocab)
    pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
    matcher.add("M", [pattern])
    doc = Doc(en_vocab, words=["a", "b", "c"])
    doc[0]._.ext = []
    assert len(matcher(doc)) == 0
def test_matcher_morph_handling(en_vocab):
# order of features in pattern doesn't matter

View File

@ -77,13 +77,14 @@ it compares to another value.
> ]
> ```
| Attribute | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
| Attribute | Description |
| -------------------------- | -------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
## Matcher.\_\_init\_\_ {#init tag="method"}

View File

@ -240,13 +240,14 @@ following rich comparison attributes are available:
> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
> ```
| Attribute | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
| Attribute | Description |
| -------------------------- | --------------------------------------------------------------------------------------------------------- |
| `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
#### Regular expressions {#regex new="2.1"}