mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Support list values and INTERSECTS in Matcher (#8784)
* Support list values and IS_INTERSECT in Matcher * Support list values as token attributes for set operators, not just as pattern values. * Add `IS_INTERSECT` operator. * Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs. * Rename IS_INTERSECT to INTERSECTS
This commit is contained in:
parent
fbbbda1954
commit
175847f92c
|
@ -845,7 +845,7 @@ class _RegexPredicate:
|
||||||
|
|
||||||
|
|
||||||
class _SetPredicate:
|
class _SetPredicate:
|
||||||
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
|
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
|
@ -868,14 +868,16 @@ class _SetPredicate:
|
||||||
else:
|
else:
|
||||||
value = get_token_attr_for_matcher(token.c, self.attr)
|
value = get_token_attr_for_matcher(token.c, self.attr)
|
||||||
|
|
||||||
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
|
if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"):
|
||||||
if self.attr == MORPH:
|
if self.attr == MORPH:
|
||||||
# break up MORPH into individual Feat=Val values
|
# break up MORPH into individual Feat=Val values
|
||||||
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
|
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
|
||||||
else:
|
else:
|
||||||
# IS_SUBSET for other attrs will be equivalent to "IN"
|
# treat a single value as a list
|
||||||
# IS_SUPERSET will only match for other attrs with 0 or 1 values
|
if isinstance(value, (str, int)):
|
||||||
value = set([value])
|
value = set([get_string_id(value)])
|
||||||
|
else:
|
||||||
|
value = set(get_string_id(v) for v in value)
|
||||||
if self.predicate == "IN":
|
if self.predicate == "IN":
|
||||||
return value in self.value
|
return value in self.value
|
||||||
elif self.predicate == "NOT_IN":
|
elif self.predicate == "NOT_IN":
|
||||||
|
@ -884,6 +886,8 @@ class _SetPredicate:
|
||||||
return value <= self.value
|
return value <= self.value
|
||||||
elif self.predicate == "IS_SUPERSET":
|
elif self.predicate == "IS_SUPERSET":
|
||||||
return value >= self.value
|
return value >= self.value
|
||||||
|
elif self.predicate == "INTERSECTS":
|
||||||
|
return bool(value & self.value)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
|
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
|
||||||
|
@ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
"NOT_IN": _SetPredicate,
|
"NOT_IN": _SetPredicate,
|
||||||
"IS_SUBSET": _SetPredicate,
|
"IS_SUBSET": _SetPredicate,
|
||||||
"IS_SUPERSET": _SetPredicate,
|
"IS_SUPERSET": _SetPredicate,
|
||||||
|
"INTERSECTS": _SetPredicate,
|
||||||
"==": _ComparisonPredicate,
|
"==": _ComparisonPredicate,
|
||||||
"!=": _ComparisonPredicate,
|
"!=": _ComparisonPredicate,
|
||||||
">=": _ComparisonPredicate,
|
">=": _ComparisonPredicate,
|
||||||
|
|
|
@ -159,6 +159,7 @@ class TokenPatternString(BaseModel):
|
||||||
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||||
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
||||||
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
||||||
|
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
@ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel):
|
||||||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
||||||
ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
|
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
|
||||||
ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
|
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
|
||||||
|
INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects")
|
||||||
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
||||||
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
||||||
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
||||||
|
|
|
@ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab):
|
||||||
doc[0].tag_ = "A"
|
doc[0].tag_ = "A"
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
# IS_SUBSET with a list value
|
||||||
|
Token.set_extension("ext", default=[])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0]._.ext = ["A"]
|
||||||
|
doc[1]._.ext = ["C", "D"]
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_superset_value_operator(en_vocab):
|
def test_matcher_superset_value_operator(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
@ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab):
|
||||||
doc[0].tag_ = "A"
|
doc[0].tag_ = "A"
|
||||||
assert len(matcher(doc)) == 3
|
assert len(matcher(doc)) == 3
|
||||||
|
|
||||||
|
# IS_SUPERSET with a list value
|
||||||
|
Token.set_extension("ext", default=[])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0]._.ext = ["A", "B"]
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_intersect_value_operator(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
doc[0].set_morph("Feat=Val")
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
doc[0].set_morph("Feat=Val|Feat2=Val2")
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3")
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4")
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# INTERSECTS with a single value is the same as IN
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# INTERSECTS with an empty pattern list matches nothing
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"INTERSECTS": []}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
# INTERSECTS with a list value
|
||||||
|
Token.set_extension("ext", default=[])
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0]._.ext = ["A", "B"]
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# INTERSECTS with an empty pattern list matches nothing
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"_": {"ext": {"INTERSECTS": []}}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0]._.ext = ["A", "B"]
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
# INTERSECTS with an empty value matches nothing
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0]._.ext = []
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_morph_handling(en_vocab):
|
def test_matcher_morph_handling(en_vocab):
|
||||||
# order of features in pattern doesn't matter
|
# order of features in pattern doesn't matter
|
||||||
|
|
|
@ -77,13 +77,14 @@ it compares to another value.
|
||||||
> ]
|
> ]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
| -------------------------- | -------------------------------------------------------------------------------------------------------- |
|
||||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||||
| `ISSUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
||||||
| `ISSUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
||||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
|
||||||
|
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||||
|
|
||||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -240,13 +240,14 @@ following rich comparison attributes are available:
|
||||||
> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
|
> # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
| -------------------------- | --------------------------------------------------------------------------------------------------------- |
|
||||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||||
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
| `IS_SUBSET` | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~ |
|
||||||
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
| `IS_SUPERSET` | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~ |
|
||||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
| `INTERSECTS` | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ |
|
||||||
|
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||||
|
|
||||||
#### Regular expressions {#regex new="2.1"}
|
#### Regular expressions {#regex new="2.1"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user