Add MORPH handling to Matcher (#6107)

* Add MORPH handling to Matcher

* Add `MORPH` to `Matcher` schema
* Rename `_SetMemberPredicate` to `_SetPredicate`
* Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate`
  * Add special handling for normalization and conversion of morph
    values into sets
  * For other attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only
    matches for 0 or 1 values (see the usage sketch after this list)

* Update test

* Rename to IS_SUBSET and IS_SUPERSET
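
A minimal usage sketch of the new operators (illustrative only, not part of the diff; `vocab` and `doc` are assumed to be an existing `Vocab` and a `Doc` whose tokens carry morphological features):

```python
from spacy.matcher import Matcher

matcher = Matcher(vocab)  # vocab: an existing Vocab (assumed)
# MORPH features must all come from the listed Feat=Val pairs
matcher.add("SUB", [[{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]])
# MORPH features must include at least the listed pair
matcher.add("SUP", [[{"MORPH": {"IS_SUPERSET": ["Feat=Val"]}}]])
matches = matcher(doc)  # doc: a Doc with morph features set (assumed)
```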
Adriane Boyd, 2020-09-24 16:55:09 +02:00, committed by GitHub
parent 59340606b7
commit 3c062b3911
5 changed files with 174 additions and 43 deletions

spacy/matcher/matcher.pyx

@@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
 from ..schemas import validate_token_pattern
@@ -124,7 +125,7 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
@@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
@@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
     extra_predicates.
     """
     tokens = []
+    string_store = vocab.strings
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
@@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
@@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store):


 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -750,13 +752,18 @@ class _RegexPredicate:
         return bool(self.value.search(value))


-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -768,19 +775,32 @@ class _SetMemberPredicate:
             value = get_string_id(token._.get(self.attr))
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)

+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
         if self.predicate == "IN":
             return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
             return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value

     def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))


 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -809,11 +829,13 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
@@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                     # Don't create redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
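
The set comparisons that `_SetPredicate.__call__` performs above reduce to Python's `<=` and `>=` on sets of `Feat=Val` IDs. A toy illustration, separate from the diff:

```python
# IS_SUBSET maps to <=, IS_SUPERSET maps to >= on plain Python sets
token_feats = {"Feat=Val", "Feat2=Val2"}
pattern_feats = {"Feat=Val", "Feat2=Val2", "Feat3=Val3"}

assert token_feats <= pattern_feats      # IS_SUBSET: every token feature is listed in the pattern
assert not token_feats >= pattern_feats  # IS_SUPERSET fails: the token lacks Feat3=Val3
```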

spacy/schemas.py

@@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")

     class Config:
         extra = "forbid"
@@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
     lower: Optional[StringValue] = None
     pos: Optional[StringValue] = None
     tag: Optional[StringValue] = None
+    morph: Optional[StringValue] = None
     dep: Optional[StringValue] = None
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
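
A quick sketch of how the updated schema behaves (assuming `validate_token_pattern` returns a list of error messages that is empty for valid patterns):

```python
from spacy.schemas import validate_token_pattern

# The new MORPH key and set operators validate cleanly
errors = validate_token_pattern([{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}])
assert errors == []

# Unknown operator keys are still rejected via extra = "forbid"
errors = validate_token_pattern([{"MORPH": {"SUBSET_OF": ["Feat=Val"]}}])  # hypothetical bad key
assert errors != []
```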

spacy/tests/matcher/test_matcher_api.py

@@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
     assert len(matches) == 1


+def test_matcher_subset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 2
+
+    # IS_SUBSET acts like "IN" for attrs other than MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUBSET with an empty list matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with more than one value only matches for MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # IS_SUPERSET with one value is the same as ==
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with an empty value matches everything
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+    # order of features in pattern doesn't matter
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+    # multiple values are split
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+
 def test_matcher_regex(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]

website/docs/api/matcher.md

@@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH` | The exact verbatim text of a token. ~~str~~ |
-| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
-| `LOWER` | The lowercase form of the token text. ~~str~~ |
-| `LENGTH` | The length of the token text. ~~int~~ |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
-| `ENT_TYPE` | The token's entity label. ~~str~~ |
-| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
+| Attribute | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH` | The exact verbatim text of a token. ~~str~~ |
+| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
+| `LOWER` | The lowercase form of the token text. ~~str~~ |
+| `LENGTH` | The length of the token text. ~~int~~ |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
+| `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |

 Operators and quantifiers define **how often** a token pattern should be
 matched:
@@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN` | Attribute value is member of a list. ~~Any~~ |
 | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
+| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
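
For example, a pattern using the new set operators might look like the following (a minimal sketch; `nlp` and `doc` are assumed to be a loaded pipeline and a processed `Doc` with morphological features):

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
# The token's morphological features must all be drawn from this list
pattern = [{"MORPH": {"IS_SUBSET": ["Number=Sing", "Person=3"]}}]
matcher.add("MORPH_SUBSET", [pattern])
matches = matcher(doc)
```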
## Matcher.\_\_init\_\_ {#init tag="method"}

website/docs/usage/rule-based-matching.md

@@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute | Description |
-| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH` | The exact verbatim text of a token. ~~str~~ |
-| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
-| `LOWER` | The lowercase form of the token text. ~~str~~ |
-| `LENGTH` | The length of the token text. ~~int~~ |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
-| `ENT_TYPE` | The token's entity label. ~~str~~ |
-| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
-| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
+| Attribute | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH` | The exact verbatim text of a token. ~~str~~ |
+| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
+| `LOWER` | The lowercase form of the token text. ~~str~~ |
+| `LENGTH` | The length of the token text. ~~int~~ |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
+| `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
+| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |

 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | ------------------------------------------------------------------------------------------------------- |
 | `IN` | Attribute value is member of a list. ~~Any~~ |
 | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
+| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
+| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
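
As a sketch of `IS_SUPERSET` on `MORPH` (using the `morph_` setter seen in the tests above; exact setter APIs may differ across spaCy versions):

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

vocab = spacy.blank("en").vocab
matcher = Matcher(vocab)
# The token must carry at least Number=Sing and Person=3 (extra features are fine)
matcher.add("SUP", [[{"MORPH": {"IS_SUPERSET": ["Number=Sing", "Person=3"]}}]])

doc = Doc(vocab, words=["a", "b", "c"])
doc[1].morph_ = "Number=Sing|Person=3|Tense=Pres"
assert len(matcher(doc)) == 1  # only doc[1] matches
```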
#### Regular expressions {#regex new="2.1"}