mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Add MORPH handling to Matcher (#6107)
* Add MORPH handling to Matcher * Add `MORPH` to `Matcher` schema * Rename `_SetMemberPredicate` to `_SetPredicate` * Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate` * Add special handling for normalization and conversion of morph values into sets * For other attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only matches for 0 or 1 values * Update test * Rename to IS_SUBSET and IS_SUPERSET
This commit is contained in:
parent
59340606b7
commit
3c062b3911
|
@ -17,6 +17,7 @@ from ..vocab cimport Vocab
|
||||||
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
|
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
|
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
|
||||||
|
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
|
@ -124,7 +125,7 @@ cdef class Matcher:
|
||||||
key = self._normalize_key(key)
|
key = self._normalize_key(key)
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
specs = _preprocess_pattern(pattern, self.vocab.strings,
|
specs = _preprocess_pattern(pattern, self.vocab,
|
||||||
self._extensions, self._extra_predicates)
|
self._extensions, self._extra_predicates)
|
||||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||||
for spec in specs:
|
for spec in specs:
|
||||||
|
@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
|
||||||
return id_attr.value
|
return id_attr.value
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
|
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
|
||||||
"""This function interprets the pattern, converting the various bits of
|
"""This function interprets the pattern, converting the various bits of
|
||||||
syntactic sugar before we compile it into a struct with init_pattern.
|
syntactic sugar before we compile it into a struct with init_pattern.
|
||||||
|
|
||||||
|
@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
|
||||||
extra_predicates.
|
extra_predicates.
|
||||||
"""
|
"""
|
||||||
tokens = []
|
tokens = []
|
||||||
|
string_store = vocab.strings
|
||||||
for spec in token_specs:
|
for spec in token_specs:
|
||||||
if not spec:
|
if not spec:
|
||||||
# Signifier for 'any token'
|
# Signifier for 'any token'
|
||||||
|
@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
|
||||||
ops = _get_operators(spec)
|
ops = _get_operators(spec)
|
||||||
attr_values = _get_attr_values(spec, string_store)
|
attr_values = _get_attr_values(spec, string_store)
|
||||||
extensions = _get_extensions(spec, string_store, extensions_table)
|
extensions = _get_extensions(spec, string_store, extensions_table)
|
||||||
predicates = _get_extra_predicates(spec, extra_predicates)
|
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
|
||||||
for op in ops:
|
for op in ops:
|
||||||
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
|
tokens.append((op, list(attr_values), list(extensions), list(predicates)))
|
||||||
return tokens
|
return tokens
|
||||||
|
@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store):
|
||||||
class _RegexPredicate:
|
class _RegexPredicate:
|
||||||
operators = ("REGEX",)
|
operators = ("REGEX",)
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = re.compile(value)
|
self.value = re.compile(value)
|
||||||
|
@ -750,13 +752,18 @@ class _RegexPredicate:
|
||||||
return bool(self.value.search(value))
|
return bool(self.value.search(value))
|
||||||
|
|
||||||
|
|
||||||
class _SetMemberPredicate:
|
class _SetPredicate:
|
||||||
operators = ("IN", "NOT_IN")
|
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = set(get_string_id(v) for v in value)
|
self.vocab = vocab
|
||||||
|
if self.attr == MORPH:
|
||||||
|
# normalize morph strings
|
||||||
|
self.value = set(self.vocab.morphology.add(v) for v in value)
|
||||||
|
else:
|
||||||
|
self.value = set(get_string_id(v) for v in value)
|
||||||
self.predicate = predicate
|
self.predicate = predicate
|
||||||
self.is_extension = is_extension
|
self.is_extension = is_extension
|
||||||
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||||
|
@ -768,19 +775,32 @@ class _SetMemberPredicate:
|
||||||
value = get_string_id(token._.get(self.attr))
|
value = get_string_id(token._.get(self.attr))
|
||||||
else:
|
else:
|
||||||
value = get_token_attr_for_matcher(token.c, self.attr)
|
value = get_token_attr_for_matcher(token.c, self.attr)
|
||||||
|
|
||||||
|
if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
|
||||||
|
if self.attr == MORPH:
|
||||||
|
# break up MORPH into individual Feat=Val values
|
||||||
|
value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
|
||||||
|
else:
|
||||||
|
# IS_SUBSET for other attrs will be equivalent to "IN"
|
||||||
|
# IS_SUPERSET will only match for other attrs with 0 or 1 values
|
||||||
|
value = set([value])
|
||||||
if self.predicate == "IN":
|
if self.predicate == "IN":
|
||||||
return value in self.value
|
return value in self.value
|
||||||
else:
|
elif self.predicate == "NOT_IN":
|
||||||
return value not in self.value
|
return value not in self.value
|
||||||
|
elif self.predicate == "IS_SUBSET":
|
||||||
|
return value <= self.value
|
||||||
|
elif self.predicate == "IS_SUPERSET":
|
||||||
|
return value >= self.value
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
|
return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))
|
||||||
|
|
||||||
|
|
||||||
class _ComparisonPredicate:
|
class _ComparisonPredicate:
|
||||||
operators = ("==", "!=", ">=", "<=", ">", "<")
|
operators = ("==", "!=", ">=", "<=", ">", "<")
|
||||||
|
|
||||||
def __init__(self, i, attr, value, predicate, is_extension=False):
|
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
|
||||||
self.i = i
|
self.i = i
|
||||||
self.attr = attr
|
self.attr = attr
|
||||||
self.value = value
|
self.value = value
|
||||||
|
@ -809,11 +829,13 @@ class _ComparisonPredicate:
|
||||||
return value < self.value
|
return value < self.value
|
||||||
|
|
||||||
|
|
||||||
def _get_extra_predicates(spec, extra_predicates):
|
def _get_extra_predicates(spec, extra_predicates, vocab):
|
||||||
predicate_types = {
|
predicate_types = {
|
||||||
"REGEX": _RegexPredicate,
|
"REGEX": _RegexPredicate,
|
||||||
"IN": _SetMemberPredicate,
|
"IN": _SetPredicate,
|
||||||
"NOT_IN": _SetMemberPredicate,
|
"NOT_IN": _SetPredicate,
|
||||||
|
"IS_SUBSET": _SetPredicate,
|
||||||
|
"IS_SUPERSET": _SetPredicate,
|
||||||
"==": _ComparisonPredicate,
|
"==": _ComparisonPredicate,
|
||||||
"!=": _ComparisonPredicate,
|
"!=": _ComparisonPredicate,
|
||||||
">=": _ComparisonPredicate,
|
">=": _ComparisonPredicate,
|
||||||
|
@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
|
||||||
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
|
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
|
||||||
for type_, cls in predicate_types.items():
|
for type_, cls in predicate_types.items():
|
||||||
if type_ in value_with_upper_keys:
|
if type_ in value_with_upper_keys:
|
||||||
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
|
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
|
||||||
# Don't create redundant predicates.
|
# Don't create redundant predicates.
|
||||||
# This helps with efficiency, as we're caching the results.
|
# This helps with efficiency, as we're caching the results.
|
||||||
if predicate.key in seen_predicates:
|
if predicate.key in seen_predicates:
|
||||||
|
|
|
@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
|
||||||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
IN: Optional[List[StrictStr]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
|
||||||
|
IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
|
||||||
|
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
extra = "forbid"
|
extra = "forbid"
|
||||||
|
@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
|
||||||
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
REGEX: Optional[StrictStr] = Field(None, alias="regex")
|
||||||
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
IN: Optional[List[StrictInt]] = Field(None, alias="in")
|
||||||
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
|
||||||
|
IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset")
|
||||||
|
IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset")
|
||||||
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
|
||||||
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
|
||||||
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
|
||||||
|
@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
|
||||||
lower: Optional[StringValue] = None
|
lower: Optional[StringValue] = None
|
||||||
pos: Optional[StringValue] = None
|
pos: Optional[StringValue] = None
|
||||||
tag: Optional[StringValue] = None
|
tag: Optional[StringValue] = None
|
||||||
|
morph: Optional[StringValue] = None
|
||||||
dep: Optional[StringValue] = None
|
dep: Optional[StringValue] = None
|
||||||
lemma: Optional[StringValue] = None
|
lemma: Optional[StringValue] = None
|
||||||
shape: Optional[StringValue] = None
|
shape: Optional[StringValue] = None
|
||||||
|
|
|
@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_subset_value_operator(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc)) == 3
|
||||||
|
doc[0].morph_ = "Feat=Val"
|
||||||
|
assert len(matcher(doc)) == 3
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
||||||
|
assert len(matcher(doc)) == 3
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
# IS_SUBSET acts like "IN" for attrs other than MORPH
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# IS_SUBSET with an empty list matches nothing
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"IS_SUBSET": []}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_superset_value_operator(en_vocab):
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2"
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# IS_SUPERSET with more than one value only matches for MORPH
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
# IS_SUPERSET with one value is the same as ==
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
# IS_SUPERSET with an empty value matches everything
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern = [{"TAG": {"IS_SUPERSET": []}}]
|
||||||
|
matcher.add("M", [pattern])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
doc[0].tag_ = "A"
|
||||||
|
assert len(matcher(doc)) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_morph_handling(en_vocab):
|
||||||
|
# order of features in pattern doesn't matter
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
|
||||||
|
pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
|
||||||
|
matcher.add("M", [pattern1])
|
||||||
|
matcher.add("N", [pattern2])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
# multiple values are split
|
||||||
|
matcher = Matcher(en_vocab)
|
||||||
|
pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
|
||||||
|
pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
|
||||||
|
matcher.add("M", [pattern1])
|
||||||
|
matcher.add("N", [pattern2])
|
||||||
|
doc = Doc(en_vocab, words=["a", "b", "c"])
|
||||||
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
|
||||||
|
assert len(matcher(doc)) == 1
|
||||||
|
doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
|
||||||
|
assert len(matcher(doc)) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_regex(en_vocab):
|
def test_matcher_regex(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
|
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
|
||||||
|
|
|
@ -30,20 +30,20 @@ pattern keys correspond to a number of
|
||||||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||||
rule-based matching are:
|
rule-based matching are:
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
||||||
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
||||||
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
||||||
| `LENGTH` | The length of the token text. ~~int~~ |
|
| `LENGTH` | The length of the token text. ~~int~~ |
|
||||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
||||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
||||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
||||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
||||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
|
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||||
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
|
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
|
||||||
|
|
||||||
Operators and quantifiers define **how often** a token pattern should be
|
Operators and quantifiers define **how often** a token pattern should be
|
||||||
matched:
|
matched:
|
||||||
|
@ -79,6 +79,8 @@ it compares to another value.
|
||||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||||
|
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
||||||
|
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
||||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||||
|
|
||||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
|
@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
|
||||||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||||
rule-based matching are:
|
rule-based matching are:
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
||||||
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
||||||
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
||||||
| `LENGTH` | The length of the token text. ~~int~~ |
|
| `LENGTH` | The length of the token text. ~~int~~ |
|
||||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
||||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
||||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
||||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
||||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
|
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||||
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
|
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
|
||||||
|
|
||||||
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
||||||
|
|
||||||
|
@ -236,6 +236,8 @@ following rich comparison attributes are available:
|
||||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||||
|
| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ |
|
||||||
|
| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ |
|
||||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||||
|
|
||||||
#### Regular expressions {#regex new="2.1"}
|
#### Regular expressions {#regex new="2.1"}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user