	Add MORPH handling to Matcher (#6107)
* Add MORPH handling to Matcher
* Add `MORPH` to `Matcher` schema
* Rename `_SetMemberPredicate` to `_SetPredicate`
* Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate`
  * Add special handling for normalization and conversion of morph
    values into sets
  * For other attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only
    matches for 0 or 1 values
* Update test
* Rename to IS_SUBSET and IS_SUPERSET
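
A minimal usage sketch of the new operators (it mirrors the tests added in this commit; the blank English pipeline and the direct `morph_` assignment are only there to keep the snippet self-contained and assume a spaCy build that includes this change):

# Sketch of the new MORPH set operators, based on the tests below.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
doc = nlp("a b c")
doc[0].morph_ = "Feat1=Val1|Feat2=Val2"

matcher = Matcher(nlp.vocab)
# IS_SUBSET: every morph feature on the token must be in the listed values
# (tokens with no morph features therefore match as well).
matcher.add("SUBSET", [[{"MORPH": {"IS_SUBSET": ["Feat1=Val1", "Feat2=Val2", "Feat3=Val3"]}}]])
# IS_SUPERSET: the token must carry at least all of the listed features.
matcher.add("SUPERSET", [[{"MORPH": {"IS_SUPERSET": ["Feat1=Val1"]}}]])

for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
# SUBSET matches all three tokens, SUPERSET only the first one.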
			
			
This commit is contained in:
  parent 59340606b7
  commit 3c062b3911
@@ -17,6 +17,7 @@ from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
+from ..tokens.morphanalysis cimport MorphAnalysis
 from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH

 from ..schemas import validate_token_pattern
@@ -124,7 +125,7 @@ cdef class Matcher:
         key = self._normalize_key(key)
         for pattern in patterns:
             try:
-                specs = _preprocess_pattern(pattern, self.vocab.strings,
+                specs = _preprocess_pattern(pattern, self.vocab,
                     self._extensions, self._extra_predicates)
                 self.patterns.push_back(init_pattern(self.mem, key, specs))
                 for spec in specs:
@@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     return id_attr.value


-def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates):
+def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.

@@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         extra_predicates.
     """
     tokens = []
+    string_store = vocab.strings
     for spec in token_specs:
         if not spec:
             # Signifier for 'any token'
@@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi
         ops = _get_operators(spec)
         attr_values = _get_attr_values(spec, string_store)
         extensions = _get_extensions(spec, string_store, extensions_table)
-        predicates = _get_extra_predicates(spec, extra_predicates)
+        predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
             tokens.append((op, list(attr_values), list(extensions), list(predicates)))
     return tokens
@@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store):
 class _RegexPredicate:
     operators = ("REGEX",)

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = re.compile(value)
@@ -750,13 +752,18 @@ class _RegexPredicate:
         return bool(self.value.search(value))


-class _SetMemberPredicate:
-    operators = ("IN", "NOT_IN")
+class _SetPredicate:
+    operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
-        self.value = set(get_string_id(v) for v in value)
+        self.vocab = vocab
+        if self.attr == MORPH:
+            # normalize morph strings
+            self.value = set(self.vocab.morphology.add(v) for v in value)
+        else:
+            self.value = set(get_string_id(v) for v in value)
         self.predicate = predicate
         self.is_extension = is_extension
         self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
@@ -768,19 +775,32 @@ class _SetMemberPredicate:
             value = get_string_id(token._.get(self.attr))
         else:
             value = get_token_attr_for_matcher(token.c, self.attr)
+
+        if self.predicate in ("IS_SUBSET", "IS_SUPERSET"):
+            if self.attr == MORPH:
+                # break up MORPH into individual Feat=Val values
+                value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value))
+            else:
+                # IS_SUBSET for other attrs will be equivalent to "IN"
+                # IS_SUPERSET will only match for other attrs with 0 or 1 values
+                value = set([value])
         if self.predicate == "IN":
             return value in self.value
-        else:
+        elif self.predicate == "NOT_IN":
             return value not in self.value
+        elif self.predicate == "IS_SUBSET":
+            return value <= self.value
+        elif self.predicate == "IS_SUPERSET":
+            return value >= self.value

     def __repr__(self):
-        return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate))
+        return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate))


 class _ComparisonPredicate:
     operators = ("==", "!=", ">=", "<=", ">", "<")

-    def __init__(self, i, attr, value, predicate, is_extension=False):
+    def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
         self.i = i
         self.attr = attr
         self.value = value
@@ -809,11 +829,13 @@ class _ComparisonPredicate:
             return value < self.value


-def _get_extra_predicates(spec, extra_predicates):
+def _get_extra_predicates(spec, extra_predicates, vocab):
     predicate_types = {
         "REGEX": _RegexPredicate,
-        "IN": _SetMemberPredicate,
-        "NOT_IN": _SetMemberPredicate,
+        "IN": _SetPredicate,
+        "NOT_IN": _SetPredicate,
+        "IS_SUBSET": _SetPredicate,
+        "IS_SUPERSET": _SetPredicate,
         "==": _ComparisonPredicate,
         "!=": _ComparisonPredicate,
         ">=": _ComparisonPredicate,
@@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates):
             value_with_upper_keys = {k.upper(): v for k, v in value.items()}
             for type_, cls in predicate_types.items():
                 if type_ in value_with_upper_keys:
-                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
                     # Don't create a redundant predicates.
                     # This helps with efficiency, as we're caching the results.
                     if predicate.key in seen_predicates:
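
Stripped of the string-store bookkeeping, the predicate above boils down to plain Python set comparisons over the token's Feat=Val strings. A rough standalone illustration, with plain strings standing in for the interned string IDs used in the real code:

# Sketch of the IS_SUBSET / IS_SUPERSET comparisons in _SetPredicate.
pattern_value = {"Feat1=Val1", "Feat2=Val2"}

token_features = {"Feat1=Val1"}
print(token_features <= pattern_value)  # IS_SUBSET   -> True
print(token_features >= pattern_value)  # IS_SUPERSET -> False

token_features = {"Feat1=Val1", "Feat2=Val2", "Feat3=Val3"}
print(token_features <= pattern_value)  # IS_SUBSET   -> False
print(token_features >= pattern_value)  # IS_SUPERSET -> True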
@@ -61,6 +61,8 @@ class TokenPatternString(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictStr]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in")
+    IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset")
+    IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")

     class Config:
         extra = "forbid"
@@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel):
     REGEX: Optional[StrictStr] = Field(None, alias="regex")
     IN: Optional[List[StrictInt]] = Field(None, alias="in")
     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in")
+    ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset")
+    ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset")
     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==")
     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=")
     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=")
@@ -115,6 +119,7 @@ class TokenPattern(BaseModel):
     lower: Optional[StringValue] = None
     pos: Optional[StringValue] = None
     tag: Optional[StringValue] = None
+    morph: Optional[StringValue] = None
     dep: Optional[StringValue] = None
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
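
With the schema extended, patterns using the new keys should pass validation in `validate_token_pattern`, which the `Matcher` already imports above. A hedged sketch, assuming the function returns a list of error messages that is empty for a valid pattern:

from spacy.schemas import validate_token_pattern

pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}},
           {"TAG": {"IS_SUPERSET": ["NN"]}}]
errors = validate_token_pattern(pattern)
print(errors)  # expected to be empty once IS_SUBSET / IS_SUPERSET are part of the schema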
@@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab):
     assert len(matches) == 1


+def test_matcher_subset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 3
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 2
+
+    # IS_SUBSET acts like "IN" for attrs other than MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUBSET with an empty list matches nothing
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUBSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+
+def test_matcher_superset_value_operator(en_vocab):
+    matcher = Matcher(en_vocab)
+    pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2"
+    assert len(matcher(doc)) == 0
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with more than one value only matches for MORPH
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 0
+
+    # IS_SUPERSET with one value is the same as ==
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 1
+
+    # IS_SUPERSET with an empty value matches everything
+    matcher = Matcher(en_vocab)
+    pattern = [{"TAG": {"IS_SUPERSET": []}}]
+    matcher.add("M", [pattern])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    doc[0].tag_ = "A"
+    assert len(matcher(doc)) == 3
+
+
+def test_matcher_morph_handling(en_vocab):
+    # order of features in pattern doesn't matter
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2|Feat1=Val1"
+    assert len(matcher(doc)) == 2
+    doc[0].morph_ = "Feat1=Val1|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+    # multiple values are split
+    matcher = Matcher(en_vocab)
+    pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}]
+    pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}]
+    matcher.add("M", [pattern1])
+    matcher.add("N", [pattern2])
+    doc = Doc(en_vocab, words=["a", "b", "c"])
+    assert len(matcher(doc)) == 0
+
+    doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1"
+    assert len(matcher(doc)) == 1
+    doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2"
+    assert len(matcher(doc)) == 2
+
+
 def test_matcher_regex(en_vocab):
     matcher = Matcher(en_vocab)
     pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
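
The last test above relies on multi-value features being split: a MORPH string such as `Feat1=Val1,Val3` contributes the two features `Feat1=Val1` and `Feat1=Val3` to the comparison set. In plain-set terms:

# How "Feat1=Val1,Val3|Feat2=Val2" is read for the superset comparison.
token_features = {"Feat1=Val1", "Feat1=Val3", "Feat2=Val2"}
print(token_features >= {"Feat1=Val1", "Feat2=Val2"})                # True -> pattern M matches
print(token_features >= {"Feat1=Val1", "Feat1=Val3", "Feat2=Val2"})  # True -> pattern N matches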
@@ -30,20 +30,20 @@ pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute                              | Description                                                                                                                |
-| -------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                                |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                                |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                              |
-| `LENGTH`                               | The length of the token text. ~~int~~                                                                                      |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`     | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                           |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`     | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                 |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`      | Token is punctuation, whitespace, stop word. ~~bool~~                                                                      |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`   | Token text resembles a number, URL, email. ~~bool~~                                                                        |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                                |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                          |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~  |
-| `OP`                                   | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                            |
+| Attribute                                       | Description                                                                                                                |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                                |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                                |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                              |
+| `LENGTH`                                        | The length of the token text. ~~int~~                                                                                      |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`              | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                           |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`              | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                 |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`               | Token is punctuation, whitespace, stop word. ~~bool~~                                                                      |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`            | Token text resembles a number, URL, email. ~~bool~~                                                                        |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE`  | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~        |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                          |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~  |
+| `OP`                                            | Operator or quantifier to determine how often to match a token pattern. ~~str~~                                            |

 Operators and quantifiers define **how often** a token pattern should be
 matched:
@@ -79,6 +79,8 @@ it compares to another value.
 | -------------------------- | -------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                               |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                         |
+| `ISSUBSET`                 | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                             |
+| `ISSUPERSET`               | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                           |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~    |

 ## Matcher.\_\_init\_\_ {#init tag="method"}
@@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of
 [`Token` attributes](/api/token#attributes). The supported attributes for
 rule-based matching are:

-| Attribute                              | Description                                                                                                                |
-| -------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `ORTH`                                 | The exact verbatim text of a token. ~~str~~                                                                                |
-| `TEXT` <Tag variant="new">2.1</Tag>    | The exact verbatim text of a token. ~~str~~                                                                                |
-| `LOWER`                                | The lowercase form of the token text. ~~str~~                                                                              |
-| `LENGTH`                               | The length of the token text. ~~int~~                                                                                      |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`     | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                           |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`     | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                 |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`      | Token is punctuation, whitespace, stop word. ~~bool~~                                                                      |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`   | Token text resembles a number, URL, email. ~~bool~~                                                                        |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~                                |
-| `ENT_TYPE`                             | The token's entity label. ~~str~~                                                                                          |
-| `_` <Tag variant="new">2.1</Tag>       | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~  |
-| `OP`                                   | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                            |
+| Attribute                                       | Description                                                                                                                |
+| ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `ORTH`                                          | The exact verbatim text of a token. ~~str~~                                                                                |
+| `TEXT` <Tag variant="new">2.1</Tag>             | The exact verbatim text of a token. ~~str~~                                                                                |
+| `LOWER`                                         | The lowercase form of the token text. ~~str~~                                                                              |
+| `LENGTH`                                        | The length of the token text. ~~int~~                                                                                      |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`              | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~                                           |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE`              | Token text is in lowercase, uppercase, titlecase. ~~bool~~                                                                 |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP`               | Token is punctuation, whitespace, stop word. ~~bool~~                                                                      |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`            | Token text resembles a number, URL, email. ~~bool~~                                                                        |
+| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE`  | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~        |
+| `ENT_TYPE`                                      | The token's entity label. ~~str~~                                                                                          |
+| `_` <Tag variant="new">2.1</Tag>                | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~  |
+| `OP`                                            | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~                            |

 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">

@@ -236,6 +236,8 @@ following rich comparison attributes are available:
 | -------------------------- | -------------------------------------------------------------------------------------------------------- |
 | `IN`                       | Attribute value is member of a list. ~~Any~~                                                               |
 | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                         |
+| `ISSUBSET`                 | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                             |
+| `ISSUPERSET`               | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                           |
 | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~    |

 #### Regular expressions {#regex new="2.1"}