mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Support list values and INTERSECTS in Matcher (#8784)
* Support list values and IS_INTERSECT in Matcher
* Support list values as token attributes for set operators, not just as pattern values.
* Add `IS_INTERSECT` operator.
* Fix incorrect `ISSUBSET` and `ISSUPERSET` in schema and docs.
* Rename IS_INTERSECT to INTERSECTS
This commit is contained in:
		
							parent
							
								
									d79dbd0624
								
							
						
					
					
						commit
						c1caa47aa7
					
				|  | @ -845,7 +845,7 @@ class _RegexPredicate: | |||
| 
 | ||||
| 
 | ||||
| class _SetPredicate: | ||||
|     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET") | ||||
|     operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS") | ||||
| 
 | ||||
|     def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): | ||||
|         self.i = i | ||||
|  | @ -868,14 +868,16 @@ class _SetPredicate: | |||
|         else: | ||||
|             value = get_token_attr_for_matcher(token.c, self.attr) | ||||
| 
 | ||||
|         if self.predicate in ("IS_SUBSET", "IS_SUPERSET"): | ||||
|         if self.predicate in ("IS_SUBSET", "IS_SUPERSET", "INTERSECTS"): | ||||
|             if self.attr == MORPH: | ||||
|                 # break up MORPH into individual Feat=Val values | ||||
|                 value = set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value)) | ||||
|             else: | ||||
|                 # IS_SUBSET for other attrs will be equivalent to "IN" | ||||
|                 # IS_SUPERSET will only match for other attrs with 0 or 1 values | ||||
|                 value = set([value]) | ||||
|                 # treat a single value as a list | ||||
|                 if isinstance(value, (str, int)): | ||||
|                     value = set([get_string_id(value)]) | ||||
|                 else: | ||||
|                     value = set(get_string_id(v) for v in value) | ||||
|         if self.predicate == "IN": | ||||
|             return value in self.value | ||||
|         elif self.predicate == "NOT_IN": | ||||
|  | @ -884,6 +886,8 @@ class _SetPredicate: | |||
|             return value <= self.value | ||||
|         elif self.predicate == "IS_SUPERSET": | ||||
|             return value >= self.value | ||||
|         elif self.predicate == "INTERSECTS": | ||||
|             return bool(value & self.value) | ||||
| 
 | ||||
|     def __repr__(self): | ||||
|         return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate)) | ||||
|  | @ -928,6 +932,7 @@ def _get_extra_predicates(spec, extra_predicates, vocab): | |||
|         "NOT_IN": _SetPredicate, | ||||
|         "IS_SUBSET": _SetPredicate, | ||||
|         "IS_SUPERSET": _SetPredicate, | ||||
|         "INTERSECTS": _SetPredicate, | ||||
|         "==": _ComparisonPredicate, | ||||
|         "!=": _ComparisonPredicate, | ||||
|         ">=": _ComparisonPredicate, | ||||
|  |  | |||
|  | @ -159,6 +159,7 @@ class TokenPatternString(BaseModel): | |||
|     NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") | ||||
|     IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") | ||||
|     IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") | ||||
|     INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") | ||||
| 
 | ||||
|     class Config: | ||||
|         extra = "forbid" | ||||
|  | @ -175,8 +176,9 @@ class TokenPatternNumber(BaseModel): | |||
|     REGEX: Optional[StrictStr] = Field(None, alias="regex") | ||||
|     IN: Optional[List[StrictInt]] = Field(None, alias="in") | ||||
|     NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") | ||||
|     ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset") | ||||
|     ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset") | ||||
|     IS_SUBSET: Optional[List[StrictInt]] = Field(None, alias="is_subset") | ||||
|     IS_SUPERSET: Optional[List[StrictInt]] = Field(None, alias="is_superset") | ||||
|     INTERSECTS: Optional[List[StrictInt]] = Field(None, alias="intersects") | ||||
|     EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") | ||||
|     NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") | ||||
|     GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") | ||||
|  |  | |||
|  | @ -270,6 +270,16 @@ def test_matcher_subset_value_operator(en_vocab): | |||
|     doc[0].tag_ = "A" | ||||
|     assert len(matcher(doc)) == 0 | ||||
| 
 | ||||
|     # IS_SUBSET with a list value | ||||
|     Token.set_extension("ext", default=[]) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"_": {"ext": {"IS_SUBSET": ["A", "B"]}}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0]._.ext = ["A"] | ||||
|     doc[1]._.ext = ["C", "D"] | ||||
|     assert len(matcher(doc)) == 2 | ||||
| 
 | ||||
| 
 | ||||
| def test_matcher_superset_value_operator(en_vocab): | ||||
|     matcher = Matcher(en_vocab) | ||||
|  | @ -308,6 +318,72 @@ def test_matcher_superset_value_operator(en_vocab): | |||
|     doc[0].tag_ = "A" | ||||
|     assert len(matcher(doc)) == 3 | ||||
| 
 | ||||
|     # IS_SUPERSET with a list value | ||||
|     Token.set_extension("ext", default=[]) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"_": {"ext": {"IS_SUPERSET": ["A"]}}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0]._.ext = ["A", "B"] | ||||
|     assert len(matcher(doc)) == 1 | ||||
| 
 | ||||
| 
 | ||||
| def test_matcher_intersect_value_operator(en_vocab): | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"MORPH": {"INTERSECTS": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     assert len(matcher(doc)) == 0 | ||||
|     doc[0].set_morph("Feat=Val") | ||||
|     assert len(matcher(doc)) == 1 | ||||
|     doc[0].set_morph("Feat=Val|Feat2=Val2") | ||||
|     assert len(matcher(doc)) == 1 | ||||
|     doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3") | ||||
|     assert len(matcher(doc)) == 1 | ||||
|     doc[0].set_morph("Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4") | ||||
|     assert len(matcher(doc)) == 1 | ||||
| 
 | ||||
|     # INTERSECTS with a single value is the same as IN | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"TAG": {"INTERSECTS": ["A", "B"]}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0].tag_ = "A" | ||||
|     assert len(matcher(doc)) == 1 | ||||
| 
 | ||||
|     # INTERSECTS with an empty pattern list matches nothing | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"TAG": {"INTERSECTS": []}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0].tag_ = "A" | ||||
|     assert len(matcher(doc)) == 0 | ||||
| 
 | ||||
|     # INTERSECTS with a list value | ||||
|     Token.set_extension("ext", default=[]) | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"_": {"ext": {"INTERSECTS": ["A", "C"]}}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0]._.ext = ["A", "B"] | ||||
|     assert len(matcher(doc)) == 1 | ||||
| 
 | ||||
|     # INTERSECTS with an empty pattern list matches nothing | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"_": {"ext": {"INTERSECTS": []}}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0]._.ext = ["A", "B"] | ||||
|     assert len(matcher(doc)) == 0 | ||||
| 
 | ||||
|     # INTERSECTS with an empty value matches nothing | ||||
|     matcher = Matcher(en_vocab) | ||||
|     pattern = [{"_": {"ext": {"INTERSECTS": ["A", "B"]}}}] | ||||
|     matcher.add("M", [pattern]) | ||||
|     doc = Doc(en_vocab, words=["a", "b", "c"]) | ||||
|     doc[0]._.ext = [] | ||||
|     assert len(matcher(doc)) == 0 | ||||
| 
 | ||||
| 
 | ||||
| def test_matcher_morph_handling(en_vocab): | ||||
|     # order of features in pattern doesn't matter | ||||
|  |  | |||
|  | @ -77,13 +77,14 @@ it compares to another value. | |||
| > ] | ||||
| > ``` | ||||
| 
 | ||||
| | Attribute                  | Description                                                                                             | | ||||
| | -------------------------- | ------------------------------------------------------------------------------------------------------- | | ||||
| | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            | | ||||
| | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      | | ||||
| | `ISSUBSET`                 | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          | | ||||
| | `ISSUPERSET`               | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        | | ||||
| | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | | ||||
| | Attribute                  | Description                                                                                               | | ||||
| | -------------------------- | --------------------------------------------------------------------------------------------------------- | | ||||
| | `IN`                       | Attribute value is member of a list. ~~Any~~                                                              | | ||||
| | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                        | | ||||
| | `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                    | | ||||
| | `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                  | | ||||
| | `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ | | ||||
| | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~   | | ||||
| 
 | ||||
| ## Matcher.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -240,13 +240,14 @@ following rich comparison attributes are available: | |||
| > # "Number=Sing|Gender=Neut|Polite=Infm" will not match because it's a superset | ||||
| > ``` | ||||
| 
 | ||||
| | Attribute                  | Description                                                                                             | | ||||
| | -------------------------- | ------------------------------------------------------------------------------------------------------- | | ||||
| | `IN`                       | Attribute value is member of a list. ~~Any~~                                                            | | ||||
| | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                      | | ||||
| | `IS_SUBSET`                | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~                                          | | ||||
| | `IS_SUPERSET`              | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~                                        | | ||||
| | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | | ||||
| | Attribute                  | Description                                                                                               | | ||||
| | -------------------------- | --------------------------------------------------------------------------------------------------------- | | ||||
| | `IN`                       | Attribute value is member of a list. ~~Any~~                                                              | | ||||
| | `NOT_IN`                   | Attribute value is _not_ member of a list. ~~Any~~                                                        | | ||||
| | `IS_SUBSET`                | Attribute value (for `MORPH` or custom list attributes) is a subset of a list. ~~Any~~                    | | ||||
| | `IS_SUPERSET`              | Attribute value (for `MORPH` or custom list attributes) is a superset of a list. ~~Any~~                  | | ||||
| | `INTERSECTS`               | Attribute value (for `MORPH` or custom list attributes) has a non-empty intersection with a list. ~~Any~~ | | ||||
| | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~   | | ||||
| 
 | ||||
| #### Regular expressions {#regex new="2.1"} | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user