From a4c43e5c577d7a143ef7e2fd74ccea33aace96b7 Mon Sep 17 00:00:00 2001 From: Natalia Rodnova <4512370+nrodnova@users.noreply.github.com> Date: Wed, 24 Nov 2021 02:37:10 -0700 Subject: [PATCH] Allow Matcher to match on ENT_ID and ENT_KB_ID (#9688) * Added ENT_ID and ENT_KB_ID into the list of the attributes that Matcher matches on * Added ENT_ID and ENT_KB_ID to TEST_PATTERNS in test_pattern_validation.py. Disabled tests that I added before * Update website/docs/api/matcher.md * Format * Remove skipped tests Co-authored-by: Adriane Boyd --- spacy/schemas.py | 2 ++ spacy/tests/matcher/test_pattern_validation.py | 4 ++++ website/docs/api/matcher.md | 2 ++ 3 files changed, 8 insertions(+) diff --git a/spacy/schemas.py b/spacy/schemas.py index b3ea11d8b..cf58688ef 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -222,6 +222,8 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_id: Optional[StringValue] = None + ent_kb_id: Optional[StringValue] = None norm: Optional[StringValue] = None length: Optional[NumberValue] = None spacy: Optional[StrictBool] = None diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 4d21aea81..74feb7c5d 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -22,6 +22,8 @@ TEST_PATTERNS = [ ([{"TEXT": {"VALUE": "foo"}}], 2, 0), # prev: (1, 0) ([{"IS_DIGIT": -1}], 1, 0), ([{"ORTH": -1}], 1, 0), + ([{"ENT_ID": -1}], 1, 0), + ([{"ENT_KB_ID": -1}], 1, 0), # Good patterns ([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0), ([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0), @@ -33,6 +35,8 @@ TEST_PATTERNS = [ ([{"orth": "foo"}], 0, 0), # prev: xfail ([{"IS_SENT_START": True}], 0, 0), ([{"SENT_START": True}], 0, 0), + ([{"ENT_ID": "STRING"}], 0, 0), + ([{"ENT_KB_ID": "STRING"}], 0, 0), ] diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c34560dec..803105ba2 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,8 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |