diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 745d7cf43..6aa58f0e3 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token from ..tokens.morphanalysis cimport MorphAnalysis -from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH +from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB from ..schemas import validate_token_pattern from ..errors import Errors, MatchPatternError, Warnings @@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store): attr = "SENT_START" attr = IDS.get(attr) if isinstance(value, str): - value = string_store.add(value) + if attr == ENT_IOB and value in Token.iob_strings(): + value = Token.iob_strings().index(value) + else: + value = string_store.add(value) elif isinstance(value, bool): value = int(value) elif isinstance(value, int): diff --git a/spacy/schemas.py b/spacy/schemas.py index cf58688ef..1dfd8ee85 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -1,5 +1,6 @@ from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple from typing import Iterable, TypeVar, TYPE_CHECKING +from .compat import Literal from enum import Enum from pydantic import BaseModel, Field, ValidationError, validator, create_model from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool @@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat] UnderscoreValue = Union[ TokenPatternString, TokenPatternNumber, str, int, float, list, bool ] +IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3] class TokenPattern(BaseModel): @@ -222,6 +224,7 @@ class TokenPattern(BaseModel): lemma: Optional[StringValue] = None shape: Optional[StringValue] = None ent_type: Optional[StringValue] = None + ent_iob: Optional[IobValue] = None ent_id: Optional[StringValue] = None ent_kb_id: 
Optional[StringValue] = None norm: Optional[StringValue] = None diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c02d65cdf..a27baf130 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab): matcher = Matcher(en_vocab) matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]]) assert len(matcher(doc)) == 0 + + +def test_matcher_ent_iob_key(en_vocab): + """Test that patterns with ent_iob work correctly.""" + matcher = Matcher(en_vocab) + matcher.add("Rule", [[{"ENT_IOB": "I"}]]) + doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"]) + doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")] + doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"]) + doc2.ents = [Span(doc2, 4, 5, label="PERSON")] + matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)] + matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)] + assert len(matches1) == 1 + assert matches1[0] == "York" + assert len(matches2) == 0 + + matcher = Matcher(en_vocab) # Test iob pattern with operators + matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]]) + doc = Doc( + en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"] + ) + doc.ents = [Span(doc, 4, 7, label="PERSON")] + matches = [doc[start:end].text for _, start, end in matcher(doc)] + assert len(matches) == 3 + assert matches[0] == "Maria" + assert matches[1] == "Maria Esperanza" + assert matches[2] == "Esperanza" diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 74feb7c5d..8c265785c 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -12,6 +12,7 @@ TEST_PATTERNS = [ ([{"IS_PUNCT": True, "OP": "$"}], 1, 1), ([{"_": "foo"}], 1, 1), ('[{"TEXT": "foo"}, {"LOWER": 
"bar"}]', 1, 1), + ([{"ENT_IOB": "foo"}], 1, 1), ([1, 2, 3], 3, 1), # Bad patterns flagged outside of Matcher ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0) diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 803105ba2..3e7f9dc04 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -44,6 +44,7 @@ rule-based matching are: | `SPACY` | Token has a trailing space. ~~bool~~ | |  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ | | `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |