Add ENT_IOB key to Matcher (#9649)

* added new field

* added exception for IOB strings

* minor refinement to schema

* removed field

* fixed typo

* imported numerical val

* changed the code a bit

* cosmetics

* added test for matcher

* set ents of mock docs

* added invalid pattern

* minor update to documentation

* formatted matcher with black

* added pattern validation

* add IOB vals to schema

* changed into test

* mypy compat

* cleaned up leftovers

* added compat import

* changed type

* added compat import

* changed literal a bit

* went back to old

* made the type explicit

* Update spacy/schemas.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update spacy/schemas.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Update spacy/schemas.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Duygu Altinok 2022-01-20 13:18:39 +01:00 committed by GitHub
parent 32bd3856b3
commit 268ddf8a06
5 changed files with 37 additions and 2 deletions
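
For context, a minimal sketch of the usage this change enables (assuming a spaCy build that includes this commit; entities are set by hand so no pretrained pipeline is needed):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I visited New York and California")
# Mark "New York" and "California" as GPE entities by hand
doc.ents = [Span(doc, 2, 4, label="GPE"), Span(doc, 5, 6, label="GPE")]

matcher = Matcher(nlp.vocab)
# "I" matches tokens inside (but not at the start of) an entity span
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])
print([doc[start:end].text for _, start, end in matcher(doc)])  # ["York"]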


@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
from ..tokens.span cimport Span
from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
                attr = "SENT_START"
            attr = IDS.get(attr)
        if isinstance(value, str):
            if attr == ENT_IOB and value in Token.iob_strings():
                value = Token.iob_strings().index(value)
            else:
                value = string_store.add(value)
        elif isinstance(value, bool):
            value = int(value)
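
The branch added above converts IOB strings into the numeric values the matcher compares against. A rough illustration of that mapping, assuming Token.iob_strings() returns the IOB labels in the order used for the numeric ent_iob attribute:

from spacy.tokens import Token

print(Token.iob_strings())             # e.g. ('', 'I', 'O', 'B')
print(Token.iob_strings().index("I"))  # 1, the numeric value stored for "I"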


@@ -1,5 +1,6 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
    TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]


class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
    lemma: Optional[StringValue] = None
    shape: Optional[StringValue] = None
    ent_type: Optional[StringValue] = None
    ent_iob: Optional[IobValue] = None
    ent_id: Optional[StringValue] = None
    ent_kb_id: Optional[StringValue] = None
    norm: Optional[StringValue] = None
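
A hedged sketch of how the new ent_iob field behaves under spaCy's validate_token_pattern helper, which returns an empty list for a valid pattern; the exact error wording may differ:

from spacy.schemas import validate_token_pattern

print(validate_token_pattern([{"ENT_IOB": "I"}]))    # expected: []
print(validate_token_pattern([{"ENT_IOB": 3}]))      # expected: [] (numeric form)
print(validate_token_pattern([{"ENT_IOB": "foo"}]))  # expected: one validation error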


@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
    assert len(matcher(doc)) == 0


def test_matcher_ent_iob_key(en_vocab):
    """Test that patterns with ent_iob work correctly."""
    matcher = Matcher(en_vocab)
    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
    assert len(matches1) == 1
    assert matches1[0] == "York"
    assert len(matches2) == 0

    matcher = Matcher(en_vocab)  # Test iob pattern with operators
    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
    doc = Doc(
        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
    )
    doc.ents = [Span(doc, 4, 7, label="PERSON")]
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "Maria"
    assert matches[1] == "Maria Esperanza"
    assert matches[2] == "Esperanza"


@@ -12,6 +12,7 @@ TEST_PATTERNS = [
    ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
    ([{"_": "foo"}], 1, 1),
    ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
    ([{"ENT_IOB": "foo"}], 1, 1),
    ([1, 2, 3], 3, 1),
    # Bad patterns flagged outside of Matcher
    ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)


@@ -44,6 +44,7 @@ rule-based matching are:
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
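
As a hedged usage sketch for the new table row, ENT_IOB can be combined with ENT_TYPE to anchor a pattern on the first token of an entity (entities are set manually here; a pretrained NER component would normally provide them):

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I visited my friend Anna Maria Esperanza")
doc.ents = [Span(doc, 4, 7, label="PERSON")]

matcher = Matcher(nlp.vocab)
# First token of a PERSON entity: IOB tag "B" plus the entity label
matcher.add("PERSON_START", [[{"ENT_IOB": "B", "ENT_TYPE": "PERSON"}]])
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['Anna']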