mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add ENT_IOB key to Matcher (#9649)
* added new field * added exception for IOB strings * minor refinement to schema * removed field * fixed typo * imported numerical val * changed the code bit * cosmetics * added test for matcher * set ents of mock docs * added invalid pattern * minor update to documentation * blacked matcher * added pattern validation * add IOB vals to schema * changed into test * mypy compat * cleaned left over * added compat import * changed type * added compat import * changed literal a bit * went back to old * made explicit type * Update spacy/schemas.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update spacy/schemas.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Update spacy/schemas.py Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
32bd3856b3
commit
268ddf8a06
|
@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
from ..tokens.morphanalysis cimport MorphAnalysis
|
from ..tokens.morphanalysis cimport MorphAnalysis
|
||||||
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
|
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB
|
||||||
|
|
||||||
from ..schemas import validate_token_pattern
|
from ..schemas import validate_token_pattern
|
||||||
from ..errors import Errors, MatchPatternError, Warnings
|
from ..errors import Errors, MatchPatternError, Warnings
|
||||||
|
@ -798,7 +798,10 @@ def _get_attr_values(spec, string_store):
|
||||||
attr = "SENT_START"
|
attr = "SENT_START"
|
||||||
attr = IDS.get(attr)
|
attr = IDS.get(attr)
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
value = string_store.add(value)
|
if attr == ENT_IOB and value in Token.iob_strings():
|
||||||
|
value = Token.iob_strings().index(value)
|
||||||
|
else:
|
||||||
|
value = string_store.add(value)
|
||||||
elif isinstance(value, bool):
|
elif isinstance(value, bool):
|
||||||
value = int(value)
|
value = int(value)
|
||||||
elif isinstance(value, int):
|
elif isinstance(value, int):
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
|
||||||
from typing import Iterable, TypeVar, TYPE_CHECKING
|
from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||||
|
from .compat import Literal
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||||
|
@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||||
UnderscoreValue = Union[
|
UnderscoreValue = Union[
|
||||||
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
|
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
|
||||||
]
|
]
|
||||||
|
IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]
|
||||||
|
|
||||||
|
|
||||||
class TokenPattern(BaseModel):
|
class TokenPattern(BaseModel):
|
||||||
|
@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
|
||||||
lemma: Optional[StringValue] = None
|
lemma: Optional[StringValue] = None
|
||||||
shape: Optional[StringValue] = None
|
shape: Optional[StringValue] = None
|
||||||
ent_type: Optional[StringValue] = None
|
ent_type: Optional[StringValue] = None
|
||||||
|
ent_iob: Optional[IobValue] = None
|
||||||
ent_id: Optional[StringValue] = None
|
ent_id: Optional[StringValue] = None
|
||||||
ent_kb_id: Optional[StringValue] = None
|
ent_kb_id: Optional[StringValue] = None
|
||||||
norm: Optional[StringValue] = None
|
norm: Optional[StringValue] = None
|
||||||
|
|
|
@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
|
matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_ent_iob_key(en_vocab):
    """Check that the ENT_IOB token attribute can be used in Matcher patterns."""

    def matched_texts(rule_matcher, target_doc):
        # Collect the surface text of every span the matcher reports.
        return [target_doc[start:end].text for _, start, end in rule_matcher(target_doc)]

    # Plain pattern: a single token whose IOB tag is "I".
    matcher = Matcher(en_vocab)
    matcher.add("Rule", [[{"ENT_IOB": "I"}]])

    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]

    matches1 = matched_texts(matcher, doc1)
    matches2 = matched_texts(matcher, doc2)

    # Only the second token of the two-token entity is expected to match;
    # the single-token entities are expected to yield nothing.
    assert matches1 == ["York"]
    assert matches2 == []

    # Same attribute combined with an operator.
    matcher = Matcher(en_vocab)  # Test iob pattern with operators
    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
    doc = Doc(
        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
    )
    doc.ents = [Span(doc, 4, 7, label="PERSON")]

    matches = matched_texts(matcher, doc)
    assert matches == ["Maria", "Maria Esperanza", "Esperanza"]
|
||||||
|
|
|
@ -12,6 +12,7 @@ TEST_PATTERNS = [
|
||||||
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
|
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
|
||||||
([{"_": "foo"}], 1, 1),
|
([{"_": "foo"}], 1, 1),
|
||||||
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||||
|
([{"ENT_IOB": "foo"}], 1, 1),
|
||||||
([1, 2, 3], 3, 1),
|
([1, 2, 3], 3, 1),
|
||||||
# Bad patterns flagged outside of Matcher
|
# Bad patterns flagged outside of Matcher
|
||||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
||||||
|
|
|
@ -44,6 +44,7 @@ rule-based matching are:
|
||||||
| `SPACY` | Token has a trailing space. ~~bool~~ |
|
| `SPACY` | Token has a trailing space. ~~bool~~ |
|
||||||
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
|
| `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
|
| `ENT_IOB` | The IOB part of the token's entity tag (`"I"`, `"O"`, `"B"` or `""`). ~~str~~ |
|
||||||
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
||||||
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
|
| `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
|
||||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user