Add ENT_IOB key to Matcher (#9649)
* added new field
* added exception for IOB strings
* minor refinement to schema
* removed field
* fixed typo
* imported numerical val
* changed the code bit
* cosmetics
* added test for matcher
* set ents of mock docs
* added invalid pattern
* minor update to documentation
* ran black on matcher
* added pattern validation
* add IOB vals to schema
* changed into test
* mypy compat
* cleaned leftover
* added compat import
* changed type
* added compat import
* changed literal a bit
* went back to old
* made explicit type
* Update spacy/schemas.py (Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>)
* Update spacy/schemas.py (Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>)
* Update spacy/schemas.py (Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>)

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
  parent 32bd3856b3
  commit 268ddf8a06
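In short, token patterns can now match on the IOB part of a token's entity annotation. A minimal usage sketch, adapted from the test added in this commit (the rule name "INSIDE_ENT" is arbitrary; the expected output follows the test's assertions):

from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

vocab = Vocab()
matcher = Matcher(vocab)
# "I" = token is inside an entity, but not its first token
matcher.add("INSIDE_ENT", [[{"ENT_IOB": "I"}]])

doc = Doc(vocab, words=["I", "visited", "New", "York"])
doc.ents = [Span(doc, 2, 4, label="GPE")]
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['York']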
spacy/matcher/matcher.pyx:

@@ -18,7 +18,7 @@ from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..tokens.morphanalysis cimport MorphAnalysis
-from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH
+from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

 from ..schemas import validate_token_pattern
 from ..errors import Errors, MatchPatternError, Warnings
@@ -798,6 +798,9 @@ def _get_attr_values(spec, string_store):
                 attr = "SENT_START"
             attr = IDS.get(attr)
         if isinstance(value, str):
-            value = string_store.add(value)
+            if attr == ENT_IOB and value in Token.iob_strings():
+                value = Token.iob_strings().index(value)
+            else:
+                value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
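The new branch maps IOB strings onto the integer codes that token.ent_iob already stores, instead of interning them in the string store. A small sketch of that mapping, assuming Token.iob_strings() returns ("", "I", "O", "B") (the same order as the IobValue literal added to the schema below):

from spacy.tokens import Token

# index in iob_strings() == integer code stored on token.ent_iob
print(Token.iob_strings())              # ('', 'I', 'O', 'B')
print(Token.iob_strings().index("I"))   # 1 -> inside an entity
print(Token.iob_strings().index("B"))   # 3 -> beginning of an entity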
spacy/schemas.py:

@@ -1,5 +1,6 @@
 from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
 from typing import Iterable, TypeVar, TYPE_CHECKING
+from .compat import Literal
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
 from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
@@ -209,6 +210,7 @@ NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
     TokenPatternString, TokenPatternNumber, str, int, float, list, bool
 ]
+IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]


 class TokenPattern(BaseModel):
@@ -222,6 +224,7 @@ class TokenPattern(BaseModel):
     lemma: Optional[StringValue] = None
     shape: Optional[StringValue] = None
     ent_type: Optional[StringValue] = None
+    ent_iob: Optional[IobValue] = None
     ent_id: Optional[StringValue] = None
     ent_kb_id: Optional[StringValue] = None
     norm: Optional[StringValue] = None
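For intuition on what the new ent_iob field buys: a pydantic Literal field only accepts the enumerated values, so anything else fails pattern validation. A standalone sketch (a hypothetical MiniTokenPattern, not spaCy's actual model; assumes Python 3.8+ for typing.Literal):

from typing import Literal, Optional
from pydantic import BaseModel, ValidationError

IobValue = Literal["", "I", "O", "B", 0, 1, 2, 3]

class MiniTokenPattern(BaseModel):
    ent_iob: Optional[IobValue] = None

MiniTokenPattern(ent_iob="I")        # accepted
MiniTokenPattern(ent_iob=3)          # accepted (integer code for "B")
try:
    MiniTokenPattern(ent_iob="foo")  # not one of the allowed values -> rejected
except ValidationError as err:
    print("rejected:", err)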
Matcher tests:

@@ -642,3 +642,30 @@ def test_matcher_no_zero_length(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("TEST", [[{"TAG": "C", "OP": "?"}]])
     assert len(matcher(doc)) == 0
+
+
+def test_matcher_ent_iob_key(en_vocab):
+    """Test that patterns with ent_iob work correctly."""
+    matcher = Matcher(en_vocab)
+    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
+    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
+    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
+    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
+    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
+    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
+    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
+    assert len(matches1) == 1
+    assert matches1[0] == "York"
+    assert len(matches2) == 0
+
+    matcher = Matcher(en_vocab)  # Test iob pattern with operators
+    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
+    doc = Doc(
+        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
+    )
+    doc.ents = [Span(doc, 4, 7, label="PERSON")]
+    matches = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches) == 3
+    assert matches[0] == "Maria"
+    assert matches[1] == "Maria Esperanza"
+    assert matches[2] == "Esperanza"
Pattern validation tests:

@@ -12,6 +12,7 @@ TEST_PATTERNS = [
     ([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
     ([{"_": "foo"}], 1, 1),
     ('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
+    ([{"ENT_IOB": "foo"}], 1, 1),
     ([1, 2, 3], 3, 1),
     # Bad patterns flagged outside of Matcher
     ([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0),  # prev: (1, 0)
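Each TEST_PATTERNS entry pairs a pattern with the number of errors it is expected to produce, so the new row asserts that "foo" is not a legal ENT_IOB value. A quick way to reproduce that outside the test suite, assuming validate_token_pattern (the helper imported by the matcher above) returns a list of error strings:

from spacy.schemas import validate_token_pattern

errors = validate_token_pattern([{"ENT_IOB": "foo"}])  # "foo" is not "", "I", "O", "B" or 0-3
print(len(errors), errors)  # expect one validation error mentioning ent_iob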
Docs, token attribute table for rule-based matching:

@@ -44,6 +44,7 @@ rule-based matching are:
 | `SPACY` | Token has a trailing space. ~~bool~~ |
 | `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ |
 | `ENT_TYPE` | The token's entity label. ~~str~~ |
+| `ENT_IOB` | The IOB part of the token's entity tag. ~~str~~ |
 | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
 | `ENT_KB_ID` | The token's entity knowledge base ID (`ent_kb_id`). ~~str~~ |
 | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
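Per the table and the IobValue literal above, both the string tags and their integer codes should pass validation; a hedged sketch of two spellings for "first token of an entity":

# Pattern dicts only; add either one to a Matcher rule.
pattern_str = [{"ENT_IOB": "B"}]  # string IOB tag
pattern_int = [{"ENT_IOB": 3}]    # integer code, 3 == Token.iob_strings().index("B")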