mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Intify IOB (#9738)
* added iob to int
* added tests
* added iob strings
* added error
* blacked attrs
* Update spacy/tests/lang/test_attrs.py
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update spacy/attrs.pyx
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* added iob strings as global
* minor refinement with iob
* removed iob strings from token
* changed to uppercase
* cleaned and went back to master version
* imported iob from attrs
* Update and format errors
* Support and test both str and int ENT_IOB key

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
268ddf8a06
commit
47a2916801
|
@ -1,3 +1,6 @@
|
|||
from .errors import Errors
|
||||
|
||||
IOB_STRINGS = ("", "I", "O", "B")
|
||||
|
||||
IDS = {
|
||||
"": NULL_ATTR,
|
||||
|
@ -64,7 +67,6 @@ IDS = {
|
|||
"FLAG61": FLAG61,
|
||||
"FLAG62": FLAG62,
|
||||
"FLAG63": FLAG63,
|
||||
|
||||
"ID": ID,
|
||||
"ORTH": ORTH,
|
||||
"LOWER": LOWER,
|
||||
|
@ -72,7 +74,6 @@ IDS = {
|
|||
"SHAPE": SHAPE,
|
||||
"PREFIX": PREFIX,
|
||||
"SUFFIX": SUFFIX,
|
||||
|
||||
"LENGTH": LENGTH,
|
||||
"LEMMA": LEMMA,
|
||||
"POS": POS,
|
||||
|
@ -87,7 +88,7 @@ IDS = {
|
|||
"SPACY": SPACY,
|
||||
"LANG": LANG,
|
||||
"MORPH": MORPH,
|
||||
"IDX": IDX
|
||||
"IDX": IDX,
|
||||
}
|
||||
|
||||
|
||||
|
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
"""
|
||||
inty_attrs = {}
|
||||
if _do_deprecated:
|
||||
if 'F' in stringy_attrs:
|
||||
if "F" in stringy_attrs:
|
||||
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
||||
if 'L' in stringy_attrs:
|
||||
if "L" in stringy_attrs:
|
||||
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
||||
if 'pos' in stringy_attrs:
|
||||
if "pos" in stringy_attrs:
|
||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||
if 'morph' in stringy_attrs:
|
||||
morphs = stringy_attrs.pop('morph')
|
||||
if 'number' in stringy_attrs:
|
||||
stringy_attrs.pop('number')
|
||||
if 'tenspect' in stringy_attrs:
|
||||
stringy_attrs.pop('tenspect')
|
||||
if "morph" in stringy_attrs:
|
||||
morphs = stringy_attrs.pop("morph")
|
||||
if "number" in stringy_attrs:
|
||||
stringy_attrs.pop("number")
|
||||
if "tenspect" in stringy_attrs:
|
||||
stringy_attrs.pop("tenspect")
|
||||
morph_keys = [
|
||||
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
|
||||
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
|
||||
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
|
||||
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
|
||||
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
|
||||
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
||||
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
||||
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
||||
'Polarity', 'PrepCase', 'Animacy' # U20
|
||||
"PunctType",
|
||||
"PunctSide",
|
||||
"Other",
|
||||
"Degree",
|
||||
"AdvType",
|
||||
"Number",
|
||||
"VerbForm",
|
||||
"PronType",
|
||||
"Aspect",
|
||||
"Tense",
|
||||
"PartType",
|
||||
"Poss",
|
||||
"Hyph",
|
||||
"ConjType",
|
||||
"NumType",
|
||||
"Foreign",
|
||||
"VerbType",
|
||||
"NounType",
|
||||
"Gender",
|
||||
"Mood",
|
||||
"Negative",
|
||||
"Tense",
|
||||
"Voice",
|
||||
"Abbr",
|
||||
"Derivation",
|
||||
"Echo",
|
||||
"Foreign",
|
||||
"NameType",
|
||||
"NounType",
|
||||
"NumForm",
|
||||
"NumValue",
|
||||
"PartType",
|
||||
"Polite",
|
||||
"StyleVariant",
|
||||
"PronType",
|
||||
"AdjType",
|
||||
"Person",
|
||||
"Variant",
|
||||
"AdpType",
|
||||
"Reflex",
|
||||
"Negative",
|
||||
"Mood",
|
||||
"Aspect",
|
||||
"Case",
|
||||
"Polarity",
|
||||
"PrepCase",
|
||||
"Animacy", # U20
|
||||
]
|
||||
for key in morph_keys:
|
||||
if key in stringy_attrs:
|
||||
|
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
for name, value in stringy_attrs.items():
|
||||
int_key = intify_attr(name)
|
||||
if int_key is not None:
|
||||
if int_key == ENT_IOB:
|
||||
if value in IOB_STRINGS:
|
||||
value = IOB_STRINGS.index(value)
|
||||
elif isinstance(value, str):
|
||||
raise ValueError(Errors.E1025.format(value=value))
|
||||
if strings_map is not None and isinstance(value, str):
|
||||
if hasattr(strings_map, 'add'):
|
||||
if hasattr(strings_map, "add"):
|
||||
value = strings_map.add(value)
|
||||
else:
|
||||
value = strings_map[value]
|
||||
|
|
|
@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
|
||||
"Non-UD tags should use the `tag` property.")
|
||||
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
||||
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
|
||||
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
|
||||
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
|
||||
"exist.")
|
||||
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
|
||||
"patterns.")
|
||||
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||
"supported values are: 'I', 'O', 'B' and ''")
|
||||
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
OLD_MODEL_SHORTCUTS = {
|
||||
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import pytest
|
||||
from spacy.attrs import intify_attrs, ENT_IOB
|
||||
|
||||
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
|
||||
from spacy.lang.en.stop_words import STOP_WORDS
|
||||
|
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
|
|||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
||||
|
||||
|
||||
def test_attrs_ent_iob_intify():
    """Check that intify_attrs converts ENT_IOB strings to integer codes.

    The attribute is addressed both by its string name ("ENT_IOB") and by
    the ENT_IOB integer constant. For either key form, an unsupported IOB
    string is expected to raise ValueError.
    """
    expected_codes = {"": 0, "I": 1, "O": 2, "B": 3}
    key_forms = ("ENT_IOB", ENT_IOB)

    # Valid values: every IOB string under the string key, then under the
    # integer key; the result is always keyed by the ENT_IOB constant.
    for attr_key in key_forms:
        for iob_string, iob_code in expected_codes.items():
            assert intify_attrs({attr_key: iob_string}) == {ENT_IOB: iob_code}

    # Invalid value: rejected regardless of which key form is used.
    for attr_key in key_forms:
        with pytest.raises(ValueError):
            intify_attrs({attr_key: "XX"})
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
|
||||
def test_lex_attrs_is_punct(text, match):
|
||||
assert is_punct(text) == match
|
||||
|
|
|
@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
|
|||
|
||||
from .. import parts_of_speech
|
||||
from ..errors import Errors, Warnings
|
||||
from ..attrs import IOB_STRINGS
|
||||
from .underscore import Underscore, get_ext_args
|
||||
|
||||
|
||||
|
@ -745,7 +746,7 @@ cdef class Token:
|
|||
|
||||
@classmethod
|
||||
def iob_strings(cls):
|
||||
return ("", "I", "O", "B")
|
||||
return IOB_STRINGS
|
||||
|
||||
@property
|
||||
def ent_iob_(self):
|
||||
|
|
Loading…
Reference in New Issue
Block a user