mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Intify IOB (#9738)
* added iob to int
* added tests
* added iob strings
* added error
* blacked attrs
* Update spacy/tests/lang/test_attrs.py
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update spacy/attrs.pyx
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* added iob strings as global
* minor refinement with iob
* removed iob strings from token
* changed to uppercase
* cleaned and went back to master version
* imported iob from attrs
* Update and format errors
* Support and test both str and int ENT_IOB key

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
268ddf8a06
commit
47a2916801
|
@@ -1,3 +1,6 @@
|
||||||
|
from .errors import Errors
|
||||||
|
|
||||||
|
IOB_STRINGS = ("", "I", "O", "B")
|
||||||
|
|
||||||
IDS = {
|
IDS = {
|
||||||
"": NULL_ATTR,
|
"": NULL_ATTR,
|
||||||
|
@@ -64,7 +67,6 @@ IDS = {
|
||||||
"FLAG61": FLAG61,
|
"FLAG61": FLAG61,
|
||||||
"FLAG62": FLAG62,
|
"FLAG62": FLAG62,
|
||||||
"FLAG63": FLAG63,
|
"FLAG63": FLAG63,
|
||||||
|
|
||||||
"ID": ID,
|
"ID": ID,
|
||||||
"ORTH": ORTH,
|
"ORTH": ORTH,
|
||||||
"LOWER": LOWER,
|
"LOWER": LOWER,
|
||||||
|
@@ -72,7 +74,6 @@ IDS = {
|
||||||
"SHAPE": SHAPE,
|
"SHAPE": SHAPE,
|
||||||
"PREFIX": PREFIX,
|
"PREFIX": PREFIX,
|
||||||
"SUFFIX": SUFFIX,
|
"SUFFIX": SUFFIX,
|
||||||
|
|
||||||
"LENGTH": LENGTH,
|
"LENGTH": LENGTH,
|
||||||
"LEMMA": LEMMA,
|
"LEMMA": LEMMA,
|
||||||
"POS": POS,
|
"POS": POS,
|
||||||
|
@@ -87,7 +88,7 @@ IDS = {
|
||||||
"SPACY": SPACY,
|
"SPACY": SPACY,
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
"MORPH": MORPH,
|
"MORPH": MORPH,
|
||||||
"IDX": IDX
|
"IDX": IDX,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
"""
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
if _do_deprecated:
|
||||||
if 'F' in stringy_attrs:
|
if "F" in stringy_attrs:
|
||||||
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
||||||
if 'L' in stringy_attrs:
|
if "L" in stringy_attrs:
|
||||||
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
||||||
if 'pos' in stringy_attrs:
|
if "pos" in stringy_attrs:
|
||||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||||
if 'morph' in stringy_attrs:
|
if "morph" in stringy_attrs:
|
||||||
morphs = stringy_attrs.pop('morph')
|
morphs = stringy_attrs.pop("morph")
|
||||||
if 'number' in stringy_attrs:
|
if "number" in stringy_attrs:
|
||||||
stringy_attrs.pop('number')
|
stringy_attrs.pop("number")
|
||||||
if 'tenspect' in stringy_attrs:
|
if "tenspect" in stringy_attrs:
|
||||||
stringy_attrs.pop('tenspect')
|
stringy_attrs.pop("tenspect")
|
||||||
morph_keys = [
|
morph_keys = [
|
||||||
'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
|
"PunctType",
|
||||||
'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
|
"PunctSide",
|
||||||
'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
|
"Other",
|
||||||
'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
|
"Degree",
|
||||||
'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
|
"AdvType",
|
||||||
'NumValue', 'PartType', 'Polite', 'StyleVariant',
|
"Number",
|
||||||
'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
|
"VerbForm",
|
||||||
'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
|
"PronType",
|
||||||
'Polarity', 'PrepCase', 'Animacy' # U20
|
"Aspect",
|
||||||
|
"Tense",
|
||||||
|
"PartType",
|
||||||
|
"Poss",
|
||||||
|
"Hyph",
|
||||||
|
"ConjType",
|
||||||
|
"NumType",
|
||||||
|
"Foreign",
|
||||||
|
"VerbType",
|
||||||
|
"NounType",
|
||||||
|
"Gender",
|
||||||
|
"Mood",
|
||||||
|
"Negative",
|
||||||
|
"Tense",
|
||||||
|
"Voice",
|
||||||
|
"Abbr",
|
||||||
|
"Derivation",
|
||||||
|
"Echo",
|
||||||
|
"Foreign",
|
||||||
|
"NameType",
|
||||||
|
"NounType",
|
||||||
|
"NumForm",
|
||||||
|
"NumValue",
|
||||||
|
"PartType",
|
||||||
|
"Polite",
|
||||||
|
"StyleVariant",
|
||||||
|
"PronType",
|
||||||
|
"AdjType",
|
||||||
|
"Person",
|
||||||
|
"Variant",
|
||||||
|
"AdpType",
|
||||||
|
"Reflex",
|
||||||
|
"Negative",
|
||||||
|
"Mood",
|
||||||
|
"Aspect",
|
||||||
|
"Case",
|
||||||
|
"Polarity",
|
||||||
|
"PrepCase",
|
||||||
|
"Animacy", # U20
|
||||||
]
|
]
|
||||||
for key in morph_keys:
|
for key in morph_keys:
|
||||||
if key in stringy_attrs:
|
if key in stringy_attrs:
|
||||||
|
@@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
int_key = intify_attr(name)
|
int_key = intify_attr(name)
|
||||||
if int_key is not None:
|
if int_key is not None:
|
||||||
|
if int_key == ENT_IOB:
|
||||||
|
if value in IOB_STRINGS:
|
||||||
|
value = IOB_STRINGS.index(value)
|
||||||
|
elif isinstance(value, str):
|
||||||
|
raise ValueError(Errors.E1025.format(value=value))
|
||||||
if strings_map is not None and isinstance(value, str):
|
if strings_map is not None and isinstance(value, str):
|
||||||
if hasattr(strings_map, 'add'):
|
if hasattr(strings_map, "add"):
|
||||||
value = strings_map.add(value)
|
value = strings_map.add(value)
|
||||||
else:
|
else:
|
||||||
value = strings_map[value]
|
value = strings_map[value]
|
||||||
|
|
|
@@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
|
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
|
||||||
"Non-UD tags should use the `tag` property.")
|
"Non-UD tags should use the `tag` property.")
|
||||||
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
|
||||||
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
|
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
|
||||||
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
|
"exist.")
|
||||||
|
E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
|
||||||
|
"patterns.")
|
||||||
|
E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
|
||||||
|
"supported values are: 'I', 'O', 'B' and ''")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
OLD_MODEL_SHORTCUTS = {
|
OLD_MODEL_SHORTCUTS = {
|
||||||
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
|
"en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
|
||||||
|
|
|
@@ -1,4 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
|
from spacy.attrs import intify_attrs, ENT_IOB
|
||||||
|
|
||||||
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
|
from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
|
||||||
from spacy.lang.en.stop_words import STOP_WORDS
|
from spacy.lang.en.stop_words import STOP_WORDS
|
||||||
|
@@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
|
||||||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_attrs_ent_iob_intify():
|
||||||
|
int_attrs = intify_attrs({"ENT_IOB": ""})
|
||||||
|
assert int_attrs == {ENT_IOB: 0}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({"ENT_IOB": "I"})
|
||||||
|
assert int_attrs == {ENT_IOB: 1}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({"ENT_IOB": "O"})
|
||||||
|
assert int_attrs == {ENT_IOB: 2}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({"ENT_IOB": "B"})
|
||||||
|
assert int_attrs == {ENT_IOB: 3}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({ENT_IOB: ""})
|
||||||
|
assert int_attrs == {ENT_IOB: 0}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({ENT_IOB: "I"})
|
||||||
|
assert int_attrs == {ENT_IOB: 1}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({ENT_IOB: "O"})
|
||||||
|
assert int_attrs == {ENT_IOB: 2}
|
||||||
|
|
||||||
|
int_attrs = intify_attrs({ENT_IOB: "B"})
|
||||||
|
assert int_attrs == {ENT_IOB: 3}
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
int_attrs = intify_attrs({"ENT_IOB": "XX"})
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
int_attrs = intify_attrs({ENT_IOB: "XX"})
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
|
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
|
||||||
def test_lex_attrs_is_punct(text, match):
|
def test_lex_attrs_is_punct(text, match):
|
||||||
assert is_punct(text) == match
|
assert is_punct(text) == match
|
||||||
|
|
|
@@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
|
||||||
|
|
||||||
from .. import parts_of_speech
|
from .. import parts_of_speech
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
|
from ..attrs import IOB_STRINGS
|
||||||
from .underscore import Underscore, get_ext_args
|
from .underscore import Underscore, get_ext_args
|
||||||
|
|
||||||
|
|
||||||
|
@@ -745,7 +746,7 @@ cdef class Token:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iob_strings(cls):
|
def iob_strings(cls):
|
||||||
return ("", "I", "O", "B")
|
return IOB_STRINGS
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def ent_iob_(self):
|
def ent_iob_(self):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user