Intify IOB (#9738)

* added iob to int
* added tests
* added iob strings
* added error
* blacked attrs
* Update spacy/tests/lang/test_attrs.py
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Update spacy/attrs.pyx
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* added iob strings as global
* minor refinement with iob
* removed iob strings from token
* changed to uppercase
* cleaned and went back to master version
* imported iob from attrs
* Update and format errors
* Support and test both str and int ENT_IOB key

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-12-09 11:14:21 +03:00 · 2022-01-20 13:19:38 +01:00 · 2022-01-20 13:19:38 +01:00 · 47a2916801
commit 47a2916801
parent 268ddf8a06
4 changed files with 107 additions and 26 deletions
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -1,3 +1,6 @@
 from .errors import Errors
 IOB_STRINGS = ("", "I", "O", "B")
 IDS = {
    "": NULL_ATTR,
@ -64,7 +67,6 @@ IDS = {
    "FLAG61": FLAG61,
    "FLAG62": FLAG62,
    "FLAG63": FLAG63,
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
@ -72,7 +74,6 @@ IDS = {
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "LEMMA": LEMMA,
    "POS": POS,
@ -87,7 +88,7 @@ IDS = {
    "SPACY": SPACY,
    "LANG": LANG,
    "MORPH": MORPH,
-    "IDX": IDX
+    "IDX": IDX,
 }
@ -109,28 +110,66 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    inty_attrs = {}
    if _do_deprecated:
-        if 'F' in stringy_attrs:
+        if "F" in stringy_attrs:
            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
-        if 'L' in stringy_attrs:
+        if "L" in stringy_attrs:
            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
-        if 'pos' in stringy_attrs:
+        if "pos" in stringy_attrs:
            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
-        if 'morph' in stringy_attrs:
+        if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop('morph')
+            morphs = stringy_attrs.pop("morph")
-        if 'number' in stringy_attrs:
+        if "number" in stringy_attrs:
-            stringy_attrs.pop('number')
+            stringy_attrs.pop("number")
-        if 'tenspect' in stringy_attrs:
+        if "tenspect" in stringy_attrs:
-            stringy_attrs.pop('tenspect')
+            stringy_attrs.pop("tenspect")
        morph_keys = [
-            'PunctType', 'PunctSide', 'Other', 'Degree', 'AdvType', 'Number',
+            "PunctType",
-            'VerbForm', 'PronType', 'Aspect', 'Tense', 'PartType', 'Poss',
+            "PunctSide",
-            'Hyph', 'ConjType', 'NumType', 'Foreign', 'VerbType', 'NounType',
+            "Other",
-            'Gender', 'Mood', 'Negative', 'Tense', 'Voice', 'Abbr',
+            "Degree",
-            'Derivation', 'Echo', 'Foreign', 'NameType', 'NounType', 'NumForm',
+            "AdvType",
-            'NumValue', 'PartType', 'Polite', 'StyleVariant',
+            "Number",
-            'PronType', 'AdjType', 'Person', 'Variant', 'AdpType',
+            "VerbForm",
-            'Reflex', 'Negative', 'Mood', 'Aspect', 'Case',
+            "PronType",
-            'Polarity', 'PrepCase', 'Animacy' # U20
+            "Aspect",
            "Tense",
            "PartType",
            "Poss",
            "Hyph",
            "ConjType",
            "NumType",
            "Foreign",
            "VerbType",
            "NounType",
            "Gender",
            "Mood",
            "Negative",
            "Tense",
            "Voice",
            "Abbr",
            "Derivation",
            "Echo",
            "Foreign",
            "NameType",
            "NounType",
            "NumForm",
            "NumValue",
            "PartType",
            "Polite",
            "StyleVariant",
            "PronType",
            "AdjType",
            "Person",
            "Variant",
            "AdpType",
            "Reflex",
            "Negative",
            "Mood",
            "Aspect",
            "Case",
            "Polarity",
            "PrepCase",
            "Animacy",  # U20
        ]
        for key in morph_keys:
            if key in stringy_attrs:
@ -142,8 +181,13 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is not None:
            if int_key == ENT_IOB:
                if value in IOB_STRINGS:
                    value = IOB_STRINGS.index(value)
                elif isinstance(value, str):
                    raise ValueError(Errors.E1025.format(value=value))
            if strings_map is not None and isinstance(value, str):
-                if hasattr(strings_map, 'add'):
+                if hasattr(strings_map, "add"):
                    value = strings_map.add(value)
                else:
                    value = strings_map[value]
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -888,11 +888,14 @@ class Errors(metaclass=ErrorsWithCodes):
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
-    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
+    E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't "
-    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler patterns.")
+             "exist.")
    E1024 = ("A pattern with ID \"{ent_id}\" is not present in EntityRuler "
             "patterns.")
    E1025 = ("Cannot intify the value '{value}' as an IOB string. The only "
             "supported values are: 'I', 'O', 'B' and ''")
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
    "en": "en_core_web_sm", "de": "de_core_news_sm", "es": "es_core_news_sm",
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@ -1,4 +1,5 @@
 import pytest
 from spacy.attrs import intify_attrs, ENT_IOB
 from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs
 from spacy.lang.en.stop_words import STOP_WORDS
@ -33,6 +34,38 @@ def test_attrs_do_deprecated(text):
    assert int_attrs == {ORTH: 10, IS_ALPHA: True}
 def test_attrs_ent_iob_intify():
    """Check that intify_attrs converts ENT_IOB string values to integers.

    The expected mapping asserted below is "" -> 0, "I" -> 1, "O" -> 2 and
    "B" -> 3. The conversion is exercised with the attribute given both as
    the string key "ENT_IOB" and as the ENT_IOB constant, and an
    unsupported string value ("XX") must raise ValueError in both cases.
    """
    # Attribute addressed by its string name.
    int_attrs = intify_attrs({"ENT_IOB": ""})
    assert int_attrs == {ENT_IOB: 0}
    int_attrs = intify_attrs({"ENT_IOB": "I"})
    assert int_attrs == {ENT_IOB: 1}
    int_attrs = intify_attrs({"ENT_IOB": "O"})
    assert int_attrs == {ENT_IOB: 2}
    int_attrs = intify_attrs({"ENT_IOB": "B"})
    assert int_attrs == {ENT_IOB: 3}
    # Attribute addressed by the ENT_IOB constant; results must match the
    # string-key cases above.
    int_attrs = intify_attrs({ENT_IOB: ""})
    assert int_attrs == {ENT_IOB: 0}
    int_attrs = intify_attrs({ENT_IOB: "I"})
    assert int_attrs == {ENT_IOB: 1}
    int_attrs = intify_attrs({ENT_IOB: "O"})
    assert int_attrs == {ENT_IOB: 2}
    int_attrs = intify_attrs({ENT_IOB: "B"})
    assert int_attrs == {ENT_IOB: 3}
    # A string that is not one of the supported IOB values is rejected,
    # regardless of how the key is spelled.
    with pytest.raises(ValueError):
        int_attrs = intify_attrs({"ENT_IOB": "XX"})
    with pytest.raises(ValueError):
        int_attrs = intify_attrs({ENT_IOB: "XX"})
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
 def test_lex_attrs_is_punct(text, match):
    assert is_punct(text) == match
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -20,6 +20,7 @@ from .doc cimport set_children_from_heads
 from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from ..attrs import IOB_STRINGS
 from .underscore import Underscore, get_ext_args
@ -745,7 +746,7 @@ cdef class Token:
    @classmethod
    def iob_strings(cls):
-        return ("", "I", "O", "B")
+        return IOB_STRINGS
    @property
    def ent_iob_(self):