spaCy/spacy/attrs.pyx
Adriane Boyd 4a615cacd2
Consolidate and freeze symbols (#11352)
* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
2022-09-02 09:08:40 +02:00

96 lines
2.6 KiB
Cython

from .errors import Errors
# String labels accepted for the ENT_IOB attribute. intify_attrs() replaces a
# label with its position in this tuple (e.g. "I" -> 1), so the order matters.
IOB_STRINGS = ("", "I", "O", "B")
# Maps each attribute's string name to its attribute ID.
# NOTE(review): the ID constants on the right-hand side are not defined in the
# visible part of this file - presumably they come from the accompanying
# declarations; confirm before adding, reordering or removing entries.
IDS = {
"": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,
"IS_ASCII": IS_ASCII,
"IS_DIGIT": IS_DIGIT,
"IS_LOWER": IS_LOWER,
"IS_PUNCT": IS_PUNCT,
"IS_SPACE": IS_SPACE,
"IS_TITLE": IS_TITLE,
"IS_UPPER": IS_UPPER,
"LIKE_URL": LIKE_URL,
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
"NORM": NORM,
"SHAPE": SHAPE,
"PREFIX": PREFIX,
"SUFFIX": SUFFIX,
"LENGTH": LENGTH,
"LEMMA": LEMMA,
"POS": POS,
"TAG": TAG,
"DEP": DEP,
"ENT_IOB": ENT_IOB,
"ENT_TYPE": ENT_TYPE,
"ENT_ID": ENT_ID,
"ENT_KB_ID": ENT_KB_ID,
"HEAD": HEAD,
"SENT_START": SENT_START,
"SPACY": SPACY,
"LANG": LANG,
"MORPH": MORPH,
"IDX": IDX,
}
# Reverse lookup table: attribute ID -> string name.
NAMES = {v: k for k, v in IDS.items()}
# Add every name in IDS to the namespace returned by locals() at this point.
# NOTE(review): presumably this is what makes each attribute ID available from
# Python as a module-level name - confirm.
locals().update(IDS)
def intify_attrs(stringy_attrs, strings_map=None):
    """
    Convert an attributes dictionary so that its keys are attribute IDs.

    stringy_attrs (dict): Attributes keyed by attribute name (or by ID).
        Values may be ints or strings.
    strings_map (StringStore): Defaults to None. If provided, string values
        are encoded into ints through it.
    RETURNS (dict): New dictionary keyed by attribute ID. Entries whose key
        cannot be resolved by intify_attr() are left out.
    RAISES (ValueError): If an ENT_IOB value is a string that is not one of
        IOB_STRINGS.
    """
    converted = {}
    for attr_name, attr_value in stringy_attrs.items():
        attr_id = intify_attr(attr_name)
        if attr_id is None:
            # Unresolvable attribute names are dropped from the result.
            continue
        if attr_id == ENT_IOB:
            if attr_value in IOB_STRINGS:
                # An IOB label is stored as its position in IOB_STRINGS.
                attr_value = IOB_STRINGS.index(attr_value)
            elif isinstance(attr_value, str):
                raise ValueError(Errors.E1025.format(value=attr_value))
        if strings_map is not None and isinstance(attr_value, str):
            # Use add() when the mapping offers it; otherwise fall back to a
            # plain item lookup.
            attr_value = (
                strings_map.add(attr_value)
                if hasattr(strings_map, "add")
                else strings_map[attr_value]
            )
        converted[attr_id] = attr_value
    return converted
def intify_attr(name):
    """
    Normalize an attribute name, converting it to int.

    name (str or int): Attribute string name, matched against IDS first as
        given and then upper-cased. Can also be int (will then be left
        unchanged).
    RETURNS (int): int representation of the attribute, or None if it couldn't
        be converted.
    """
    if isinstance(name, int):
        return name
    if not isinstance(name, str):
        # Neither an ID nor a name: honor the documented "None if it couldn't
        # be converted" contract instead of failing on the hash lookup or on
        # the .upper() call below.
        return None
    if name in IDS:
        return IDS[name]
    # Fall back to a case-insensitive match; upper-case the name only once.
    upper_name = name.upper()
    if upper_name in IDS:
        return IDS[upper_name]
    return None