mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
b0228d8ea6
* chore: add cython-linter dev dependency * fix: lexeme.pyx * fix: morphology.pxd * fix: tokenizer.pxd * fix: vocab.pxd * fix: morphology.pxd (line length) * ci: add cython-lint * ci: fix cython-lint call * Fix kb/candidate.pyx. * Fix kb/kb.pyx. * Fix kb/kb_in_memory.pyx. * Fix kb. * Fix training/ partially. * Fix training/. Ignore trailing whitespaces and too long lines. * Fix ml/. * Fix matcher/. * Fix pipeline/. * Fix tokens/. * Fix build errors. Fix vocab.pyx. * Fix cython-lint install and run. * Fix lexeme.pyx, parts_of_speech.pxd, vectors.pyx. Temporarily disable cython-lint execution. * Fix attrs.pyx, lexeme.pyx, symbols.pxd, isort issues. * Make cython-lint install conditional. Fix tokenizer.pyx. * Fix remaining files. Reenable cython-lint check. * Readded parentheses. * Fix test_build_dependencies(). * Add explanatory comment to cython-lint execution. --------- Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
96 lines
2.6 KiB
Cython
96 lines
2.6 KiB
Cython
from .errors import Errors
|
|
|
|
# String forms of the ENT_IOB attribute. The position of each string in this
# tuple matters: intify_attrs() replaces an ENT_IOB string value with its
# index in this tuple.
IOB_STRINGS = ("", "I", "O", "B")
|
|
|
|
# Lookup table from attribute name (string) to the attribute ID constant of
# the same name. The ID constants themselves are not defined in this part of
# the file. Entry order is kept as-is because NAMES is derived from it.
IDS = {
    # The empty name resolves to the null attribute.
    "": NULL_ATTR,

    # IS_* / LIKE_* attributes.
    "IS_ALPHA": IS_ALPHA,
    "IS_ASCII": IS_ASCII,
    "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER,
    "IS_PUNCT": IS_PUNCT,
    "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE,
    "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL,
    "LIKE_NUM": LIKE_NUM,
    "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_BRACKET": IS_BRACKET,
    "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,

    # Remaining attributes.
    "ID": ID,
    "ORTH": ORTH,
    "LOWER": LOWER,
    "NORM": NORM,
    "SHAPE": SHAPE,
    "PREFIX": PREFIX,
    "SUFFIX": SUFFIX,
    "LENGTH": LENGTH,
    "LEMMA": LEMMA,
    "POS": POS,
    "TAG": TAG,
    "DEP": DEP,
    "ENT_IOB": ENT_IOB,
    "ENT_TYPE": ENT_TYPE,
    "ENT_ID": ENT_ID,
    "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD,
    "SENT_START": SENT_START,
    "SPACY": SPACY,
    "LANG": LANG,
    "MORPH": MORPH,
    "IDX": IDX,
}
|
|
|
|
|
|
# Reverse lookup table: attribute ID -> attribute name.
NAMES = {attr_id: attr_name for attr_name, attr_id in IDS.items()}
# Also bind every name in IDS as a variable in the enclosing namespace.
locals().update(IDS)
|
|
|
|
|
|
def intify_attrs(stringy_attrs, strings_map=None):
    """
    Convert an attribute dictionary into one keyed by integer attribute IDs.

    stringy_attrs (dict): Attributes keyed by attribute name. Values can be
        ints or strings.
    strings_map (StringStore): Defaults to None. When provided, string values
        are encoded into ints through it.
    RETURNS (dict): Attributes keyed by int ID, with values optionally
        converted to ints. Entries whose key cannot be resolved are left out.
    """
    converted = {}
    for attr_name, attr_value in stringy_attrs.items():
        attr_id = intify_attr(attr_name)
        if attr_id is None:
            # Unresolvable attribute name: not included in the result.
            continue
        if attr_id == ENT_IOB:
            if attr_value in IOB_STRINGS:
                # IOB strings are replaced by their position in IOB_STRINGS.
                attr_value = IOB_STRINGS.index(attr_value)
            elif isinstance(attr_value, str):
                raise ValueError(Errors.E1025.format(value=attr_value))
        if strings_map is not None and isinstance(attr_value, str):
            # Use add() when the mapping offers it, otherwise a plain
            # item lookup.
            attr_value = (
                strings_map.add(attr_value)
                if hasattr(strings_map, "add")
                else strings_map[attr_value]
            )
        converted[attr_id] = attr_value
    return converted
|
|
|
|
|
|
def intify_attr(name):
    """
    Normalize an attribute name, converting it to int.

    name (str / int): Attribute string name. Can also be int (will then be
        left unchanged).
    RETURNS (int): int representation of the attribute, or None if it couldn't
        be converted.
    """
    if isinstance(name, int):
        return name
    if not isinstance(name, str):
        # Neither an ID nor a name (e.g. None): report "not convertible"
        # rather than failing on the .upper() call below.
        return None
    if name in IDS:
        return IDS[name]
    # Fall back to the upper-cased spelling; yields None when that is
    # unknown as well.
    return IDS.get(name.upper())
|