2023-09-12 09:49:41 +03:00
|
|
|
# cython: profile=False
|
2022-01-20 15:19:38 +03:00
|
|
|
from .errors import Errors
|
|
|
|
|
|
|
|
# String forms of the entity IOB tag. Order matters: a tag's position in this
# tuple is the integer it is converted to (intify_attrs uses
# IOB_STRINGS.index(value) for ENT_IOB values).
IOB_STRINGS = ("", "I", "O", "B")
|
2017-04-15 13:05:47 +03:00
|
|
|
|
2015-10-10 09:55:55 +03:00
|
|
|
# Lookup table from attribute name (str) to its integer attribute ID.
# NOTE(review): the right-hand names are not defined in this part of the file;
# presumably they come from the module's companion declarations - confirm.
IDS = {
    # The empty name maps to the null attribute.
    "": NULL_ATTR,
    # Named IS_* / LIKE_* flag attributes.
    "IS_ALPHA": IS_ALPHA, "IS_ASCII": IS_ASCII, "IS_DIGIT": IS_DIGIT,
    "IS_LOWER": IS_LOWER, "IS_PUNCT": IS_PUNCT, "IS_SPACE": IS_SPACE,
    "IS_TITLE": IS_TITLE, "IS_UPPER": IS_UPPER,
    "LIKE_URL": LIKE_URL, "LIKE_NUM": LIKE_NUM, "LIKE_EMAIL": LIKE_EMAIL,
    "IS_STOP": IS_STOP,
    "IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
    "IS_BRACKET": IS_BRACKET, "IS_QUOTE": IS_QUOTE,
    "IS_LEFT_PUNCT": IS_LEFT_PUNCT, "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
    "IS_CURRENCY": IS_CURRENCY,
    # Numbered flag slots FLAG19 through FLAG63.
    "FLAG19": FLAG19,
    "FLAG20": FLAG20, "FLAG21": FLAG21, "FLAG22": FLAG22, "FLAG23": FLAG23,
    "FLAG24": FLAG24, "FLAG25": FLAG25, "FLAG26": FLAG26, "FLAG27": FLAG27,
    "FLAG28": FLAG28, "FLAG29": FLAG29,
    "FLAG30": FLAG30, "FLAG31": FLAG31, "FLAG32": FLAG32, "FLAG33": FLAG33,
    "FLAG34": FLAG34, "FLAG35": FLAG35, "FLAG36": FLAG36, "FLAG37": FLAG37,
    "FLAG38": FLAG38, "FLAG39": FLAG39,
    "FLAG40": FLAG40, "FLAG41": FLAG41, "FLAG42": FLAG42, "FLAG43": FLAG43,
    "FLAG44": FLAG44, "FLAG45": FLAG45, "FLAG46": FLAG46, "FLAG47": FLAG47,
    "FLAG48": FLAG48, "FLAG49": FLAG49,
    "FLAG50": FLAG50, "FLAG51": FLAG51, "FLAG52": FLAG52, "FLAG53": FLAG53,
    "FLAG54": FLAG54, "FLAG55": FLAG55, "FLAG56": FLAG56, "FLAG57": FLAG57,
    "FLAG58": FLAG58, "FLAG59": FLAG59,
    "FLAG60": FLAG60, "FLAG61": FLAG61, "FLAG62": FLAG62, "FLAG63": FLAG63,
    # Remaining named attributes.
    "ID": ID, "ORTH": ORTH, "LOWER": LOWER, "NORM": NORM, "SHAPE": SHAPE,
    "PREFIX": PREFIX, "SUFFIX": SUFFIX, "LENGTH": LENGTH,
    "LEMMA": LEMMA, "POS": POS, "TAG": TAG, "DEP": DEP,
    "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE,
    "ENT_ID": ENT_ID, "ENT_KB_ID": ENT_KB_ID,
    "HEAD": HEAD, "SENT_START": SENT_START, "SPACY": SPACY,
    "LANG": LANG, "MORPH": MORPH, "IDX": IDX,
}
|
|
|
|
|
2016-11-25 13:34:30 +03:00
|
|
|
|
2015-10-06 16:39:50 +03:00
|
|
|
# Attribute names, ordered by ascending integer ID. The sort is stable, so
# names sharing an ID (if any) keep their order of appearance in IDS.
NAMES = sorted(IDS, key=IDS.get)

# Copy every name -> ID pair from IDS into the namespace returned by
# locals(). NOTE(review): in plain Python, locals() at module scope is the
# module namespace, which would make each ID reachable as a module attribute -
# confirm this holds for the compiled module.
locals().update(IDS)
|
2016-11-25 13:34:30 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Legacy key renames applied when ``_do_deprecated`` is set: (old, new).
_DEPRECATED_RENAMES = (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG"))

# Legacy keys that are simply discarded when ``_do_deprecated`` is set.
_DEPRECATED_DROPPED = ("morph", "number", "tenspect")

# Legacy morphology keys discarded when ``_do_deprecated`` is set. Each one is
# matched as written, lower-cased and upper-cased.
_DEPRECATED_MORPH_KEYS = (
    "PunctType", "PunctSide", "Other", "Degree", "AdvType", "Number",
    "VerbForm", "PronType", "Aspect", "Tense", "PartType", "Poss", "Hyph",
    "ConjType", "NumType", "Foreign", "VerbType", "NounType", "Gender",
    "Mood", "Negative", "Voice", "Abbr", "Derivation", "Echo", "NameType",
    "NumForm", "NumValue", "Polite", "StyleVariant", "AdjType", "Person",
    "Variant", "AdpType", "Reflex", "Case", "Polarity", "PrepCase",
    "Animacy",  # U20
)


def _without_deprecated_attrs(stringy_attrs):
    """Return a copy of stringy_attrs with legacy keys renamed or removed."""
    # Work on a shallow copy so the caller's dictionary is left untouched.
    attrs = dict(stringy_attrs)
    for old_key, new_key in _DEPRECATED_RENAMES:
        if old_key in attrs:
            # As before, a legacy key overwrites an existing modern key.
            attrs[new_key] = attrs.pop(old_key)
    for key in _DEPRECATED_DROPPED:
        attrs.pop(key, None)
    for key in _DEPRECATED_MORPH_KEYS:
        for variant in (key, key.lower(), key.upper()):
            attrs.pop(variant, None)
    return attrs


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    Normalize a dictionary of attributes, converting them to ints.

    The input dictionary is never modified.

    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
        can be ints or strings.
    strings_map (StringStore): Defaults to None. If provided, encodes string
        values into ints. An object with an `add` method is used via
        `add(value)`; anything else is indexed with `strings_map[value]`.
    _do_deprecated (bool): Defaults to False. If True, legacy keys are
        translated first ("F" -> "ORTH", "L" -> "LEMMA", "pos" -> "TAG") and
        legacy morphology keys ("morph", "number", "tenspect", "Case", ...)
        are dropped.
    RETURNS (dict): Attributes dictionary with keys and optionally values
        converted to ints. Keys that intify_attr cannot convert are omitted.
    RAISES (ValueError): If the ENT_IOB value is a string that is not one of
        IOB_STRINGS.
    """
    if _do_deprecated:
        stringy_attrs = _without_deprecated_attrs(stringy_attrs)
    inty_attrs = {}
    for name, value in stringy_attrs.items():
        int_key = intify_attr(name)
        if int_key is None:
            # Unknown attribute names are skipped rather than reported.
            continue
        if int_key == ENT_IOB:
            if value in IOB_STRINGS:
                # An IOB tag string becomes its position in IOB_STRINGS.
                value = IOB_STRINGS.index(value)
            elif isinstance(value, str):
                raise ValueError(Errors.E1025.format(value=value))
        if strings_map is not None and isinstance(value, str):
            if hasattr(strings_map, "add"):
                value = strings_map.add(value)
            else:
                value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
|
2019-10-03 15:48:45 +03:00
|
|
|
|
|
|
|
|
|
|
|
def intify_attr(name):
    """
    Normalize an attribute name, converting it to int.

    name (str or int): Attribute string name, looked up in IDS as given and
        then upper-cased. Can also be int (will then be left unchanged).
    RETURNS (int): int representation of the attribute, or None if it couldn't
        be converted (unknown name, or a value that is neither int nor str).
    """
    if isinstance(name, int):
        return name
    if not isinstance(name, str):
        # Neither an ID nor a name: report "not convertible" as documented,
        # instead of failing on .upper() or on hashing the value.
        return None
    if name in IDS:
        return IDS[name]
    # Fall back to the upper-cased spelling; dict.get yields None on a miss.
    return IDS.get(name.upper())
|