mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-22 02:44:15 +03:00
Remove intify_attrs(_do_deprecated) (#11319)
This commit is contained in:
parent
551e73ccfc
commit
d757dec5c4
|
@ -97,7 +97,7 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||||
locals().update(IDS)
|
locals().update(IDS)
|
||||||
|
|
||||||
|
|
||||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
def intify_attrs(stringy_attrs, strings_map=None):
|
||||||
"""
|
"""
|
||||||
Normalize a dictionary of attributes, converting them to ints.
|
Normalize a dictionary of attributes, converting them to ints.
|
||||||
|
|
||||||
|
@ -109,75 +109,6 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
converted to ints.
|
converted to ints.
|
||||||
"""
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
|
||||||
if "F" in stringy_attrs:
|
|
||||||
stringy_attrs["ORTH"] = stringy_attrs.pop("F")
|
|
||||||
if "L" in stringy_attrs:
|
|
||||||
stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
|
|
||||||
if "pos" in stringy_attrs:
|
|
||||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
|
||||||
if "morph" in stringy_attrs:
|
|
||||||
morphs = stringy_attrs.pop("morph")
|
|
||||||
if "number" in stringy_attrs:
|
|
||||||
stringy_attrs.pop("number")
|
|
||||||
if "tenspect" in stringy_attrs:
|
|
||||||
stringy_attrs.pop("tenspect")
|
|
||||||
morph_keys = [
|
|
||||||
"PunctType",
|
|
||||||
"PunctSide",
|
|
||||||
"Other",
|
|
||||||
"Degree",
|
|
||||||
"AdvType",
|
|
||||||
"Number",
|
|
||||||
"VerbForm",
|
|
||||||
"PronType",
|
|
||||||
"Aspect",
|
|
||||||
"Tense",
|
|
||||||
"PartType",
|
|
||||||
"Poss",
|
|
||||||
"Hyph",
|
|
||||||
"ConjType",
|
|
||||||
"NumType",
|
|
||||||
"Foreign",
|
|
||||||
"VerbType",
|
|
||||||
"NounType",
|
|
||||||
"Gender",
|
|
||||||
"Mood",
|
|
||||||
"Negative",
|
|
||||||
"Tense",
|
|
||||||
"Voice",
|
|
||||||
"Abbr",
|
|
||||||
"Derivation",
|
|
||||||
"Echo",
|
|
||||||
"Foreign",
|
|
||||||
"NameType",
|
|
||||||
"NounType",
|
|
||||||
"NumForm",
|
|
||||||
"NumValue",
|
|
||||||
"PartType",
|
|
||||||
"Polite",
|
|
||||||
"StyleVariant",
|
|
||||||
"PronType",
|
|
||||||
"AdjType",
|
|
||||||
"Person",
|
|
||||||
"Variant",
|
|
||||||
"AdpType",
|
|
||||||
"Reflex",
|
|
||||||
"Negative",
|
|
||||||
"Mood",
|
|
||||||
"Aspect",
|
|
||||||
"Case",
|
|
||||||
"Polarity",
|
|
||||||
"PrepCase",
|
|
||||||
"Animacy", # U20
|
|
||||||
]
|
|
||||||
for key in morph_keys:
|
|
||||||
if key in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key)
|
|
||||||
elif key.lower() in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key.lower())
|
|
||||||
elif key.upper() in stringy_attrs:
|
|
||||||
stringy_attrs.pop(key.upper())
|
|
||||||
for name, value in stringy_attrs.items():
|
for name, value in stringy_attrs.items():
|
||||||
int_key = intify_attr(name)
|
int_key = intify_attr(name)
|
||||||
if int_key is not None:
|
if int_key is not None:
|
||||||
|
|
|
@ -26,14 +26,6 @@ def test_attrs_idempotence(text):
|
||||||
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("text", ["dog"])
|
|
||||||
def test_attrs_do_deprecated(text):
|
|
||||||
int_attrs = intify_attrs(
|
|
||||||
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
|
|
||||||
)
|
|
||||||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
|
||||||
|
|
||||||
|
|
||||||
def test_attrs_ent_iob_intify():
|
def test_attrs_ent_iob_intify():
|
||||||
int_attrs = intify_attrs({"ENT_IOB": ""})
|
int_attrs = intify_attrs({"ENT_IOB": ""})
|
||||||
assert int_attrs == {ENT_IOB: 0}
|
assert int_attrs == {ENT_IOB: 0}
|
||||||
|
|
|
@ -582,7 +582,7 @@ cdef class Tokenizer:
|
||||||
substrings (iterable): A sequence of dicts, where each dict describes
|
substrings (iterable): A sequence of dicts, where each dict describes
|
||||||
a token and its attributes.
|
a token and its attributes.
|
||||||
"""
|
"""
|
||||||
attrs = [intify_attrs(spec, _do_deprecated=True) for spec in substrings]
|
attrs = [intify_attrs(spec) for spec in substrings]
|
||||||
orth = "".join([spec[ORTH] for spec in attrs])
|
orth = "".join([spec[ORTH] for spec in attrs])
|
||||||
if chunk != orth:
|
if chunk != orth:
|
||||||
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
raise ValueError(Errors.E997.format(chunk=chunk, orth=orth, token_attrs=substrings))
|
||||||
|
@ -650,7 +650,7 @@ cdef class Tokenizer:
|
||||||
url_match = re.compile("a^").match
|
url_match = re.compile("a^").match
|
||||||
special_cases = {}
|
special_cases = {}
|
||||||
for orth, special_tokens in self.rules.items():
|
for orth, special_tokens in self.rules.items():
|
||||||
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings, _do_deprecated=True) for special_token in special_tokens]
|
special_cases[orth] = [intify_attrs(special_token, strings_map=self.vocab.strings) for special_token in special_tokens]
|
||||||
tokens = []
|
tokens = []
|
||||||
for substring in text.split():
|
for substring in text.split():
|
||||||
suffixes = []
|
suffixes = []
|
||||||
|
|
|
@ -268,8 +268,7 @@ cdef class Vocab:
|
||||||
cdef int i
|
cdef int i
|
||||||
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
|
||||||
for i, props in enumerate(substrings):
|
for i, props in enumerate(substrings):
|
||||||
props = intify_attrs(props, strings_map=self.strings,
|
props = intify_attrs(props, strings_map=self.strings)
|
||||||
_do_deprecated=True)
|
|
||||||
token = &tokens[i]
|
token = &tokens[i]
|
||||||
# Set the special tokens up to have arbitrary attributes
|
# Set the special tokens up to have arbitrary attributes
|
||||||
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
|
||||||
|
|
Loading…
Reference in New Issue
Block a user