Minor refactor for Morphology and MorphAnalysis (#5804)

* `MorphAnalysis.get` returns only the field values
* Move `_normalize_props` inside `Morphology` as
`Morphology.normalize_attrs` and simplify
  * Simplify POS field detection/conversion
  * Convert all non-POS features to strings
* `Morphology` returns an empty string for a missing morph to align
with the FEATS string returned for an existing morph
* Remove unused `list_to_feats`
This commit is contained in:
Adriane Boyd 2020-07-24 09:28:06 +02:00 committed by GitHub
parent d0c6d1efc5
commit fdb8815ef5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 64 deletions

View File

@ -18,43 +18,14 @@ from .util import ensure_path
from . import symbols from . import symbols
def _normalize_props(props):
    """Return a normalized copy of an attrs mapping.

    The part-of-speech entry is stored under the ``POS`` key, with its value
    mapped through ``POS_IDS`` where possible. String values containing
    ``Morphology.VALUE_SEP`` have their parts sorted. Any other entry is
    copied unchanged when both key and value are ints or strings.

    props: Mapping (or iterable of pairs) of attribute keys to values.
    RETURNS (dict): The normalized attributes. Entries that match none of
        the accepted shapes are left out and reported with warning W100.
    """
    value_sep = Morphology.VALUE_SEP
    normalized = {}
    for name, val in dict(props).items():
        # POS given by its symbol ID: upper-case string values, then map
        # known tags to their IDs; anything else is stored as is.
        if name == POS:
            if hasattr(val, 'upper'):
                val = val.upper()
            normalized[name] = POS_IDS[val] if val in POS_IDS else val
            continue
        # POS given by name (any casing): the value must be a known tag,
        # the lookup is not guarded.
        if isinstance(name, str) and name.lower() == 'pos':
            normalized[POS] = POS_IDS[val.upper()]
            continue
        # Multi-valued feature: store the values in sorted order.
        if isinstance(val, str) and value_sep in val:
            parts = val.split(value_sep)
            parts.sort()
            normalized[name] = value_sep.join(parts)
            continue
        # Plain string/ID field with a string/ID value: keep untouched.
        if isinstance(name, (int, str)) and isinstance(val, (int, str)):
            normalized[name] = val
            continue
        warnings.warn(Warnings.W100.format(feature={name: val}))
    return normalized
cdef class Morphology: cdef class Morphology:
'''Store the possible morphological analyses for a language, and index them """Store the possible morphological analyses for a language, and index them
by hash. by hash.
To save space on each token, tokens only know the hash of their morphological To save space on each token, tokens only know the hash of their
analysis, so queries of morphological attributes are delegated morphological analysis, so queries of morphological attributes are delegated
to this class. to this class.
''' """
FEATURE_SEP = "|" FEATURE_SEP = "|"
FIELD_SEP = "=" FIELD_SEP = "="
VALUE_SEP = "," VALUE_SEP = ","
@ -86,7 +57,7 @@ cdef class Morphology:
tag_map = dict(tag_map) tag_map = dict(tag_map)
tag_map['_SP'] = space_attrs tag_map['_SP'] = space_attrs
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs) attrs = self.normalize_attrs(attrs)
self.add(attrs) self.add(attrs)
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i self.reverse_index[self.strings.add(tag_str)] = i
@ -138,7 +109,7 @@ cdef class Morphology:
return tag.key return tag.key
def normalize_features(self, features): def normalize_features(self, features):
"""Create a normalized UFEATS string from a features string or dict. """Create a normalized FEATS string from a features string or dict.
features (Union[dict, str]): Features as dict or UFEATS string. features (Union[dict, str]): Features as dict or UFEATS string.
RETURNS (str): Features as normalized UFEATS string. RETURNS (str): Features as normalized UFEATS string.
@ -148,7 +119,7 @@ cdef class Morphology:
if not isinstance(features, dict): if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features)) warnings.warn(Warnings.W100.format(feature=features))
features = {} features = {}
features = _normalize_props(features) features = self.normalize_attrs(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values # normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(sorted([ norm_feats_string = self.FEATURE_SEP.join(sorted([
@ -157,6 +128,33 @@ cdef class Morphology:
])) ]))
return norm_feats_string or self.EMPTY_MORPH return norm_feats_string or self.EMPTY_MORPH
def normalize_attrs(self, attrs):
    """Normalize an attrs dict: POS by ID, every other feature by string.

    A key equal to ``POS``, or a string key spelling "pos" in any casing,
    is stored under ``POS``. String values found in ``POS_IDS`` (after
    upper-casing) are replaced by their ID. All other int/str keys and
    values are converted with ``self.strings.as_string``, and values
    containing ``VALUE_SEP`` have their parts sorted.

    attrs: Mapping (or iterable of pairs) of attribute keys to values.
    RETURNS (dict): The normalized attributes. Entries that cannot be
        normalized are skipped and reported with warning W100.
    """
    normalized = {}
    for name, val in dict(attrs).items():
        is_pos_field = name == POS or (
            isinstance(name, str) and name.upper() == "POS"
        )
        if is_pos_field:
            if isinstance(val, str) and val.upper() in POS_IDS:
                val = POS_IDS[val.upper()]
            elif isinstance(val, int) and val not in POS_IDS.values():
                # An integer that is not a known POS ID is rejected.
                warnings.warn(Warnings.W100.format(feature={name: val}))
                continue
            normalized[POS] = val
            continue
        # Only string or ID fields and values are accepted from here on.
        if not (isinstance(name, (int, str)) and isinstance(val, (int, str))):
            warnings.warn(Warnings.W100.format(feature={name: val}))
            continue
        field = self.strings.as_string(name)
        text = self.strings.as_string(val)
        # Keep multi-valued features in sorted order.
        if self.VALUE_SEP in text:
            text = self.VALUE_SEP.join(sorted(text.split(self.VALUE_SEP)))
        normalized[field] = text
    return normalized
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
"""Creates a MorphAnalysisC from a list of intified """Creates a MorphAnalysisC from a list of intified
("Field", "Field=Value") tuples where fields with multiple values have ("Field", "Field=Value") tuples where fields with multiple values have
@ -183,7 +181,7 @@ cdef class Morphology:
def get(self, hash_t morph): def get(self, hash_t morph):
tag = <MorphAnalysisC*>self.tags.get(morph) tag = <MorphAnalysisC*>self.tags.get(morph)
if tag == NULL: if tag == NULL:
return [] return ""
else: else:
return self.strings[tag.key] return self.strings[tag.key]
@ -218,7 +216,7 @@ cdef class Morphology:
orth (str): The word-form to key the exception. orth (str): The word-form to key the exception.
""" """
attrs = dict(attrs) attrs = dict(attrs)
attrs = _normalize_props(attrs) attrs = self.normalize_attrs(attrs)
self.add(attrs) self.add(attrs)
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self._exc[(tag_str, self.strings.add(orth_str))] = attrs self._exc[(tag_str, self.strings.add(orth_str))] = attrs
@ -282,7 +280,7 @@ cdef class Morphology:
# Map (form, pos) to attributes # Map (form, pos) to attributes
for tag, exc in morph_rules.items(): for tag, exc in morph_rules.items():
for orth, attrs in exc.items(): for orth, attrs in exc.items():
attrs = _normalize_props(attrs) attrs = self.normalize_attrs(attrs)
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs) self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
@property @property
@ -309,19 +307,6 @@ cdef class Morphology:
return "" return ""
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
@staticmethod
def list_to_feats(feats_list):
    """Build a feats string from a list of "Field=Value" entries.

    Entries sharing a field are merged: their values are de-duplicated,
    sorted and joined with ``Morphology.VALUE_SEP``. The resulting dict is
    rendered by ``Morphology.dict_to_feats``.

    feats_list: Sized iterable of strings, each containing exactly one
        ``Morphology.FIELD_SEP``; any other count raises ValueError when
        the split result is unpacked.
    RETURNS (str): The feats string, or "" for an empty list.
    """
    if not len(feats_list):
        return ""
    values_by_field = {}
    for entry in feats_list:
        name, val = entry.split(Morphology.FIELD_SEP)
        values_by_field.setdefault(name, set()).add(val)
    joined = {}
    for name, vals in values_by_field.items():
        joined[name] = Morphology.VALUE_SEP.join(sorted(vals))
    return Morphology.dict_to_feats(joined)
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
cdef int i cdef int i

View File

@ -35,7 +35,7 @@ def test_token_morph_key(i_has):
def test_morph_props(i_has): def test_morph_props(i_has):
assert i_has[0].morph.get("PronType") == ["PronType=prs"] assert i_has[0].morph.get("PronType") == ["prs"]
assert i_has[1].morph.get("PronType") == [] assert i_has[1].morph.get("PronType") == []
@ -47,20 +47,20 @@ def test_morph_iter(i_has):
def test_morph_get(i_has): def test_morph_get(i_has):
assert i_has[0].morph.get("PronType") == ["PronType=prs"] assert i_has[0].morph.get("PronType") == ["prs"]
def test_morph_set(i_has): def test_morph_set(i_has):
assert i_has[0].morph.get("PronType") == ["PronType=prs"] assert i_has[0].morph.get("PronType") == ["prs"]
# set by string # set by string
i_has[0].morph_ = "PronType=unk" i_has[0].morph_ = "PronType=unk"
assert i_has[0].morph.get("PronType") == ["PronType=unk"] assert i_has[0].morph.get("PronType") == ["unk"]
# set by string, fields are alphabetized # set by string, fields are alphabetized
i_has[0].morph_ = "PronType=123|NounType=unk" i_has[0].morph_ = "PronType=123|NounType=unk"
assert i_has[0].morph_ == "NounType=unk|PronType=123" assert i_has[0].morph_ == "NounType=unk|PronType=123"
# set by dict # set by dict
i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"} i_has[0].morph_ = {"AType": "123", "BType": "unk"}
assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ" assert i_has[0].morph_ == "AType=123|BType=unk"
# set by string with multiple values, fields and values are alphabetized # set by string with multiple values, fields and values are alphabetized
i_has[0].morph_ = "BType=c|AType=b,a" i_has[0].morph_ = "BType=c|AType=b,a"
assert i_has[0].morph_ == "AType=a,b|BType=c" assert i_has[0].morph_ == "AType=a,b|BType=c"

View File

@ -4,10 +4,8 @@ from spacy.morphology import Morphology
def test_feats_converters(): def test_feats_converters():
feats = "Case=dat,gen|Number=sing" feats = "Case=dat,gen|Number=sing"
feats_dict = {"Case": "dat,gen", "Number": "sing"} feats_dict = {"Case": "dat,gen", "Number": "sing"}
feats_list = feats.split(Morphology.FEATURE_SEP)
# simple conversions # simple conversions
assert Morphology.list_to_feats(feats_list) == feats
assert Morphology.dict_to_feats(feats_dict) == feats assert Morphology.dict_to_feats(feats_dict) == feats
assert Morphology.feats_to_dict(feats) == feats_dict assert Morphology.feats_to_dict(feats) == feats_dict
@ -18,8 +16,6 @@ def test_feats_converters():
# unsorted input is normalized # unsorted input is normalized
unsorted_feats = "Number=sing|Case=gen,dat" unsorted_feats = "Number=sing|Case=gen,dat"
unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"} unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
unsorted_feats_list = feats.split(Morphology.FEATURE_SEP)
assert Morphology.feats_to_dict(unsorted_feats) == feats_dict assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
assert Morphology.dict_to_feats(unsorted_feats_dict) == feats assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
assert Morphology.list_to_feats(unsorted_feats_list) == feats
assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats

View File

@ -2,6 +2,7 @@ from libc.string cimport memset
cimport numpy as np cimport numpy as np
from ..errors import Errors from ..errors import Errors
from ..morphology import Morphology
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t from ..typedefs cimport hash_t, attr_t
from ..morphology cimport list_features, check_feature, get_by_field from ..morphology cimport list_features, check_feature, get_by_field
@ -58,10 +59,11 @@ cdef class MorphAnalysis:
return self.key != other.key return self.key != other.key
def get(self, field): def get(self, field):
"""Retrieve a feature by field.""" """Retrieve feature values by field."""
cdef attr_t field_id = self.vocab.strings.as_int(field) cdef attr_t field_id = self.vocab.strings.as_int(field)
cdef np.ndarray results = get_by_field(&self.c, field_id) cdef np.ndarray results = get_by_field(&self.c, field_id)
return [self.vocab.strings[result] for result in results] features = [self.vocab.strings[result] for result in results]
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
def to_json(self): def to_json(self):
"""Produce a json serializable representation as a UD FEATS-style """Produce a json serializable representation as a UD FEATS-style