mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Minor refactor for Morphology and MorphAnalysis (#5804)
* `MorphAnalysis.get` returns only the field values
* Move `_normalize_props` inside `Morphology` as `Morphology.normalize_attrs` and simplify
* Simplify POS field detection/conversion
* Convert all non-POS features to strings
* `Morphology` returns an empty string for a missing morph to align with the FEATS string returned for an existing morph
* Remove unused `list_to_feats`
This commit is contained in:
parent
d0c6d1efc5
commit
fdb8815ef5
|
@ -18,43 +18,14 @@ from .util import ensure_path
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
|
||||||
"""Convert attrs dict so that POS is always by ID, other features are left
|
|
||||||
as is as long as they are strings or IDs.
|
|
||||||
"""
|
|
||||||
out = {}
|
|
||||||
props = dict(props)
|
|
||||||
for key, value in props.items():
|
|
||||||
# convert POS value to ID
|
|
||||||
if key == POS:
|
|
||||||
if hasattr(value, 'upper'):
|
|
||||||
value = value.upper()
|
|
||||||
if value in POS_IDS:
|
|
||||||
value = POS_IDS[value]
|
|
||||||
out[key] = value
|
|
||||||
elif isinstance(key, str) and key.lower() == 'pos':
|
|
||||||
out[POS] = POS_IDS[value.upper()]
|
|
||||||
# sort values
|
|
||||||
elif isinstance(value, str) and Morphology.VALUE_SEP in value:
|
|
||||||
out[key] = Morphology.VALUE_SEP.join(
|
|
||||||
sorted(value.split(Morphology.VALUE_SEP)))
|
|
||||||
# accept any string or ID fields and values
|
|
||||||
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
|
|
||||||
out[key] = value
|
|
||||||
else:
|
|
||||||
warnings.warn(Warnings.W100.format(feature={key: value}))
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
'''Store the possible morphological analyses for a language, and index them
|
"""Store the possible morphological analyses for a language, and index them
|
||||||
by hash.
|
by hash.
|
||||||
|
|
||||||
To save space on each token, tokens only know the hash of their morphological
|
To save space on each token, tokens only know the hash of their
|
||||||
analysis, so queries of morphological attributes are delegated
|
morphological analysis, so queries of morphological attributes are delegated
|
||||||
to this class.
|
to this class.
|
||||||
'''
|
"""
|
||||||
|
|
||||||
FEATURE_SEP = "|"
|
FEATURE_SEP = "|"
|
||||||
FIELD_SEP = "="
|
FIELD_SEP = "="
|
||||||
VALUE_SEP = ","
|
VALUE_SEP = ","
|
||||||
|
@ -86,7 +57,7 @@ cdef class Morphology:
|
||||||
tag_map = dict(tag_map)
|
tag_map = dict(tag_map)
|
||||||
tag_map['_SP'] = space_attrs
|
tag_map['_SP'] = space_attrs
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
attrs = _normalize_props(attrs)
|
attrs = self.normalize_attrs(attrs)
|
||||||
self.add(attrs)
|
self.add(attrs)
|
||||||
self.tag_map[tag_str] = dict(attrs)
|
self.tag_map[tag_str] = dict(attrs)
|
||||||
self.reverse_index[self.strings.add(tag_str)] = i
|
self.reverse_index[self.strings.add(tag_str)] = i
|
||||||
|
@ -138,7 +109,7 @@ cdef class Morphology:
|
||||||
return tag.key
|
return tag.key
|
||||||
|
|
||||||
def normalize_features(self, features):
|
def normalize_features(self, features):
|
||||||
"""Create a normalized UFEATS string from a features string or dict.
|
"""Create a normalized FEATS string from a features string or dict.
|
||||||
|
|
||||||
features (Union[dict, str]): Features as dict or UFEATS string.
|
features (Union[dict, str]): Features as dict or UFEATS string.
|
||||||
RETURNS (str): Features as normalized UFEATS string.
|
RETURNS (str): Features as normalized UFEATS string.
|
||||||
|
@ -148,7 +119,7 @@ cdef class Morphology:
|
||||||
if not isinstance(features, dict):
|
if not isinstance(features, dict):
|
||||||
warnings.warn(Warnings.W100.format(feature=features))
|
warnings.warn(Warnings.W100.format(feature=features))
|
||||||
features = {}
|
features = {}
|
||||||
features = _normalize_props(features)
|
features = self.normalize_attrs(features)
|
||||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||||
# normalized UFEATS string with sorted fields and values
|
# normalized UFEATS string with sorted fields and values
|
||||||
norm_feats_string = self.FEATURE_SEP.join(sorted([
|
norm_feats_string = self.FEATURE_SEP.join(sorted([
|
||||||
|
@ -157,6 +128,33 @@ cdef class Morphology:
|
||||||
]))
|
]))
|
||||||
return norm_feats_string or self.EMPTY_MORPH
|
return norm_feats_string or self.EMPTY_MORPH
|
||||||
|
|
||||||
|
def normalize_attrs(self, attrs):
|
||||||
|
"""Convert attrs dict so that POS is always by ID, other features are
|
||||||
|
by string. Values separated by VALUE_SEP are sorted.
|
||||||
|
"""
|
||||||
|
out = {}
|
||||||
|
attrs = dict(attrs)
|
||||||
|
for key, value in attrs.items():
|
||||||
|
# convert POS value to ID
|
||||||
|
if key == POS or (isinstance(key, str) and key.upper() == "POS"):
|
||||||
|
if isinstance(value, str) and value.upper() in POS_IDS:
|
||||||
|
value = POS_IDS[value.upper()]
|
||||||
|
elif isinstance(value, int) and value not in POS_IDS.values():
|
||||||
|
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||||
|
continue
|
||||||
|
out[POS] = value
|
||||||
|
# accept any string or ID fields and values and convert to strings
|
||||||
|
elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
|
||||||
|
key = self.strings.as_string(key)
|
||||||
|
value = self.strings.as_string(value)
|
||||||
|
# sort values
|
||||||
|
if self.VALUE_SEP in value:
|
||||||
|
value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
|
||||||
|
out[key] = value
|
||||||
|
else:
|
||||||
|
warnings.warn(Warnings.W100.format(feature={key: value}))
|
||||||
|
return out
|
||||||
|
|
||||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
|
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
|
||||||
"""Creates a MorphAnalysisC from a list of intified
|
"""Creates a MorphAnalysisC from a list of intified
|
||||||
("Field", "Field=Value") tuples where fields with multiple values have
|
("Field", "Field=Value") tuples where fields with multiple values have
|
||||||
|
@ -183,7 +181,7 @@ cdef class Morphology:
|
||||||
def get(self, hash_t morph):
|
def get(self, hash_t morph):
|
||||||
tag = <MorphAnalysisC*>self.tags.get(morph)
|
tag = <MorphAnalysisC*>self.tags.get(morph)
|
||||||
if tag == NULL:
|
if tag == NULL:
|
||||||
return []
|
return ""
|
||||||
else:
|
else:
|
||||||
return self.strings[tag.key]
|
return self.strings[tag.key]
|
||||||
|
|
||||||
|
@ -218,7 +216,7 @@ cdef class Morphology:
|
||||||
orth (str): The word-form to key the exception.
|
orth (str): The word-form to key the exception.
|
||||||
"""
|
"""
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
attrs = _normalize_props(attrs)
|
attrs = self.normalize_attrs(attrs)
|
||||||
self.add(attrs)
|
self.add(attrs)
|
||||||
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
|
||||||
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
|
self._exc[(tag_str, self.strings.add(orth_str))] = attrs
|
||||||
|
@ -282,7 +280,7 @@ cdef class Morphology:
|
||||||
# Map (form, pos) to attributes
|
# Map (form, pos) to attributes
|
||||||
for tag, exc in morph_rules.items():
|
for tag, exc in morph_rules.items():
|
||||||
for orth, attrs in exc.items():
|
for orth, attrs in exc.items():
|
||||||
attrs = _normalize_props(attrs)
|
attrs = self.normalize_attrs(attrs)
|
||||||
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -309,19 +307,6 @@ cdef class Morphology:
|
||||||
return ""
|
return ""
|
||||||
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def list_to_feats(feats_list):
|
|
||||||
if len(feats_list) == 0:
|
|
||||||
return ""
|
|
||||||
feats_dict = {}
|
|
||||||
for feat in feats_list:
|
|
||||||
field, value = feat.split(Morphology.FIELD_SEP)
|
|
||||||
if field not in feats_dict:
|
|
||||||
feats_dict[field] = set()
|
|
||||||
feats_dict[field].add(value)
|
|
||||||
feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()}
|
|
||||||
return Morphology.dict_to_feats(feats_dict)
|
|
||||||
|
|
||||||
|
|
||||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
|
||||||
cdef int i
|
cdef int i
|
||||||
|
|
|
@ -35,7 +35,7 @@ def test_token_morph_key(i_has):
|
||||||
|
|
||||||
|
|
||||||
def test_morph_props(i_has):
|
def test_morph_props(i_has):
|
||||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||||
assert i_has[1].morph.get("PronType") == []
|
assert i_has[1].morph.get("PronType") == []
|
||||||
|
|
||||||
|
|
||||||
|
@ -47,20 +47,20 @@ def test_morph_iter(i_has):
|
||||||
|
|
||||||
|
|
||||||
def test_morph_get(i_has):
|
def test_morph_get(i_has):
|
||||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||||
|
|
||||||
|
|
||||||
def test_morph_set(i_has):
|
def test_morph_set(i_has):
|
||||||
assert i_has[0].morph.get("PronType") == ["PronType=prs"]
|
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||||
# set by string
|
# set by string
|
||||||
i_has[0].morph_ = "PronType=unk"
|
i_has[0].morph_ = "PronType=unk"
|
||||||
assert i_has[0].morph.get("PronType") == ["PronType=unk"]
|
assert i_has[0].morph.get("PronType") == ["unk"]
|
||||||
# set by string, fields are alphabetized
|
# set by string, fields are alphabetized
|
||||||
i_has[0].morph_ = "PronType=123|NounType=unk"
|
i_has[0].morph_ = "PronType=123|NounType=unk"
|
||||||
assert i_has[0].morph_ == "NounType=unk|PronType=123"
|
assert i_has[0].morph_ == "NounType=unk|PronType=123"
|
||||||
# set by dict
|
# set by dict
|
||||||
i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"}
|
i_has[0].morph_ = {"AType": "123", "BType": "unk"}
|
||||||
assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ"
|
assert i_has[0].morph_ == "AType=123|BType=unk"
|
||||||
# set by string with multiple values, fields and values are alphabetized
|
# set by string with multiple values, fields and values are alphabetized
|
||||||
i_has[0].morph_ = "BType=c|AType=b,a"
|
i_has[0].morph_ = "BType=c|AType=b,a"
|
||||||
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
assert i_has[0].morph_ == "AType=a,b|BType=c"
|
||||||
|
|
|
@ -4,10 +4,8 @@ from spacy.morphology import Morphology
|
||||||
def test_feats_converters():
|
def test_feats_converters():
|
||||||
feats = "Case=dat,gen|Number=sing"
|
feats = "Case=dat,gen|Number=sing"
|
||||||
feats_dict = {"Case": "dat,gen", "Number": "sing"}
|
feats_dict = {"Case": "dat,gen", "Number": "sing"}
|
||||||
feats_list = feats.split(Morphology.FEATURE_SEP)
|
|
||||||
|
|
||||||
# simple conversions
|
# simple conversions
|
||||||
assert Morphology.list_to_feats(feats_list) == feats
|
|
||||||
assert Morphology.dict_to_feats(feats_dict) == feats
|
assert Morphology.dict_to_feats(feats_dict) == feats
|
||||||
assert Morphology.feats_to_dict(feats) == feats_dict
|
assert Morphology.feats_to_dict(feats) == feats_dict
|
||||||
|
|
||||||
|
@ -18,8 +16,6 @@ def test_feats_converters():
|
||||||
# unsorted input is normalized
|
# unsorted input is normalized
|
||||||
unsorted_feats = "Number=sing|Case=gen,dat"
|
unsorted_feats = "Number=sing|Case=gen,dat"
|
||||||
unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
|
unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
|
||||||
unsorted_feats_list = feats.split(Morphology.FEATURE_SEP)
|
|
||||||
assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
|
assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
|
||||||
assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
|
assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
|
||||||
assert Morphology.list_to_feats(unsorted_feats_list) == feats
|
|
||||||
assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
|
assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
|
||||||
|
|
|
@ -2,6 +2,7 @@ from libc.string cimport memset
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..morphology import Morphology
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..typedefs cimport hash_t, attr_t
|
from ..typedefs cimport hash_t, attr_t
|
||||||
from ..morphology cimport list_features, check_feature, get_by_field
|
from ..morphology cimport list_features, check_feature, get_by_field
|
||||||
|
@ -58,10 +59,11 @@ cdef class MorphAnalysis:
|
||||||
return self.key != other.key
|
return self.key != other.key
|
||||||
|
|
||||||
def get(self, field):
|
def get(self, field):
|
||||||
"""Retrieve a feature by field."""
|
"""Retrieve feature values by field."""
|
||||||
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
||||||
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
||||||
return [self.vocab.strings[result] for result in results]
|
features = [self.vocab.strings[result] for result in results]
|
||||||
|
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
"""Produce a json serializable representation as a UD FEATS-style
|
"""Produce a json serializable representation as a UD FEATS-style
|
||||||
|
|
Loading…
Reference in New Issue
Block a user