2016-12-18 19:35:22 +03:00
|
|
|
# cython: infer_types
|
2023-09-12 09:49:41 +03:00
|
|
|
# cython: profile=False
|
2020-02-28 14:20:23 +03:00
|
|
|
import warnings
|
2023-06-26 12:41:03 +03:00
|
|
|
from typing import Dict, List, Optional, Tuple, Union
|
|
|
|
|
|
|
|
import numpy
|
|
|
|
|
2022-07-15 12:14:08 +03:00
|
|
|
from cython.operator cimport dereference as deref
|
|
|
|
from libcpp.memory cimport shared_ptr
|
2016-12-18 16:59:44 +03:00
|
|
|
|
2020-03-02 13:48:10 +03:00
|
|
|
from . import symbols
|
2023-06-26 12:41:03 +03:00
|
|
|
from .errors import Warnings
|
2015-08-28 00:11:51 +03:00
|
|
|
|
2018-09-25 00:57:41 +03:00
|
|
|
|
2015-08-26 20:17:21 +03:00
|
|
|
cdef class Morphology:
    """Table of the morphological analyses known for a language, keyed by
    hash.

    Tokens only keep the hash of their morphological analysis in order to
    save space, so every query about morphological attributes is answered by
    this class.
    """
    FEATURE_SEP = "|"
    FIELD_SEP = "="
    VALUE_SEP = ","
    # not an empty string so we can distinguish unset morph from empty morph
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
        """Create an empty morphology table.

        strings (StringStore): String store used to intern feature strings.
        """
        self.strings = strings

    def __reduce__(self):
        """Pickle support: rebuild from the string store plus the sorted
        analysis strings that `get` returns for the entries of the store.
        """
        # `get` returns "" for hashes without an analysis; drop that marker.
        known_tags = {self.get(self.strings[s]) for s in self.strings}
        known_tags.discard("")
        return (unpickle_morphology, (self.strings, sorted(known_tags)), None, None)

    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
        """Return the stored analysis for `tag_hash`, or an empty (null)
        shared pointer if the hash is not in the table.
        """
        entry = self.tags.find(tag_hash)
        if entry != self.tags.const_end():
            return deref(entry).second
        return shared_ptr[MorphAnalysisC]()

    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
        """Convert one field/value pair to its string form.

        attr_key (Union[int, str]): Field as string or string-store ID.
        attr_value (Union[int, str]): Value(s) as string or string-store ID.
        RETURNS (Optional[Tuple]): `(field, value)`, where `value` is a sorted
            list if it contained `VALUE_SEP`. None (after warning W100) if
            either argument is neither an int nor a str.
        """
        if not (isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str))):
            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
            return None

        key = self.strings.as_string(attr_key)
        value = self.strings.as_string(attr_value)

        # Preserve multiple values as a list
        if self.VALUE_SEP in value:
            return key, sorted(value.split(self.VALUE_SEP))
        return key, value

    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
        """Parse a FEATS-style string into a dict ordered by field name.

        feats (str): Features joined by `FEATURE_SEP`, each `field=value(s)`.
        RETURNS (dict): Normalized features; multi-valued fields map to a
            sorted list of values. Empty for "" and for `EMPTY_MORPH`.
        """
        if not feats or feats == self.EMPTY_MORPH:
            return {}

        pairs = []
        for feat in feats.split(self.FEATURE_SEP):
            field, values = feat.split(self.FIELD_SEP, 1)
            attr = self._normalize_attr(field, values)
            # Pairs that could not be normalized were already warned about.
            if attr is not None:
                pairs.append(attr)
        return dict(sorted(pairs, key=lambda pair: pair[0]))

    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
        """Normalize a feature dict into a dict ordered by field name.

        feats (dict): Maps fields to values, each a string or string-store ID.
        RETURNS (dict): Normalized features; multi-valued fields map to a
            sorted list of values. Pairs rejected by `_normalize_attr` are
            left out.
        """
        pairs = []
        for field, values in feats.items():
            attr = self._normalize_attr(field, values)
            if attr is not None:
                pairs.append(attr)
        return dict(sorted(pairs, key=lambda pair: pair[0]))

    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
        """Serialize a normalized feature dict.

        feats (dict): Normalized features (values are strings or lists of
            strings).
        RETURNS (str): The features joined with the class separators, or
            `EMPTY_MORPH` if there are none.
        """
        parts = []
        for field, values in feats.items():
            value_str = self.VALUE_SEP.join(values) if isinstance(values, list) else values
            parts.append(self.FIELD_SEP.join([field, value_str]))
        return self.FEATURE_SEP.join(parts) or self.EMPTY_MORPH

    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.
        Returns the hash of the new analysis.
        """
        cdef hash_t tag_hash = 0
        cdef shared_ptr[MorphAnalysisC] tag

        if isinstance(features, str):
            raw_feats = self.EMPTY_MORPH if features == "" else features

            # Shortcut: the string as given may already be a known analysis.
            tag_hash = self.strings[raw_feats]
            tag = self._lookup_tag(tag_hash)
            if tag:
                return deref(tag).key

            feat_dict = self._str_to_normalized_feat_dict(raw_feats)
        elif isinstance(features, dict):
            feat_dict = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            feat_dict = {}

        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder
        feats_str = self._normalized_feat_dict_to_str(feat_dict)
        tag_hash = self.strings.add(feats_str)
        tag = self._lookup_tag(tag_hash)
        if tag:
            return deref(tag).key

        self._intern_morph_tag(tag_hash, feat_dict)
        return tag_hash

    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
        """Build the analysis for a normalized feature dict and store it in
        the table under `tag_key`.
        """
        # intified ("Field", "Field=Value") pairs where fields with multiple values have
        # been split into individual tuples, e.g.:
        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
        # ("Field2", "Field2=Value3")]
        pairs = []

        # Feat dict is normalized at this point.
        for field, values in feats.items():
            field_key = self.strings.add(field)
            prefix = field + self.FIELD_SEP
            if isinstance(values, list):
                for value in values:
                    pairs.append((field_key, self.strings.add(prefix + value)))
            else:
                # We could box scalar values into a list and use a common
                # code path to generate features but that incurs a small
                # but measurable allocation/iteration overhead (as this
                # branch is taken often enough).
                pairs.append((field_key, self.strings.add(prefix + values)))

        n_pairs = len(pairs)
        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
        deref(tag).key = tag_key
        deref(tag).features.resize(n_pairs)

        for idx, (feat_field, feat_value) in enumerate(pairs):
            deref(tag).features[idx].field = feat_field
            deref(tag).features[idx].value = feat_value

        self.tags[tag_key] = tag

    cdef str get_morph_str(self, hash_t morph_key):
        """Return the string stored for the analysis `morph_key`, or "" if
        the table has no analysis for that hash.
        """
        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
        if tag:
            return self.strings[deref(tag).key]
        return ""

    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
        """Return the analysis stored for `morph_key` (an empty shared
        pointer if there is none).
        """
        return self._lookup_tag(morph_key)

    cdef str _normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        if isinstance(features, str):
            feat_dict = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            feat_dict = self._dict_to_normalized_feat_dict(features)
        else:
            # Any other type is reported and treated as "no features".
            warnings.warn(Warnings.W100.format(feature=features))
            feat_dict = {}
        return self._normalized_feat_dict_to_str(feat_dict)

    def add(self, features):
        """Python-level entry point for `_add`.

        features (Union[dict, str]): Features as dict or FEATS string.
        RETURNS: The hash of the analysis.
        """
        return self._add(features)

    def get(self, morph_key):
        """Python-level entry point for `get_morph_str`.

        morph_key: Hash of the analysis.
        RETURNS (str): The stored string, or "" for an unknown hash.
        """
        return self.get_morph_str(morph_key)

    def normalize_features(self, features):
        """Python-level entry point for `_normalize_features`.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        return self._normalize_features(features)

    @staticmethod
    def feats_to_dict(feats, *, sort_values=True):
        """Split a FEATS string into a `{field: values}` dict.

        feats (str): Features joined by `FEATURE_SEP`, each `field=value(s)`.
        sort_values (bool): Whether to sort the values of each field.
        RETURNS (dict): Maps each field to its values as a single string.
            Empty for a falsy input and for `EMPTY_MORPH`.
        """
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}

        feats_dict = {}
        for feat in feats.split(Morphology.FEATURE_SEP):
            field, values = feat.split(Morphology.FIELD_SEP, 1)
            if sort_values:
                values = Morphology.VALUE_SEP.join(
                    sorted(values.split(Morphology.VALUE_SEP))
                )
            feats_dict[field] = values
        return feats_dict

    @staticmethod
    def dict_to_feats(feats_dict):
        """Join a `{field: values}` dict into a FEATS string.

        feats_dict (dict): Maps each field to its values as a single string.
        RETURNS (str): `field=values` entries with sorted values, themselves
            sorted and joined by `FEATURE_SEP`; "" for an empty dict.
        """
        if len(feats_dict) == 0:
            return ""

        entries = []
        for field, values in feats_dict.items():
            sorted_values = sorted(values.split(Morphology.VALUE_SEP))
            entries.append(
                Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted_values)])
            )
        entries.sort()
        return Morphology.FEATURE_SEP.join(entries)
|
|
|
|
|
|
|
|
|
2022-07-15 12:14:08 +03:00
|
|
|
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
    # Report whether any feature of `morph` has a value equal to `feature`.
    # Returns 1 (True) on the first match and 0 (False) if there is none.
    cdef int idx
    cdef int found = False
    for idx in range(deref(morph).features.size()):
        if deref(morph).features[idx].value == feature:
            found = True
            break
    return found
|
|
|
|
|
|
|
|
|
2022-07-15 12:14:08 +03:00
|
|
|
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
    """Return the `value` of every feature of `morph`, in stored order."""
    cdef int idx
    cdef list values = []
    for idx in range(deref(morph).features.size()):
        values.append(deref(morph).features[idx].value)
    return values
|
|
|
|
|
|
|
|
|
2022-07-15 12:14:08 +03:00
|
|
|
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
    """Return a uint64 array with the values of the features of `morph`
    whose field equals `field`.
    """
    # Allocate room for the worst case: every feature belongs to `field`.
    n_features = deref(morph).features.size()
    cdef np.ndarray buffer = numpy.zeros((n_features,), dtype="uint64")
    n_found = get_n_by_field(<uint64_t*>buffer.data, morph, field)
    # Only the first `n_found` slots were written.
    return buffer[:n_found]
|
|
|
|
|
|
|
|
|
2022-07-15 12:14:08 +03:00
|
|
|
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
    # Copy the value of every feature of `morph` whose field equals `field`
    # into `results`, in stored order, and return how many were written.
    # No bounds check is done on `results` here; the caller supplies a buffer
    # with one slot per feature of `morph`.
    cdef int count = 0
    cdef int idx
    for idx in range(deref(morph).features.size()):
        if deref(morph).features[idx].field != field:
            continue
        results[count] = deref(morph).features[idx].value
        count += 1
    return count
|
2020-08-07 16:27:13 +03:00
|
|
|
|
2023-07-19 13:03:31 +03:00
|
|
|
|
2020-08-07 16:27:13 +03:00
|
|
|
def unpickle_morphology(strings, tags):
    """Rebuild a Morphology from its pickled state.

    strings (StringStore): String store to create the table with.
    tags (Iterable): Analyses to add to the new table, one `add` call each.
    RETURNS (Morphology): The populated morphology table.
    """
    cdef Morphology restored = Morphology(strings)
    for feats in tags:
        restored.add(feats)
    return restored
|