2016-12-18 19:35:22 +03:00
|
|
|
# cython: infer_types
|
2016-12-18 16:59:44 +03:00
|
|
|
from libc.string cimport memset
|
2020-03-02 13:48:10 +03:00
|
|
|
|
2019-03-07 03:17:19 +03:00
|
|
|
import srsly
|
2019-03-07 19:14:57 +03:00
|
|
|
from collections import Counter
|
2020-01-24 00:01:54 +03:00
|
|
|
import numpy
|
2020-02-28 14:20:23 +03:00
|
|
|
import warnings
|
2016-12-18 16:59:44 +03:00
|
|
|
|
2016-12-07 23:12:49 +03:00
|
|
|
from .attrs cimport POS, IS_SPACE
|
2017-10-27 22:07:59 +03:00
|
|
|
from .parts_of_speech cimport SPACE
|
2016-01-19 05:36:51 +03:00
|
|
|
from .lexeme cimport Lexeme
|
2019-03-07 20:32:36 +03:00
|
|
|
|
2020-03-02 13:48:10 +03:00
|
|
|
from .strings import get_string_id
|
|
|
|
from .attrs import LEMMA, intify_attrs
|
|
|
|
from .parts_of_speech import IDS as POS_IDS
|
2020-02-28 14:20:23 +03:00
|
|
|
from .errors import Errors, Warnings
|
2019-08-29 22:17:34 +03:00
|
|
|
from .util import ensure_path
|
2020-03-02 13:48:10 +03:00
|
|
|
from . import symbols
|
2015-08-28 00:11:51 +03:00
|
|
|
|
2018-09-25 00:57:41 +03:00
|
|
|
|
2015-08-26 20:17:21 +03:00
|
|
|
cdef class Morphology:
    """Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their
    morphological analysis, so queries of morphological attributes are delegated
    to this class.
    """
    FEATURE_SEP = "|"
    FIELD_SEP = "="
    VALUE_SEP = ","
    # not an empty string so that the PreshMap key is not 0
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
        """Create an empty morphology table.

        strings (StringStore): The string store used to intern field,
            feature and analysis strings.
        """
        self.mem = Pool()
        self.strings = strings
        self.tags = PreshMap()

    def __reduce__(self):
        """Pickle support: collect the analysis strings that are present in
        the table so that `unpickle_morphology` can re-add them.
        """
        # `get` returns "" for hashes that are not in the table, so every
        # string in the store that is not an analysis collapses to "".
        tags = {self.get(self.strings[s]) for s in self.strings}
        tags.discard("")
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.

        features (Union[dict, str]): The analysis to add. Any other type
            triggers warning W100 and is treated as an empty analysis.
        RETURNS: The hash of the analysis.
        """
        cdef MorphAnalysisC* tag_ptr
        if isinstance(features, str):
            # Empty analyses are stored under the hash of EMPTY_MORPH, never
            # under the hash of "", so the lookup has to use the placeholder
            # for the fast path to find an existing entry.
            if features == "":
                features = self.EMPTY_MORPH
            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
            if tag_ptr != NULL:
                return tag_ptr.key
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        string_features = {
            self.strings.as_string(field): self.strings.as_string(values)
            for field, values in features.items()
        }
        # intified ("Field", "Field=Value") pairs
        field_feature_pairs = []
        for field in sorted(string_features):
            values = string_features[field]
            for value in values.split(self.VALUE_SEP):
                field_feature_pairs.append((
                    self.strings.add(field),
                    self.strings.add(field + self.FIELD_SEP + value),
                ))
        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
        # the hash key for the tag is the hash of the normalized UFEATS
        # string; normalize_features returns the EMPTY_MORPH placeholder for
        # an empty analysis (using the empty string would give a hash key of
        # 0, which is not good for PreshMap)
        norm_feats_string = self.normalize_features(features)
        tag.key = self.strings.add(norm_feats_string)
        self.insert(tag)
        return tag.key

    def normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string, or EMPTY_MORPH
            if there are no features.
        """
        if isinstance(features, str):
            features = self.feats_to_dict(features)
        if not isinstance(features, dict):
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
        features = self.normalize_attrs(features)
        string_features = {
            self.strings.as_string(field): self.strings.as_string(values)
            for field, values in features.items()
        }
        # normalized UFEATS string with sorted fields and values
        norm_feats_string = self.FEATURE_SEP.join(sorted([
            self.FIELD_SEP.join([field, values])
            for field, values in string_features.items()
        ]))
        return norm_feats_string or self.EMPTY_MORPH

    def normalize_attrs(self, attrs):
        """Convert attrs dict so that POS is always by ID, other features are
        by string. Values separated by VALUE_SEP are sorted.

        attrs (dict): Mapping of field (int or str) to value (int or str).
        RETURNS (dict): The normalized mapping. Entries that cannot be
            normalized are dropped with warning W100.
        """
        out = {}
        attrs = dict(attrs)
        for key, value in attrs.items():
            # convert POS value to ID
            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
                if isinstance(value, str) and value.upper() in POS_IDS:
                    value = POS_IDS[value.upper()]
                elif isinstance(value, int) and value not in POS_IDS.values():
                    warnings.warn(Warnings.W100.format(feature={key: value}))
                    continue
                # any value that matched neither branch is stored unchanged
                out[POS] = value
            # accept any string or ID fields and values and convert to strings
            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
                key = self.strings.as_string(key)
                value = self.strings.as_string(value)
                # sort values
                if self.VALUE_SEP in value:
                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
                out[key] = value
            else:
                warnings.warn(Warnings.W100.format(feature={key: value}))
        return out

    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
        """Creates a MorphAnalysisC from a list of intified
        ("Field", "Field=Value") tuples where fields with multiple values have
        been split into individual tuples, e.g.:
        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
        ("Field2", "Field2=Value3")]

        The `fields` and `features` arrays are allocated from `self.mem`.
        `key` is not set here; the caller assigns it.
        """
        cdef MorphAnalysisC tag
        tag.length = len(field_feature_pairs)
        tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
        tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
        for i, (field, feature) in enumerate(field_feature_pairs):
            tag.fields[i] = field
            tag.features[i] = feature
        return tag

    cdef int insert(self, MorphAnalysisC tag) except -1:
        """Store a copy of `tag` in the table under `tag.key`, unless an entry
        with that key already exists.
        """
        cdef hash_t key = tag.key
        if self.tags.get(key) == NULL:
            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            tag_ptr[0] = tag
            self.tags.set(key, <void*>tag_ptr)

    def get(self, hash_t morph):
        """Look up an analysis by hash.

        morph (hash_t): The hash of the analysis.
        RETURNS (str): The string stored for the analysis key, or "" if the
            hash is not in the table.
        """
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
            return ""
        else:
            return self.strings[tag.key]

    @staticmethod
    def feats_to_dict(feats):
        """Split a FEATS string into a dict of field to value string.

        feats (str): FEATS string, e.g. "Field1=Value1,Value2|Field2=Value3".
        RETURNS (dict): Field to VALUE_SEP-joined, sorted values. Empty for
            a falsy input or the EMPTY_MORPH placeholder.
        """
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
        return {
            field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))
            for field, values in [
                feat.split(Morphology.FIELD_SEP)
                for feat in feats.split(Morphology.FEATURE_SEP)
            ]
        }

    @staticmethod
    def dict_to_feats(feats_dict):
        """Join a dict of field to value string into a FEATS string with
        sorted fields and sorted values.

        feats_dict (dict): Field to VALUE_SEP-joined values.
        RETURNS (str): The FEATS string, or "" for an empty dict.
        """
        if len(feats_dict) == 0:
            return ""
        return Morphology.FEATURE_SEP.join(sorted([
            Morphology.FIELD_SEP.join([
                field,
                Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))),
            ])
            for field, values in feats_dict.items()
        ]))
|
|
|
|
|
|
|
|
|
|
|
|
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
    # Scan the analysis' feature IDs and report whether `feature` is one of
    # them: True as soon as a match is seen, False if none is found.
    cdef int idx
    cdef int found = False
    for idx in range(morph.length):
        if morph.features[idx] == feature:
            found = True
            break
    return found
|
|
|
|
|
|
|
|
|
|
|
|
cdef list list_features(const MorphAnalysisC* morph):
    # Copy the `morph.length` entries of the C `features` array into a
    # Python list, preserving their order.
    return [morph.features[idx] for idx in range(morph.length)]
|
|
|
|
|
|
|
|
|
|
|
|
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
    # Allocate room for the worst case (every entry belongs to `field`), let
    # get_n_by_field fill the front of the buffer, then trim to what was
    # actually written.
    cdef np.ndarray matches = numpy.zeros((morph.length,), dtype="uint64")
    n_matches = get_n_by_field(<uint64_t*>matches.data, morph, field)
    return matches[:n_matches]
|
|
|
|
|
|
|
|
|
|
|
|
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
    # Write the features whose field equals `field` into `results`, packed
    # from index 0 in their original order, and return how many were written.
    cdef int count = 0
    cdef int pos
    for pos in range(morph.length):
        if morph.fields[pos] != field:
            continue
        results[count] = morph.features[pos]
        count += 1
    return count
|
2020-08-07 16:27:13 +03:00
|
|
|
|
|
|
|
def unpickle_morphology(strings, tags):
    """Rebuild a Morphology from a string store and a collection of analyses.

    strings: The string store passed to the Morphology constructor.
    tags: Iterable of analyses; each one is passed to `Morphology.add`.
    RETURNS (Morphology): The populated morphology table.
    """
    cdef Morphology restored = Morphology(strings)
    for analysis in tags:
        restored.add(analysis)
    return restored
|