mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	* `Morphology`: Refactor to use C types, reduce allocations, remove unused code * `Morphologizer`: Avoid unnecessary sorting of morpho features * `Morphologizer`: Remove excessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints Update docs * Remove unused method * Replace `unique_ptr` usage with `shared_ptr` * Add type annotations to internal Python methods, rename `hash` variable, fix typos * Add comment to clarify implementation detail * Fix return type * `Morphology`: Stop early when splitting fields and values
		
			
				
	
	
		
			252 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			252 lines
		
	
	
		
			9.3 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
| # cython: infer_types
 | |
| import numpy
 | |
| import warnings
 | |
| from typing import Union, Tuple, List, Dict, Optional
 | |
| from cython.operator cimport dereference as deref
 | |
| from libcpp.memory cimport shared_ptr
 | |
| 
 | |
| from .errors import Warnings
 | |
| from . import symbols
 | |
| 
 | |
| 
 | |
cdef class Morphology:
    """Store the possible morphological analyses for a language, and index them
    by hash.

    To save space on each token, tokens only know the hash of their
    morphological analysis, so queries of morphological attributes are delegated
    to this class.

    Analyses are written in the UD FEATS format, e.g.
    "Field1=Value1,Value2|Field2=Value3": features are separated by
    FEATURE_SEP, a field and its values by FIELD_SEP, and multiple values of
    one field by VALUE_SEP.
    """
    FEATURE_SEP = "|"
    FIELD_SEP = "="
    VALUE_SEP = ","
    # not an empty string so we can distinguish unset morph from empty morph
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
        """Create an empty morphology table.

        strings (StringStore): The string store used to map between feature
            strings and their hashes.
        """
        self.strings = strings

    def __reduce__(self):
        """Support pickling.

        The pickled state is the string store plus the sorted strings of all
        analyses registered in this table; `unpickle_morphology` re-adds them.
        """
        # `get` returns "" for every string in the store that is not a
        # registered analysis, so the empty string is dropped afterwards.
        tags = {self.get(self.strings[s]) for s in self.strings}
        tags.discard("")
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
        """Return the analysis stored under `tag_hash`, or an empty (null)
        shared pointer if there is none.
        """
        match = self.tags.find(tag_hash)
        if match != self.tags.const_end():
            return deref(match).second
        else:
            return shared_ptr[MorphAnalysisC]()

    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
        """Normalize a single field/value pair.

        attr_key (Union[int, str]): The field.
        attr_value (Union[int, str]): The value(s) of the field.
        RETURNS (Optional[Tuple[str, Union[str, List[str]]]]): The pair with
            both members passed through `StringStore.as_string`. A value
            containing VALUE_SEP is returned as a sorted list of its parts.
            None (after emitting warning W100) if either argument is neither
            an int nor a str.
        """
        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
            attr_key = self.strings.as_string(attr_key)
            attr_value = self.strings.as_string(attr_value)

            # Preserve multiple values as a list
            if self.VALUE_SEP in attr_value:
                values = attr_value.split(self.VALUE_SEP)
                values.sort()
                attr_value = values
        else:
            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
            return None

        return attr_key, attr_value

    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, Union[str, List[str]]]:
        """Parse a FEATS string into a normalized feature dict.

        feats (str): Features in the UD FEATS format.
        RETURNS (Dict[str, Union[str, List[str]]]): Fields mapped to their
            value (or sorted list of values), with the fields inserted in
            sorted order. Empty for an empty string or EMPTY_MORPH.

        A feature without FIELD_SEP makes the tuple unpacking below raise a
        ValueError.
        """
        if not feats or feats == self.EMPTY_MORPH:
            return {}

        out = []
        for feat in feats.split(self.FEATURE_SEP):
            # Split only on the first separator so the values are kept intact.
            field, values = feat.split(self.FIELD_SEP, 1)
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            # `_normalize_attr` already returns a (field, values) tuple.
            out.append(normalized_attr)
        # The dict preserves insertion order, so sorting by field here fixes
        # the field order of the normalized string built from it.
        out.sort(key=lambda x: x[0])
        return dict(out)

    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, Union[str, List[str]]]:
        """Normalize a feature dict.

        feats (Dict[Union[int, str], Union[int, str]]): Fields mapped to
            their value(s).
        RETURNS (Dict[str, Union[str, List[str]]]): Fields mapped to their
            value (or sorted list of values), with the fields inserted in
            sorted order. Pairs rejected by `_normalize_attr` are skipped.
        """
        out = []
        for field, values in feats.items():
            normalized_attr = self._normalize_attr(field, values)
            if normalized_attr is None:
                continue
            # `_normalize_attr` already returns a (field, values) tuple.
            out.append(normalized_attr)
        # The dict preserves insertion order, so sorting by field here fixes
        # the field order of the normalized string built from it.
        out.sort(key=lambda x: x[0])
        return dict(out)

    def _normalized_feat_dict_to_str(self, feats: Dict[str, Union[str, List[str]]]) -> str:
        """Serialize a normalized feature dict to a FEATS string.

        feats (Dict[str, Union[str, List[str]]]): A normalized feature dict.
        RETURNS (str): The features joined in the dict's iteration order, or
            EMPTY_MORPH if the dict is empty.
        """
        norm_feats_string = self.FEATURE_SEP.join([
            self.FIELD_SEP.join([
                field,
                self.VALUE_SEP.join(values) if isinstance(values, list) else values,
            ])
            for field, values in feats.items()
        ])
        return norm_feats_string or self.EMPTY_MORPH

    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.
        Returns the hash of the new analysis.

        Any other input type triggers warning W100 and is treated as an empty
        analysis.
        """
        cdef hash_t tag_hash = 0
        cdef shared_ptr[MorphAnalysisC] tag
        if isinstance(features, str):
            if features == "":
                features = self.EMPTY_MORPH

            # Fast path: the string as given is already a registered analysis,
            # so normalization can be skipped entirely.
            tag_hash = self.strings[features]
            tag = self._lookup_tag(tag_hash)
            if tag:
                return deref(tag).key

            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}

        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder
        norm_feats_string = self._normalized_feat_dict_to_str(features)
        tag_hash = self.strings.add(norm_feats_string)
        tag = self._lookup_tag(tag_hash)
        if tag:
            return deref(tag).key

        self._intern_morph_tag(tag_hash, features)
        return tag_hash

    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
        """Build the C-level analysis for a normalized feature dict and store
        it in the table under `tag_key`.
        """
        # intified ("Field", "Field=Value") pairs where fields with multiple values have
        # been split into individual tuples, e.g.:
        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
        # ("Field2", "Field2=Value3")]
        field_feature_pairs = []

        # Feat dict is normalized at this point.
        for field, values in feats.items():
            field_key = self.strings.add(field)
            if isinstance(values, list):
                for value in values:
                    value_key = self.strings.add(field + self.FIELD_SEP + value)
                    field_feature_pairs.append((field_key, value_key))
            else:
                # We could box scalar values into a list and use a common
                # code path to generate features but that incurs a small
                # but measurable allocation/iteration overhead (as this
                # branch is taken often enough).
                value_key = self.strings.add(field + self.FIELD_SEP + values)
                field_feature_pairs.append((field_key, value_key))

        num_features = len(field_feature_pairs)
        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
        deref(tag).key = tag_key
        # Size the feature vector once up front, then fill it by index.
        deref(tag).features.resize(num_features)

        for i in range(num_features):
            deref(tag).features[i].field = field_feature_pairs[i][0]
            deref(tag).features[i].value = field_feature_pairs[i][1]

        self.tags[tag_key] = tag

    cdef str get_morph_str(self, hash_t morph_key):
        """Return the string of the analysis stored under `morph_key`, or an
        empty string if no such analysis is registered.
        """
        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
        if not tag:
            return ""
        return self.strings[deref(tag).key]

    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
        """Return the analysis stored under `morph_key`, or an empty (null)
        shared pointer if there is none.
        """
        return self._lookup_tag(morph_key)

    cdef str _normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        if isinstance(features, str):
            features = self._str_to_normalized_feat_dict(features)
        elif isinstance(features, dict):
            features = self._dict_to_normalized_feat_dict(features)
        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}

        return self._normalized_feat_dict_to_str(features)

    def add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS: The hash of the analysis.
        """
        return self._add(features)

    def get(self, morph_key):
        """Look up the string of a registered analysis.

        morph_key: The hash of the analysis.
        RETURNS (str): The analysis string, or an empty string if no analysis
            is registered under `morph_key`.
        """
        return self.get_morph_str(morph_key)

    def normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        return self._normalize_features(features)

    @staticmethod
    def feats_to_dict(feats, *, sort_values=True):
        """Convert a FEATS string to a dict of fields to value strings.

        feats (str): Features in the UD FEATS format.
        sort_values (bool): Whether to sort the values of each field.
        RETURNS (dict): Fields mapped to their values as a single string
            joined by VALUE_SEP. Empty for an empty string or EMPTY_MORPH.
        """
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}

        out = {}
        for feat in feats.split(Morphology.FEATURE_SEP):
            # Split only on the first separator so the values are kept intact.
            field, values = feat.split(Morphology.FIELD_SEP, 1)
            if sort_values:
                values = values.split(Morphology.VALUE_SEP)
                values.sort()
                values = Morphology.VALUE_SEP.join(values)

            out[field] = values
        return out

    @staticmethod
    def dict_to_feats(feats_dict):
        """Convert a dict of fields to value strings into a FEATS string.

        feats_dict (dict): Fields mapped to their values as a single string
            joined by VALUE_SEP.
        RETURNS (str): The features sorted and joined by FEATURE_SEP, with the
            values of each field sorted. An empty string (not EMPTY_MORPH) for
            an empty dict.
        """
        if not feats_dict:
            return ""
        return Morphology.FEATURE_SEP.join(sorted([
            Morphology.FIELD_SEP.join([
                field,
                Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))),
            ])
            for field, values in feats_dict.items()
        ]))
 | |
| 
 | |
| 
 | |
cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
    # Report whether any feature of `morph` has `feature` as its value ID.
    # Returns True (1) on the first match and False (0) if nothing matches.
    cdef int n_feats = deref(morph).features.size()
    cdef int idx = 0
    while idx < n_feats:
        if deref(morph).features[idx].value == feature:
            return True
        idx += 1
    return False
 | |
| 
 | |
| 
 | |
cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
    """Return the value IDs of all features of `morph` as a Python list, in
    the order in which they are stored.
    """
    cdef int idx
    cdef int n_feats = deref(morph).features.size()
    cdef list values = []
    for idx in range(n_feats):
        values.append(deref(morph).features[idx].value)
    return values
 | |
| 
 | |
| 
 | |
cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
    """Return the value IDs of the features of `morph` that belong to `field`
    as a uint64 array.
    """
    # Allocate room for the worst case (every feature matches); the slice
    # below trims the array to the number of values actually written.
    capacity = deref(morph).features.size()
    cdef np.ndarray matches = numpy.zeros((capacity,), dtype="uint64")
    n_matches = get_n_by_field(<uint64_t*>matches.data, morph, field)
    return matches[:n_matches]
 | |
| 
 | |
| 
 | |
cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
    # Copy the value IDs of all features of `morph` whose field equals `field`
    # into `results`, in storage order, and return how many were written.
    # At most `features.size()` entries are written, so `results` needs room
    # for that many.
    cdef int n_feats = deref(morph).features.size()
    cdef int n_matches = 0
    cdef int idx = 0
    while idx < n_feats:
        if deref(morph).features[idx].field == field:
            results[n_matches] = deref(morph).features[idx].value
            n_matches += 1
        idx += 1
    return n_matches
 | |
| 
 | |
def unpickle_morphology(strings, tags):
    """Rebuild a `Morphology` from its pickled state.

    strings: The string store to construct the morphology with.
    tags: Iterable of analyses to register via `Morphology.add`.
    RETURNS (Morphology): The morphology with every tag added.
    """
    cdef Morphology restored = Morphology(strings)
    for analysis in tags:
        restored.add(analysis)
    return restored
 |