Refactor morphologizer class map

2025-12-23 18:13:13 +03:00 · 2019-03-09 20:55:33 +01:00 · 2019-03-09 20:55:33 +01:00 · 41a3016019
commit 41a3016019
parent f742900f83
4 changed files with 58 additions and 67 deletions
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -20,6 +20,7 @@ cdef class Morphology:
    cdef readonly object tag_names
    cdef readonly object reverse_index
    cdef readonly object exc
    cdef readonly object _feat_map
    cdef readonly PreshMapArray _cache
    cdef readonly int n_tags
@ -36,6 +37,5 @@ cdef class Morphology:
 cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
 cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
 cdef list list_features(const MorphAnalysisC* tag)
 cdef int attribute_to_field(unicode attribute)
 cdef tag_to_json(const MorphAnalysisC* tag)
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -93,26 +93,34 @@ def _normalize_props(props):
    return out
-def parse_feature(feature):
+class MorphologyClassMap(object):
-    field = FEATURE_FIELDS[feature]
+    def __init__(self, features, fields):
-    offset = FEATURE_OFFSETS[feature]
+        self.features = tuple(features)
-    return (field, offset)
+        self.fields = tuple(fields)
        self.id2feat = {get_string_id(name): name for name in features}
        self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features}
        self.field2feats = {}
        self.col2info = []
        self.attr2field = dict(LOWER_FIELDS.items())
        for feature in features:
            field = self.feat2field[feature]
            if field not in self.field2feats:
                self.col2info.append((field, 0, 'NIL'))
            self.field2feats.setdefault(field, []).append(feature)
            self.col2info.append((field, len(self.field2feats[field]), feature))
    @property
    def field_sizes(self):
        """Return, for each entry of self.fields in order, the number of
        features grouped under it in self.field2feats.

        The counts cover features only: the 'NIL' row that __init__ appends
        to self.col2info for each field is not stored in field2feats, so it
        is not included here.
        """
        # NOTE(review): __init__ sets self.fields = tuple(fields) but keys
        # field2feats by fields[<feature prefix>]. If `fields` is a mapping,
        # the tuple holds its keys while field2feats is keyed by its values;
        # confirm the two agree, otherwise this lookup raises KeyError.
        return [len(self.field2feats[field]) for field in self.fields]
-cdef int attribute_to_field(unicode attribute_name):
+    def get_field_offset(self, field):
-    return LOWER_FIELDS[attribute_name]
+        n = 0
-
+        for f in self.fields:
-
+            if f == field:
-def get_field_id(feature):
+                return n
-    return FEATURE_FIELDS[feature]
+            n += len(self.field2feats[f])
-
+        else:
-
+            return -1
 def get_field_size(field):
    """Return the FIELD_SIZES entry for `field`, after translating `field`
    through the module-level FIELDS table."""
    return FIELD_SIZES[FIELDS[field]]
 def get_field_offset(field):
    """Return the FIELD_OFFSETS entry for `field`, after translating `field`
    through the module-level FIELDS table."""
    return FIELD_OFFSETS[FIELDS[field]]
 cdef class Morphology:
@ -139,9 +147,11 @@ cdef class Morphology:
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
        self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            attrs = _normalize_props(attrs)
-            self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
+            self.add({self._feat_map.id2feat[feat] for feat in attrs
                      if feat in self._feat_map.id2feat})
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i
@ -167,7 +177,7 @@ cdef class Morphology:
        features = intify_features(features)
        cdef attr_t feature
        for feature in features:
-            if feature != 0 and feature not in FEATURE_NAMES:
+            if feature != 0 and feature not in self._feat_map.id2feat:
                raise KeyError("Unknown feature: %s" % self.strings[feature])
        cdef MorphAnalysisC tag
        tag = create_rich_tag(features)
@ -187,7 +197,7 @@ cdef class Morphology:
        features = intify_features(features)
        cdef attr_t feature
        for feature in features:
-            field = get_field_id(feature)
+            field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
            set_feature(&tag, field, feature, 1)
        morph = self.insert(tag)
        return morph
@ -224,7 +234,8 @@ cdef class Morphology:
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
-        self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
+        self.add({self._feat_map.id2feat[feat] for feat in attrs
                 if feat in self._feat_map.id2feat})
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self.exc[(tag_str, self.strings.add(orth_str))] = attrs
@ -313,6 +324,10 @@ cdef class Morphology:
    def from_disk(self, path):
        raise NotImplementedError
    @classmethod
    def create_class_map(cls):
        """Build and return a new MorphologyClassMap from the module-level
        FEATURES and FIELDS tables.

        `cls` is not used; every call constructs a fresh map.
        """
        return MorphologyClassMap(FEATURES, FIELDS)
 cpdef univ_pos_t get_int_tag(pos_):
    """Return 0 cast to univ_pos_t; the `pos_` argument is not consulted."""
    # NOTE(review): looks like a placeholder implementation -- confirm
    # whether callers rely on a real tag lookup here.
    return <univ_pos_t>0
@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
    return mrmr.hash64(&tag, sizeof(tag), 0)
 def get_feature_field(feature):
    """Return the FEATURE_FIELDS entry stored under `feature`."""
    # NOTE(review): `key` is computed from the feature's string ID but is
    # never read; the lookup below indexes by `feature` itself.
    cdef attr_t key = get_string_id(feature)
    return FEATURE_FIELDS[feature]
 cdef MorphAnalysisC create_rich_tag(features) except *:
    cdef MorphAnalysisC tag
    cdef attr_t feature
    memset(&tag, 0, sizeof(tag))
    for feature in features:
-        field = get_field_id(feature)
+        field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
        set_feature(&tag, field, feature, 1)
    return tag
@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
    elif field == Field_VerbType:
        return tag.verb_type
    else:
-        raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
+        raise ValueError("Unknown field: (%d)" % field_id)
 cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
@ -1090,22 +1099,5 @@ FEATURES = [
   "Voice_int",
 ]
-FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
+FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
-FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
+FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
 FIELD_SIZES = Counter(FEATURE_FIELDS.values())
 for field in FIELD_SIZES:
    FIELD_SIZES[field] += 1
 for feat_id, name in FEATURE_NAMES.items():
    FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
 # Mapping of feature names to their position in total vector
 FEATURE_OFFSETS = {}
 # Mapping of field names to their first position in total vector.
 FIELD_OFFSETS = {}
 _seen_fields = Counter()
 for i, feature in enumerate(FEATURES):
    field = FEATURE_FIELDS[feature]
    # Add 1 for the NIL class, on each field
    FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
    if _seen_fields[field] == 0:
        FIELD_OFFSETS[field] = i
    _seen_fields[field] += 1 
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -16,26 +16,24 @@ from ..compat import basestring_
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
 from ..morphology cimport Morphology
 from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
 from ..morphology import FEATURES
 class Morphologizer(Pipe):
    name = 'morphologizer'
    @classmethod
-    def Model(cls, attr_nums=None, **cfg):
+    def Model(cls, **cfg):
        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
            raise ValueError(TempErrors.T008)
-        if attr_nums is None:
+        class_map = Morphology.create_class_map()
-            attr_nums = [get_field_size(name) for name in FIELDS]
+        return build_morphologizer_model(class_map.field_sizes, **cfg)
        return build_morphologizer_model(attr_nums, **cfg)
    def __init__(self, vocab, model=True, **cfg):
        """Set up the component's vocab, model, config and class map.

        vocab: stored as self.vocab; its `morphology` attribute supplies
            the class map.
        model: stored unchanged as self.model (defaults to True).
        **cfg: configuration values, kept in an OrderedDict sorted by key.
        """
        self.vocab = vocab
        self.model = model
        # Sorting the items gives the config a stable key order.
        self.cfg = OrderedDict(sorted(cfg.items()))
        # Only fills in the value when the caller did not pass one.
        self.cfg.setdefault('cnn_maxout_pieces', 2)
        self._class_map = self.vocab.morphology.create_class_map()
    @property
    def labels(self):
@ -76,13 +74,13 @@ class Morphologizer(Pipe):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
-        field_names = list(FIELDS)
+        offsets = [self._class_map.get_field_offset(field)
-        offsets = [get_field_offset(field) for field in field_names]
+                   for field in self._class_map.fields]
        for i, doc in enumerate(docs):
            doc_scores = batch_scores[i]
            doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
            # Convert the neuron indices into feature IDs.
-            doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
+            doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
            for j in range(len(doc)):
                for k, offset in enumerate(offsets):
                    if doc_guesses[j, k] == 0:
@ -90,7 +88,8 @@ class Morphologizer(Pipe):
                    else:
                        doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
                # Get the set of feature names.
-                feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
+                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
                         if f != 0}
                # Now add the analysis, and set the hash.
                try:
                    doc.c[j].morph = self.vocab.morphology.add(feats)
@ -132,14 +131,15 @@ class Morphologizer(Pipe):
                if features is None:
                    target[idx] = scores[idx]
                else:
-                    by_field = {}
+                    gold_fields = {}
                    for feature in features:
-                        field, column = parse_feature(feature)
+                        field = self.get_field(feature)
-                        by_field[field] = column
+                        column = self.get_column(feature)
                        gold_fields[field] = column
                    col_offset = 0
                    for field, field_size in enumerate(field_sizes):
-                        if field in by_field:
+                        if field in gold_fields:
-                            target[idx, col_offset + by_field[field]] = 1.
+                            target[idx, col_offset + gold_fields[field]] = 1.
                        else:
                            target[idx, col_offset] = 1.
                        col_offset += field_size
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@ -3,7 +3,6 @@ from libc.string cimport memset
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_field, tag_to_json
 from ..morphology cimport attribute_to_field
 from ..strings import get_string_id
@ -53,7 +52,7 @@ cdef class MorphAnalysis:
        return self.key
    def get(self, unicode field):
-        cdef int field_id = attribute_to_field(field)
+        cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
        return self.vocab.strings[get_field(&self.c, field_id)]
    def to_json(self):