Refactor morphologizer class map

This commit is contained in:
Matthew Honnibal 2019-03-09 20:55:33 +01:00
parent f742900f83
commit 41a3016019
4 changed files with 58 additions and 67 deletions

View File

@ -20,6 +20,7 @@ cdef class Morphology:
cdef readonly object tag_names
cdef readonly object reverse_index
cdef readonly object exc
cdef readonly object _feat_map
cdef readonly PreshMapArray _cache
cdef readonly int n_tags
@ -36,6 +37,5 @@ cdef class Morphology:
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
cdef list list_features(const MorphAnalysisC* tag)
cdef int attribute_to_field(unicode attribute)
cdef tag_to_json(const MorphAnalysisC* tag)

View File

@ -93,26 +93,34 @@ def _normalize_props(props):
return out
def parse_feature(feature):
    """Return the `(field, offset)` pair recorded for `feature`.

    The field comes from the module-level FEATURE_FIELDS table and the
    offset from FEATURE_OFFSETS; both are indexed directly by `feature`.
    """
    return FEATURE_FIELDS[feature], FEATURE_OFFSETS[feature]
class MorphologyClassMap(object):
    """Lookup tables relating morphological features, fields and columns.

    Attributes built by the constructor:

    * features: tuple of the feature names, in the order given.
    * fields: tuple of the field names (the keys of `fields`), in order.
    * id2feat: get_string_id(feature) -> feature name.
    * feat2field: feature name -> fields[<field name>], where the field
      name is the part of the feature name before the first underscore.
    * field2feats: field name -> list of that field's feature names, in
      the order given. Every name in `fields` has an entry, possibly empty.
    * col2info: one `(field, offset, feature)` tuple per column, where
      `field` is the `feat2field` value. Each field that has features
      contributes a `(field, 0, 'NIL')` column followed by one column per
      feature, with offsets counted from 1.
    * attr2field: a copy of the module-level LOWER_FIELDS mapping.
    """

    def __init__(self, features, fields):
        """Build the tables.

        features: iterable of feature names of the form "<Field>_<value>".
        fields: mapping from field name to the value recorded for that
            field in `feat2field` and `col2info`.

        Raises KeyError if the prefix of a feature name is not a key of
        `fields`.
        """
        # Iterate the stored tuple from here on, so that a one-shot iterable
        # passed as `features` is not consumed by the first pass.
        self.features = tuple(features)
        self.fields = tuple(fields)
        self.id2feat = {get_string_id(name): name for name in self.features}
        self.attr2field = dict(LOWER_FIELDS.items())
        self.feat2field = {}
        # Keyed by field *name*, i.e. by the same values that `self.fields`
        # holds. `field_sizes` walks `self.fields` and indexes this dict with
        # each entry, so keying it by `fields[name]` instead would only work
        # when the mapping's values happen to equal its keys.
        self.field2feats = {name: [] for name in self.fields}
        self.col2info = []
        for feature in self.features:
            name = feature.split('_', 1)[0]
            field = fields[name]
            self.feat2field[feature] = field
            members = self.field2feats[name]
            if not members:
                # First feature of this field: open with its NIL column.
                self.col2info.append((field, 0, 'NIL'))
            members.append(feature)
            self.col2info.append((field, len(members), feature))

    @property
    def field_sizes(self):
        """Number of columns per field, in the order of `self.fields`.

        A field with features has one column per feature plus the NIL
        column that `col2info` records for it, so the sizes sum to
        `len(self.col2info)`. A field without features has size 0.
        """
        sizes = []
        for name in self.fields:
            n_feats = len(self.field2feats[name])
            sizes.append(n_feats + 1 if n_feats else 0)
        return sizes
# Translate an attribute name into a field by looking it up in the
# module-level LOWER_FIELDS table; the looked-up value is returned as a C int.
cdef int attribute_to_field(unicode attribute_name):
return LOWER_FIELDS[attribute_name]
def get_field_id(feature):
    """Return the FEATURE_FIELDS entry stored under `feature`."""
    field_id = FEATURE_FIELDS[feature]
    return field_id
def get_field_size(field):
    """Return the FIELD_SIZES entry for the field named `field`.

    The name is first translated through FIELDS; the translated value is
    the key used in FIELD_SIZES.
    """
    field_id = FIELDS[field]
    return FIELD_SIZES[field_id]
def get_field_offset(field):
    """Return the FIELD_OFFSETS entry for the field named `field`.

    The name is first translated through FIELDS; the translated value is
    the key used in FIELD_OFFSETS.
    """
    field_id = FIELDS[field]
    return FIELD_OFFSETS[field_id]
def get_field_offset(self, field):
    """Return how many features belong to the fields listed before `field`.

    Walks `self.fields` in order, adding up `len(self.field2feats[f])` for
    every field `f` that precedes `field`.

    field: an entry of `self.fields`.

    Returns the running total at the point `field` is found (0 for the
    first field), or -1 if `field` is not in `self.fields`.
    """
    preceding = 0
    for name in self.fields:
        if name == field:
            return preceding
        preceding += len(self.field2feats[name])
    # The loop has no `break`, so a `for ... else` clause here would run
    # whenever the loop is exhausted; a plain return states that directly.
    return -1
cdef class Morphology:
@ -139,9 +147,11 @@ cdef class Morphology:
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {}
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
self.add({self._feat_map.id2feat[feat] for feat in attrs
if feat in self._feat_map.id2feat})
self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i
@ -167,7 +177,7 @@ cdef class Morphology:
features = intify_features(features)
cdef attr_t feature
for feature in features:
if feature != 0 and feature not in FEATURE_NAMES:
if feature != 0 and feature not in self._feat_map.id2feat:
raise KeyError("Unknown feature: %s" % self.strings[feature])
cdef MorphAnalysisC tag
tag = create_rich_tag(features)
@ -187,7 +197,7 @@ cdef class Morphology:
features = intify_features(features)
cdef attr_t feature
for feature in features:
field = get_field_id(feature)
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
set_feature(&tag, field, feature, 1)
morph = self.insert(tag)
return morph
@ -224,7 +234,8 @@ cdef class Morphology:
"""
attrs = dict(attrs)
attrs = _normalize_props(attrs)
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES})
self.add({self._feat_map.id2feat[feat] for feat in attrs
if feat in self._feat_map.id2feat})
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.exc[(tag_str, self.strings.add(orth_str))] = attrs
@ -313,6 +324,10 @@ cdef class Morphology:
# Loading from `path` is not implemented: every call raises NotImplementedError.
def from_disk(self, path):
raise NotImplementedError
@classmethod
def create_class_map(cls):
    """Build a new MorphologyClassMap from the module-level FEATURES and FIELDS."""
    class_map = MorphologyClassMap(FEATURES, FIELDS)
    return class_map
# Return a univ_pos_t tag for `pos_`.
# NOTE(review): `pos_` is never read here; the result is always 0 cast to
# univ_pos_t. Confirm whether this is a deliberate stub.
cpdef univ_pos_t get_int_tag(pos_):
return <univ_pos_t>0
@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
return mrmr.hash64(&tag, sizeof(tag), 0)
# Return the FEATURE_FIELDS entry stored under `feature`.
def get_feature_field(feature):
# NOTE(review): `key` is computed but never used below; the lookup is done
# with `feature` itself, not with its string ID.
cdef attr_t key = get_string_id(feature)
return FEATURE_FIELDS[feature]
# Build a MorphAnalysisC from an iterable of feature IDs.
# The struct is zero-filled with memset, then set_feature(&tag, field, feature, 1)
# is called once for each feature; the filled struct is returned by value.
cdef MorphAnalysisC create_rich_tag(features) except *:
cdef MorphAnalysisC tag
cdef attr_t feature
memset(&tag, 0, sizeof(tag))
for feature in features:
# NOTE(review): `field` is assigned twice in a row. Only the second value
# (FEATURE_NAMES maps the ID to a name, FEATURE_FIELDS maps the name to a
# field) reaches set_feature; the get_field_id result is overwritten.
field = get_field_id(feature)
field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
set_feature(&tag, field, feature, 1)
return tag
@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
elif field == Field_VerbType:
return tag.verb_type
else:
raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature))
raise ValueError("Unknown field: (%d)" % field_id)
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
@ -1090,22 +1099,5 @@ FEATURES = [
"Voice_int",
]
# Mapping of feature string IDs (as returned by get_string_id) to feature names.
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES}
# Mapping of feature names to FIELDS[<prefix>], where the prefix is the part
# of the feature name before the first underscore.
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES}
# Number of features per field, then incremented by one for every field
# (presumably for the NIL class mentioned further down -- confirm).
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
for field in FIELD_SIZES:
FIELD_SIZES[field] += 1
# Make FEATURE_FIELDS addressable by feature string ID as well as by name.
for feat_id, name in FEATURE_NAMES.items():
FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
# Mapping of feature names to their 1-based position among the features of
# the same field (position 0 is left free, see the NIL note in the loop).
FEATURE_OFFSETS = {}
# Mapping of fields to the index in FEATURES of the field's first feature.
FIELD_OFFSETS = {}
# How many features of each field the loop below has already visited.
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
field = FEATURE_FIELDS[feature]
# Add 1 for the NIL class, on each field
FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
if _seen_fields[field] == 0:
FIELD_OFFSETS[field] = i
_seen_fields[field] += 1
# NOTE(review): FEATURE_NAMES and FEATURE_FIELDS are rebuilt from scratch
# here, so FEATURE_FIELDS loses the string-ID keys that were added above.
FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}

View File

@ -16,26 +16,24 @@ from ..compat import basestring_
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
from ..morphology import FEATURES
class Morphologizer(Pipe):
name = 'morphologizer'
# Build the model for this component from the keyword settings in `cfg`.
# Raises ValueError(TempErrors.T008) when `pretrained_dims` is set but
# `pretrained_vectors` is not.
# NOTE(review): two `def Model` headers and two `return` statements follow
# one another below. One variant sizes the model from `attr_nums` (defaulting
# to get_field_size for every name in FIELDS), the other from the
# `field_sizes` of Morphology.create_class_map(). Confirm which is intended.
@classmethod
def Model(cls, attr_nums=None, **cfg):
def Model(cls, **cfg):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
raise ValueError(TempErrors.T008)
if attr_nums is None:
attr_nums = [get_field_size(name) for name in FIELDS]
return build_morphologizer_model(attr_nums, **cfg)
class_map = Morphology.create_class_map()
return build_morphologizer_model(class_map.field_sizes, **cfg)
def __init__(self, vocab, model=True, **cfg):
    """Store the vocab and model, normalise the settings, build the class map.

    vocab: object whose `morphology.create_class_map()` supplies the class
        map kept on `self._class_map`.
    model: stored unchanged on `self.model`.
    **cfg: settings, kept on `self.cfg` as an OrderedDict sorted by key;
        'cnn_maxout_pieces' is set to 2 when it was not supplied.
    """
    self.vocab = vocab
    self.model = model
    settings = OrderedDict(sorted(cfg.items()))
    if 'cnn_maxout_pieces' not in settings:
        settings['cnn_maxout_pieces'] = 2
    self.cfg = settings
    self._class_map = self.vocab.morphology.create_class_map()
@property
def labels(self):
@ -76,13 +74,13 @@ class Morphologizer(Pipe):
docs = [docs]
cdef Doc doc
cdef Vocab vocab = self.vocab
field_names = list(FIELDS)
offsets = [get_field_offset(field) for field in field_names]
offsets = [self._class_map.get_field_offset(field)
for field in self._class_map.fields]
for i, doc in enumerate(docs):
doc_scores = batch_scores[i]
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
# Convert the neuron indices into feature IDs.
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i')
doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
for j in range(len(doc)):
for k, offset in enumerate(offsets):
if doc_guesses[j, k] == 0:
@ -90,7 +88,8 @@ class Morphologizer(Pipe):
else:
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
# Get the set of feature names.
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0}
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
if f != 0}
# Now add the analysis, and set the hash.
try:
doc.c[j].morph = self.vocab.morphology.add(feats)
@ -132,14 +131,15 @@ class Morphologizer(Pipe):
if features is None:
target[idx] = scores[idx]
else:
by_field = {}
gold_fields = {}
for feature in features:
field, column = parse_feature(feature)
by_field[field] = column
field = self.get_field(feature)
column = self.get_column(feature)
gold_fields[field] = column
col_offset = 0
for field, field_size in enumerate(field_sizes):
if field in by_field:
target[idx, col_offset + by_field[field]] = 1.
if field in gold_fields:
target[idx, col_offset + gold_fields[field]] = 1.
else:
target[idx, col_offset] = 1.
col_offset += field_size

View File

@ -3,7 +3,6 @@ from libc.string cimport memset
from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t
from ..morphology cimport list_features, check_feature, get_field, tag_to_json
from ..morphology cimport attribute_to_field
from ..strings import get_string_id
@ -53,7 +52,7 @@ cdef class MorphAnalysis:
return self.key
# Return the string stored for the field named `field`: the field name is
# turned into a field ID, get_field reads that field from `self.c`, and the
# result is looked up in `self.vocab.strings`.
def get(self, unicode field):
# NOTE(review): `field_id` is declared twice in a row, once via
# attribute_to_field and once via the morphology's `_feat_map.attr2field`
# table. Confirm which lookup is meant to remain.
cdef int field_id = attribute_to_field(field)
cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
return self.vocab.strings[get_field(&self.c, field_id)]
def to_json(self):