Refactor morphologizer class map

This commit is contained in:
Matthew Honnibal 2019-03-09 20:55:33 +01:00
parent f742900f83
commit 41a3016019
4 changed files with 58 additions and 67 deletions

View File

@ -20,6 +20,7 @@ cdef class Morphology:
cdef readonly object tag_names cdef readonly object tag_names
cdef readonly object reverse_index cdef readonly object reverse_index
cdef readonly object exc cdef readonly object exc
cdef readonly object _feat_map
cdef readonly PreshMapArray _cache cdef readonly PreshMapArray _cache
cdef readonly int n_tags cdef readonly int n_tags
@ -36,6 +37,5 @@ cdef class Morphology:
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
cdef list list_features(const MorphAnalysisC* tag) cdef list list_features(const MorphAnalysisC* tag)
cdef int attribute_to_field(unicode attribute)
cdef tag_to_json(const MorphAnalysisC* tag) cdef tag_to_json(const MorphAnalysisC* tag)

View File

@ -93,26 +93,34 @@ def _normalize_props(props):
return out return out
def parse_feature(feature): class MorphologyClassMap(object):
field = FEATURE_FIELDS[feature] def __init__(self, features, fields):
offset = FEATURE_OFFSETS[feature] self.features = tuple(features)
return (field, offset) self.fields = tuple(fields)
self.id2feat = {get_string_id(name): name for name in features}
self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features}
self.field2feats = {}
self.col2info = []
self.attr2field = dict(LOWER_FIELDS.items())
for feature in features:
field = self.feat2field[feature]
if field not in self.field2feats:
self.col2info.append((field, 0, 'NIL'))
self.field2feats.setdefault(field, []).append(feature)
self.col2info.append((field, len(self.field2feats[field]), feature))
@property
def field_sizes(self):
return [len(self.field2feats[field]) for field in self.fields]
cdef int attribute_to_field(unicode attribute_name): def get_field_offset(self, field):
return LOWER_FIELDS[attribute_name] n = 0
for f in self.fields:
if f == field:
def get_field_id(feature): return n
return FEATURE_FIELDS[feature] n += len(self.field2feats[f])
else:
return -1
def get_field_size(field):
return FIELD_SIZES[FIELDS[field]]
def get_field_offset(field):
return FIELD_OFFSETS[FIELDS[field]]
cdef class Morphology: cdef class Morphology:
@ -139,9 +147,11 @@ cdef class Morphology:
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) self.n_tags = len(tag_map)
self.reverse_index = {} self.reverse_index = {}
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) self.add({self._feat_map.id2feat[feat] for feat in attrs
if feat in self._feat_map.id2feat})
self.tag_map[tag_str] = dict(attrs) self.tag_map[tag_str] = dict(attrs)
self.reverse_index[self.strings.add(tag_str)] = i self.reverse_index[self.strings.add(tag_str)] = i
@ -167,7 +177,7 @@ cdef class Morphology:
features = intify_features(features) features = intify_features(features)
cdef attr_t feature cdef attr_t feature
for feature in features: for feature in features:
if feature != 0 and feature not in FEATURE_NAMES: if feature != 0 and feature not in self._feat_map.id2feat:
raise KeyError("Unknown feature: %s" % self.strings[feature]) raise KeyError("Unknown feature: %s" % self.strings[feature])
cdef MorphAnalysisC tag cdef MorphAnalysisC tag
tag = create_rich_tag(features) tag = create_rich_tag(features)
@ -187,7 +197,7 @@ cdef class Morphology:
features = intify_features(features) features = intify_features(features)
cdef attr_t feature cdef attr_t feature
for feature in features: for feature in features:
field = get_field_id(feature) field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
set_feature(&tag, field, feature, 1) set_feature(&tag, field, feature, 1)
morph = self.insert(tag) morph = self.insert(tag)
return morph return morph
@ -224,7 +234,8 @@ cdef class Morphology:
""" """
attrs = dict(attrs) attrs = dict(attrs)
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
self.add({FEATURE_NAMES[feat] for feat in attrs if feat in FEATURE_NAMES}) self.add({self._feat_map.id2feat[feat] for feat in attrs
if feat in self._feat_map.id2feat})
attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
self.exc[(tag_str, self.strings.add(orth_str))] = attrs self.exc[(tag_str, self.strings.add(orth_str))] = attrs
@ -313,6 +324,10 @@ cdef class Morphology:
def from_disk(self, path): def from_disk(self, path):
raise NotImplementedError raise NotImplementedError
@classmethod
def create_class_map(cls):
return MorphologyClassMap(FEATURES, FIELDS)
cpdef univ_pos_t get_int_tag(pos_): cpdef univ_pos_t get_int_tag(pos_):
return <univ_pos_t>0 return <univ_pos_t>0
@ -324,17 +339,12 @@ cdef hash_t hash_tag(MorphAnalysisC tag) nogil:
return mrmr.hash64(&tag, sizeof(tag), 0) return mrmr.hash64(&tag, sizeof(tag), 0)
def get_feature_field(feature):
cdef attr_t key = get_string_id(feature)
return FEATURE_FIELDS[feature]
cdef MorphAnalysisC create_rich_tag(features) except *: cdef MorphAnalysisC create_rich_tag(features) except *:
cdef MorphAnalysisC tag cdef MorphAnalysisC tag
cdef attr_t feature cdef attr_t feature
memset(&tag, 0, sizeof(tag)) memset(&tag, 0, sizeof(tag))
for feature in features: for feature in features:
field = get_field_id(feature) field = FEATURE_FIELDS[FEATURE_NAMES[feature]]
set_feature(&tag, field, feature, 1) set_feature(&tag, field, feature, 1)
return tag return tag
@ -519,8 +529,7 @@ cdef attr_t get_field(const MorphAnalysisC* tag, int field_id) nogil:
elif field == Field_VerbType: elif field == Field_VerbType:
return tag.verb_type return tag.verb_type
else: else:
raise ValueError("Unknown feature: %s (%d)" % (FEATURE_NAMES.get(feature), feature)) raise ValueError("Unknown field: (%d)" % field_id)
cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil: cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil:
@ -1090,22 +1099,5 @@ FEATURES = [
"Voice_int", "Voice_int",
] ]
FEATURE_NAMES = {get_string_id(name): name for name in FEATURES} FEATURE_NAMES = {get_string_id(f): f for f in FEATURES}
FEATURE_FIELDS = {feature: FIELDS[feature.split('_', 1)[0]] for feature in FEATURES} FEATURE_FIELDS = {f: FIELDS[f.split('_', 1)[0]] for f in FEATURES}
FIELD_SIZES = Counter(FEATURE_FIELDS.values())
for field in FIELD_SIZES:
FIELD_SIZES[field] += 1
for feat_id, name in FEATURE_NAMES.items():
FEATURE_FIELDS[feat_id] = FEATURE_FIELDS[name]
# Mapping of feature names to their position in total vector
FEATURE_OFFSETS = {}
# Mapping of field names to their first position in total vector.
FIELD_OFFSETS = {}
_seen_fields = Counter()
for i, feature in enumerate(FEATURES):
field = FEATURE_FIELDS[feature]
# Add 1 for the NIL class, on each field
FEATURE_OFFSETS[feature] = _seen_fields[field] + 1
if _seen_fields[field] == 0:
FIELD_OFFSETS[field] = i
_seen_fields[field] += 1

View File

@ -16,26 +16,24 @@ from ..compat import basestring_
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..morphology cimport Morphology from ..morphology cimport Morphology
from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
from ..morphology import FEATURES
class Morphologizer(Pipe): class Morphologizer(Pipe):
name = 'morphologizer' name = 'morphologizer'
@classmethod @classmethod
def Model(cls, attr_nums=None, **cfg): def Model(cls, **cfg):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
raise ValueError(TempErrors.T008) raise ValueError(TempErrors.T008)
if attr_nums is None: class_map = Morphology.create_class_map()
attr_nums = [get_field_size(name) for name in FIELDS] return build_morphologizer_model(class_map.field_sizes, **cfg)
return build_morphologizer_model(attr_nums, **cfg)
def __init__(self, vocab, model=True, **cfg): def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self.cfg = OrderedDict(sorted(cfg.items())) self.cfg = OrderedDict(sorted(cfg.items()))
self.cfg.setdefault('cnn_maxout_pieces', 2) self.cfg.setdefault('cnn_maxout_pieces', 2)
self._class_map = self.vocab.morphology.create_class_map()
@property @property
def labels(self): def labels(self):
@ -76,13 +74,13 @@ class Morphologizer(Pipe):
docs = [docs] docs = [docs]
cdef Doc doc cdef Doc doc
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
field_names = list(FIELDS) offsets = [self._class_map.get_field_offset(field)
offsets = [get_field_offset(field) for field in field_names] for field in self._class_map.fields]
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_scores = batch_scores[i] doc_scores = batch_scores[i]
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes) doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)
# Convert the neuron indices into feature IDs. # Convert the neuron indices into feature IDs.
doc_feat_ids = numpy.zeros((len(doc), len(field_names)), dtype='i') doc_feat_ids = numpy.zeros((len(doc), len(self._class_map.fields)), dtype='i')
for j in range(len(doc)): for j in range(len(doc)):
for k, offset in enumerate(offsets): for k, offset in enumerate(offsets):
if doc_guesses[j, k] == 0: if doc_guesses[j, k] == 0:
@ -90,7 +88,8 @@ class Morphologizer(Pipe):
else: else:
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
# Get the set of feature names. # Get the set of feature names.
feats = {FEATURES[f] for f in doc_feat_ids[j] if f != 0} feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
if f != 0}
# Now add the analysis, and set the hash. # Now add the analysis, and set the hash.
try: try:
doc.c[j].morph = self.vocab.morphology.add(feats) doc.c[j].morph = self.vocab.morphology.add(feats)
@ -132,14 +131,15 @@ class Morphologizer(Pipe):
if features is None: if features is None:
target[idx] = scores[idx] target[idx] = scores[idx]
else: else:
by_field = {} gold_fields = {}
for feature in features: for feature in features:
field, column = parse_feature(feature) field = self.get_field(feature)
by_field[field] = column column = self.get_column(feature)
gold_fields[field] = column
col_offset = 0 col_offset = 0
for field, field_size in enumerate(field_sizes): for field, field_size in enumerate(field_sizes):
if field in by_field: if field in gold_fields:
target[idx, col_offset + by_field[field]] = 1. target[idx, col_offset + gold_fields[field]] = 1.
else: else:
target[idx, col_offset] = 1. target[idx, col_offset] = 1.
col_offset += field_size col_offset += field_size

View File

@ -3,7 +3,6 @@ from libc.string cimport memset
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t from ..typedefs cimport hash_t, attr_t
from ..morphology cimport list_features, check_feature, get_field, tag_to_json from ..morphology cimport list_features, check_feature, get_field, tag_to_json
from ..morphology cimport attribute_to_field
from ..strings import get_string_id from ..strings import get_string_id
@ -53,7 +52,7 @@ cdef class MorphAnalysis:
return self.key return self.key
def get(self, unicode field): def get(self, unicode field):
cdef int field_id = attribute_to_field(field) cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
return self.vocab.strings[get_field(&self.c, field_id)] return self.vocab.strings[get_field(&self.c, field_id)]
def to_json(self): def to_json(self):