Refactor morphologizer

This commit is contained in:
Matthew Honnibal 2019-03-09 22:54:59 +00:00
parent 41a3016019
commit 0f12082465
4 changed files with 58 additions and 41 deletions

View File

@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg):
token_vector_width = util.env_opt("token_vector_width", 128) token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get("pretrained_vectors") pretrained_vectors = cfg.get("pretrained_vectors")
char_embed = cfg.get("char_embed", True) char_embed = cfg.get("char_embed", True)
with Model.define_operators({">>": chain, "+": add}): with Model.define_operators({">>": chain, "+": add, "**": clone}):
if "tok2vec" in cfg: if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"] tok2vec = cfg["tok2vec"]
else: else:
@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg):
char_embed=char_embed, char_embed=char_embed,
pretrained_vectors=pretrained_vectors, pretrained_vectors=pretrained_vectors,
) )
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width)) softmax = with_flatten(
MultiSoftmax(class_nums, token_vector_width)
)
softmax.out_sizes = class_nums softmax.out_sizes = class_nums
model = tok2vec >> softmax model = tok2vec >> softmax
model.nI = None model.nI = None

View File

@ -46,8 +46,8 @@ cdef enum univ_field_t:
Field_PartForm Field_PartForm
Field_PartType Field_PartType
Field_Person Field_Person
Field_Polite
Field_Polarity Field_Polarity
Field_Polite
Field_Poss Field_Poss
Field_Prefix Field_Prefix
Field_PrepCase Field_PrepCase
@ -60,8 +60,8 @@ cdef enum univ_field_t:
Field_Tense Field_Tense
Field_Typo Field_Typo
Field_VerbForm Field_VerbForm
Field_Voice
Field_VerbType Field_VerbType
Field_Voice
def _normalize_props(props): def _normalize_props(props):
@ -94,20 +94,36 @@ def _normalize_props(props):
class MorphologyClassMap(object): class MorphologyClassMap(object):
def __init__(self, features, fields): def __init__(self, features):
self.features = tuple(features) self.features = tuple(features)
self.fields = tuple(fields) self.fields = []
self.feat2field = {}
seen_fields = set()
for feature in features:
field = feature.split("_", 1)[0]
if field not in seen_fields:
self.fields.append(field)
seen_fields.add(field)
self.feat2field[feature] = FIELDS[field]
self.id2feat = {get_string_id(name): name for name in features} self.id2feat = {get_string_id(name): name for name in features}
self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features} self.field2feats = {"POS": []}
self.field2feats = {}
self.col2info = [] self.col2info = []
self.attr2field = dict(LOWER_FIELDS.items()) self.attr2field = dict(LOWER_FIELDS.items())
self.feat2offset = {}
self.field2col = {}
self.field2id = dict(FIELDS.items())
self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
for feature in features: for feature in features:
field = self.feat2field[feature] field = self.fields[self.feat2field[feature]]
if field not in self.field2feats: if field not in self.field2col:
self.col2info.append((field, 0, 'NIL')) self.field2col[field] = len(self.col2info)
self.field2feats.setdefault(field, []).append(feature) if field != "POS" and field not in self.field2feats:
self.col2info.append((field, len(self.field2feats[field]), feature)) self.col2info.append((field, 0, "NIL"))
self.field2feats.setdefault(field, ["NIL"])
offset = len(self.field2feats[field])
self.field2feats[field].append(feature)
self.col2info.append((field, offset, feature))
self.feat2offset[feature] = offset
@property @property
def field_sizes(self): def field_sizes(self):
@ -147,7 +163,7 @@ cdef class Morphology:
self.lemmatizer = lemmatizer self.lemmatizer = lemmatizer
self.n_tags = len(tag_map) self.n_tags = len(tag_map)
self.reverse_index = {} self.reverse_index = {}
self._feat_map = MorphologyClassMap(FEATURES, FIELDS) self._feat_map = MorphologyClassMap(FEATURES)
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)
self.add({self._feat_map.id2feat[feat] for feat in attrs self.add({self._feat_map.id2feat[feat] for feat in attrs
@ -326,7 +342,7 @@ cdef class Morphology:
@classmethod @classmethod
def create_class_map(cls): def create_class_map(cls):
return MorphologyClassMap(FEATURES, FIELDS) return MorphologyClassMap(FEATURES)
cpdef univ_pos_t get_int_tag(pos_): cpdef univ_pos_t get_int_tag(pos_):
@ -770,8 +786,8 @@ FIELDS = {
'Tense': Field_Tense, 'Tense': Field_Tense,
'Typo': Field_Typo, 'Typo': Field_Typo,
'VerbForm': Field_VerbForm, 'VerbForm': Field_VerbForm,
'VerbType': Field_VerbType,
'Voice': Field_Voice, 'Voice': Field_Voice,
'VerbType': Field_VerbType
} }
LOWER_FIELDS = { LOWER_FIELDS = {
@ -803,8 +819,8 @@ LOWER_FIELDS = {
'part_form': Field_PartForm, 'part_form': Field_PartForm,
'part_type': Field_PartType, 'part_type': Field_PartType,
'person': Field_Person, 'person': Field_Person,
'polite': Field_Polite,
'polarity': Field_Polarity, 'polarity': Field_Polarity,
'polite': Field_Polite,
'poss': Field_Poss, 'poss': Field_Poss,
'prefix': Field_Prefix, 'prefix': Field_Prefix,
'prep_case': Field_PrepCase, 'prep_case': Field_PrepCase,
@ -817,8 +833,8 @@ LOWER_FIELDS = {
'tense': Field_Tense, 'tense': Field_Tense,
'typo': Field_Typo, 'typo': Field_Typo,
'verb_form': Field_VerbForm, 'verb_form': Field_VerbForm,
'verb_type': Field_VerbType,
'voice': Field_Voice, 'voice': Field_Voice,
'verb_type': Field_VerbType
} }
@ -849,7 +865,7 @@ FEATURES = [
"AdpType_prep", "AdpType_prep",
"AdpType_post", "AdpType_post",
"AdpType_voc", "AdpType_voc",
"AdvType_adadj," "AdvType_adadj",
"AdvType_cau", "AdvType_cau",
"AdvType_deg", "AdvType_deg",
"AdvType_ex", "AdvType_ex",

View File

@ -86,20 +86,15 @@ class Morphologizer(Pipe):
if doc_guesses[j, k] == 0: if doc_guesses[j, k] == 0:
doc_feat_ids[j, k] = 0 doc_feat_ids[j, k] = 0
else: else:
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1) doc_feat_ids[j, k] = offset + doc_guesses[j, k]
# Get the set of feature names. # Get the set of feature names.
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j] feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
if f != 0} if "NIL" in feats:
feats.remove("NIL")
# Now add the analysis, and set the hash. # Now add the analysis, and set the hash.
try:
doc.c[j].morph = self.vocab.morphology.add(feats) doc.c[j].morph = self.vocab.morphology.add(feats)
if doc[j].morph.pos != 0: if doc[j].morph.pos != 0:
doc.c[j].pos = doc[j].morph.pos doc.c[j].pos = doc[j].morph.pos
except:
print(offsets)
print(doc_guesses[j])
print(doc_feat_ids[j])
raise
def update(self, docs, golds, drop=0., sgd=None, losses=None): def update(self, docs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses: if losses is not None and self.name not in losses:
@ -126,23 +121,25 @@ class Morphologizer(Pipe):
# Do this on CPU, as we can't vectorize easily. # Do this on CPU, as we can't vectorize easily.
target = numpy.zeros(scores.shape, dtype='f') target = numpy.zeros(scores.shape, dtype='f')
field_sizes = self.model.softmax.out_sizes field_sizes = self.model.softmax.out_sizes
for gold in golds: for doc, gold in zip(docs, golds):
for features in gold.morphology: for t, features in enumerate(gold.morphology):
if features is None: if features is None:
target[idx] = scores[idx] target[idx] = scores[idx]
else: else:
gold_fields = {} gold_fields = {}
for feature in features: for feature in features:
field = self.get_field(feature) field = self._class_map.feat2field[feature]
column = self.get_column(feature) gold_fields[field] = self._class_map.feat2offset[feature]
gold_fields[field] = column for field in self._class_map.fields:
col_offset = 0 field_id = self._class_map.field2id[field]
for field, field_size in enumerate(field_sizes): col_offset = self._class_map.field2col[field]
if field in gold_fields: if field_id in gold_fields:
target[idx, col_offset + gold_fields[field]] = 1. target[idx, col_offset + gold_fields[field_id]] = 1.
else: else:
target[idx, col_offset] = 1. target[idx, col_offset] = 1.
col_offset += field_size #print(doc[t])
#for col, info in enumerate(self._class_map.col2info):
# print(col, info, scores[idx, col], target[idx, col])
idx += 1 idx += 1
target = self.model.ops.asarray(target, dtype='f') target = self.model.ops.asarray(target, dtype='f')
scores = self.model.ops.asarray(scores, dtype='f') scores = self.model.ops.asarray(scores, dtype='f')

View File

@ -418,6 +418,8 @@ class Tagger(Pipe):
vocab.morphology.assign_tag_id(&doc.c[j], tag_id) vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
if lemma != 0 and lemma != doc.c[j].lex.orth: if lemma != 0 and lemma != doc.c[j].lex.orth:
doc.c[j].lemma = lemma doc.c[j].lemma = lemma
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1 idx += 1
if tensors is not None and len(tensors): if tensors is not None and len(tensors):
if isinstance(doc.tensor, numpy.ndarray) \ if isinstance(doc.tensor, numpy.ndarray) \