Refactor morphologizer

This commit is contained in:
Matthew Honnibal 2019-03-09 22:54:59 +00:00
parent 41a3016019
commit 0f12082465
4 changed files with 58 additions and 41 deletions

View File

@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg):
token_vector_width = util.env_opt("token_vector_width", 128)
pretrained_vectors = cfg.get("pretrained_vectors")
char_embed = cfg.get("char_embed", True)
with Model.define_operators({">>": chain, "+": add}):
with Model.define_operators({">>": chain, "+": add, "**": clone}):
if "tok2vec" in cfg:
tok2vec = cfg["tok2vec"]
else:
@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg):
char_embed=char_embed,
pretrained_vectors=pretrained_vectors,
)
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
softmax = with_flatten(
MultiSoftmax(class_nums, token_vector_width)
)
softmax.out_sizes = class_nums
model = tok2vec >> softmax
model.nI = None

View File

@ -46,8 +46,8 @@ cdef enum univ_field_t:
Field_PartForm
Field_PartType
Field_Person
Field_Polite
Field_Polarity
Field_Polite
Field_Poss
Field_Prefix
Field_PrepCase
@ -60,8 +60,8 @@ cdef enum univ_field_t:
Field_Tense
Field_Typo
Field_VerbForm
Field_Voice
Field_VerbType
Field_Voice
def _normalize_props(props):
@ -94,20 +94,36 @@ def _normalize_props(props):
class MorphologyClassMap(object):
def __init__(self, features):
    """Build the lookup tables linking morphological features to their
    fields, to columns of the flat class layout, and to their offset
    inside each field.

    features (iterable): Feature names written as "<Field>_<value>". The
        text before the first underscore must be a key of the module-level
        FIELDS table, otherwise a KeyError is raised.
    """
    # Materialise once so that a one-shot iterator can be walked repeatedly.
    self.features = tuple(features)
    # Field names, in order of first appearance in `features`.
    self.fields = []
    # Feature name -> numeric field ID taken from FIELDS.
    self.feat2field = {}
    seen_fields = set()
    for feature in self.features:
        field = feature.split("_", 1)[0]
        if field not in seen_fields:
            self.fields.append(field)
            seen_fields.add(field)
        self.feat2field[feature] = FIELDS[field]
    self.id2feat = {get_string_id(name): name for name in self.features}
    # "POS" is pre-seeded with an empty list, so unlike every other field
    # it never receives a leading "NIL" entry in the loop below.
    self.field2feats = {"POS": []}
    # One (field name, offset within field, feature name) tuple per column.
    self.col2info = []
    self.attr2field = dict(LOWER_FIELDS.items())
    self.feat2offset = {}
    # Field name -> index of that field's first column in col2info.
    self.field2col = {}
    self.field2id = dict(FIELDS.items())
    self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
    for feature in self.features:
        # Resolve the field name through the ID -> name table instead of
        # indexing self.fields positionally: self.fields is ordered by
        # first appearance, which only coincides with the numeric IDs when
        # the feature list and the enum are declared in the same order.
        field = self.fieldid2field[self.feat2field[feature]]
        if field not in self.field2col:
            self.field2col[field] = len(self.col2info)
        if field != "POS" and field not in self.field2feats:
            # First feature of a non-POS field: reserve offset 0 for "NIL".
            self.col2info.append((field, 0, "NIL"))
        self.field2feats.setdefault(field, ["NIL"])
        offset = len(self.field2feats[field])
        self.field2feats[field].append(feature)
        self.col2info.append((field, offset, feature))
        self.feat2offset[feature] = offset
@property
def field_sizes(self):
@ -147,7 +163,7 @@ cdef class Morphology:
self.lemmatizer = lemmatizer
self.n_tags = len(tag_map)
self.reverse_index = {}
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
self._feat_map = MorphologyClassMap(FEATURES)
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
attrs = _normalize_props(attrs)
self.add({self._feat_map.id2feat[feat] for feat in attrs
@ -326,7 +342,7 @@ cdef class Morphology:
@classmethod
def create_class_map(cls):
    """Create a class map over the module-level FEATURES list.

    RETURNS (MorphologyClassMap): A newly constructed class map.
    """
    # MorphologyClassMap takes the feature list only; it derives the field
    # tables itself, so FIELDS must not be passed as a second argument.
    return MorphologyClassMap(FEATURES)
cpdef univ_pos_t get_int_tag(pos_):
@ -770,8 +786,8 @@ FIELDS = {
'Tense': Field_Tense,
'Typo': Field_Typo,
'VerbForm': Field_VerbForm,
'VerbType': Field_VerbType,
'Voice': Field_Voice,
'VerbType': Field_VerbType
}
LOWER_FIELDS = {
@ -803,8 +819,8 @@ LOWER_FIELDS = {
'part_form': Field_PartForm,
'part_type': Field_PartType,
'person': Field_Person,
'polite': Field_Polite,
'polarity': Field_Polarity,
'polite': Field_Polite,
'poss': Field_Poss,
'prefix': Field_Prefix,
'prep_case': Field_PrepCase,
@ -817,8 +833,8 @@ LOWER_FIELDS = {
'tense': Field_Tense,
'typo': Field_Typo,
'verb_form': Field_VerbForm,
'verb_type': Field_VerbType,
'voice': Field_Voice,
'verb_type': Field_VerbType
}
@ -849,7 +865,7 @@ FEATURES = [
"AdpType_prep",
"AdpType_post",
"AdpType_voc",
"AdvType_adadj,"
"AdvType_adadj",
"AdvType_cau",
"AdvType_deg",
"AdvType_ex",

View File

@ -86,20 +86,15 @@ class Morphologizer(Pipe):
if doc_guesses[j, k] == 0:
doc_feat_ids[j, k] = 0
else:
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
doc_feat_ids[j, k] = offset + doc_guesses[j, k]
# Get the set of feature names.
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
if f != 0}
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
if "NIL" in feats:
feats.remove("NIL")
# Now add the analysis, and set the hash.
try:
doc.c[j].morph = self.vocab.morphology.add(feats)
if doc[j].morph.pos != 0:
doc.c[j].pos = doc[j].morph.pos
except:
print(offsets)
print(doc_guesses[j])
print(doc_feat_ids[j])
raise
doc.c[j].morph = self.vocab.morphology.add(feats)
if doc[j].morph.pos != 0:
doc.c[j].pos = doc[j].morph.pos
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
@ -126,23 +121,25 @@ class Morphologizer(Pipe):
# Do this on CPU, as we can't vectorize easily.
target = numpy.zeros(scores.shape, dtype='f')
field_sizes = self.model.softmax.out_sizes
for gold in golds:
for features in gold.morphology:
for doc, gold in zip(docs, golds):
for t, features in enumerate(gold.morphology):
if features is None:
target[idx] = scores[idx]
else:
gold_fields = {}
for feature in features:
field = self.get_field(feature)
column = self.get_column(feature)
gold_fields[field] = column
col_offset = 0
for field, field_size in enumerate(field_sizes):
if field in gold_fields:
target[idx, col_offset + gold_fields[field]] = 1.
field = self._class_map.feat2field[feature]
gold_fields[field] = self._class_map.feat2offset[feature]
for field in self._class_map.fields:
field_id = self._class_map.field2id[field]
col_offset = self._class_map.field2col[field]
if field_id in gold_fields:
target[idx, col_offset + gold_fields[field_id]] = 1.
else:
target[idx, col_offset] = 1.
col_offset += field_size
#print(doc[t])
#for col, info in enumerate(self._class_map.col2info):
# print(col, info, scores[idx, col], target[idx, col])
idx += 1
target = self.model.ops.asarray(target, dtype='f')
scores = self.model.ops.asarray(scores, dtype='f')

View File

@ -418,6 +418,8 @@ class Tagger(Pipe):
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
if lemma != 0 and lemma != doc.c[j].lex.orth:
doc.c[j].lemma = lemma
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1
if tensors is not None and len(tensors):
if isinstance(doc.tensor, numpy.ndarray) \