Refactor morphologizer

Repository: https://github.com/explosion/spaCy.git
Commit: 0f12082465
Parent: 41a3016019
@@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg):
     token_vector_width = util.env_opt("token_vector_width", 128)
     pretrained_vectors = cfg.get("pretrained_vectors")
     char_embed = cfg.get("char_embed", True)
-    with Model.define_operators({">>": chain, "+": add}):
+    with Model.define_operators({">>": chain, "+": add, "**": clone}):
         if "tok2vec" in cfg:
             tok2vec = cfg["tok2vec"]
         else:
@@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg):
                 char_embed=char_embed,
                 pretrained_vectors=pretrained_vectors,
             )
-        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
+        softmax = with_flatten(
+            MultiSoftmax(class_nums, token_vector_width)
+        )
         softmax.out_sizes = class_nums
         model = tok2vec >> softmax
     model.nI = None
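
For context on what the MultiSoftmax layer is doing here: the morphologizer predicts one value per morphological field, and all fields share a single output vector that is split into per-field segments, each normalised independently; class_nums / softmax.out_sizes hold the number of candidate classes per field. The Thinc layer implements this internally; the following is only an illustrative numpy sketch of that per-field softmax, with made-up field sizes.

import numpy

def multi_softmax(logits, field_sizes):
    """Apply a softmax separately to each field's slice of the output vector."""
    out = numpy.zeros_like(logits)
    start = 0
    for size in field_sizes:
        segment = logits[:, start:start + size]
        segment = segment - segment.max(axis=1, keepdims=True)  # numerical stability
        exps = numpy.exp(segment)
        out[:, start:start + size] = exps / exps.sum(axis=1, keepdims=True)
        start += size
    return out

# Toy example: three fields with 4, 3 and 5 candidate values respectively.
field_sizes = [4, 3, 5]
logits = numpy.random.uniform(-1, 1, (2, sum(field_sizes))).astype("f")
probs = multi_softmax(logits, field_sizes)
assert numpy.allclose(probs[:, 0:4].sum(axis=1), 1.0)  # each field's slice sums to 1

The per-field argmax over such a vector is what the Morphologizer hunks below translate back into feature columns.
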
@@ -46,8 +46,8 @@ cdef enum univ_field_t:
     Field_PartForm
     Field_PartType
     Field_Person
-    Field_Polite
     Field_Polarity
+    Field_Polite
     Field_Poss
     Field_Prefix
     Field_PrepCase
@@ -60,8 +60,8 @@ cdef enum univ_field_t:
     Field_Tense
     Field_Typo
     Field_VerbForm
-    Field_Voice
     Field_VerbType
+    Field_Voice


 def _normalize_props(props):
@@ -94,20 +94,36 @@ def _normalize_props(props):


 class MorphologyClassMap(object):
-    def __init__(self, features, fields):
+    def __init__(self, features):
         self.features = tuple(features)
-        self.fields = tuple(fields)
+        self.fields = []
+        self.feat2field = {}
+        seen_fields = set()
+        for feature in features:
+            field = feature.split("_", 1)[0]
+            if field not in seen_fields:
+                self.fields.append(field)
+                seen_fields.add(field)
+            self.feat2field[feature] = FIELDS[field]
         self.id2feat = {get_string_id(name): name for name in features}
-        self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features}
-        self.field2feats = {}
+        self.field2feats = {"POS": []}
         self.col2info = []
         self.attr2field = dict(LOWER_FIELDS.items())
+        self.feat2offset = {}
+        self.field2col = {}
+        self.field2id = dict(FIELDS.items())
+        self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
         for feature in features:
-            field = self.feat2field[feature]
-            if field not in self.field2feats:
-                self.col2info.append((field, 0, 'NIL'))
-            self.field2feats.setdefault(field, []).append(feature)
-            self.col2info.append((field, len(self.field2feats[field]), feature))
+            field = self.fields[self.feat2field[feature]]
+            if field not in self.field2col:
+                self.field2col[field] = len(self.col2info)
+            if field != "POS" and field not in self.field2feats:
+                self.col2info.append((field, 0, "NIL"))
+                self.field2feats.setdefault(field, ["NIL"])
+            offset = len(self.field2feats[field])
+            self.field2feats[field].append(feature)
+            self.col2info.append((field, offset, feature))
+            self.feat2offset[feature] = offset

     @property
     def field_sizes(self):
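
The old class map only built feat2field, field2feats and col2info, and needed the FIELDS dict passed in; the refactored version derives the field list from the feature-name prefixes (hence the dropped fields argument) and adds feat2offset, field2col, field2id and fieldid2field so other components can translate between features, fields and output columns. Below is a simplified, self-contained sketch of that bookkeeping, using field names in place of the univ_field_t enum IDs and an invented toy feature list.

# Toy features in the same "Field_value" naming scheme as FEATURES.
features = ["POS_noun", "POS_verb", "Case_nom", "Case_acc", "Number_sing"]

fields, feat2field = [], {}
for feature in features:
    field = feature.split("_", 1)[0]
    if field not in fields:
        fields.append(field)
    feat2field[feature] = field

field2feats = {"POS": []}   # POS gets no NIL placeholder
field2col = {}              # field -> first global output column
col2info = []               # global column -> (field, offset, feature name)
feat2offset = {}            # feature -> offset within its field's block
for feature in features:
    field = feat2field[feature]
    if field not in field2col:
        field2col[field] = len(col2info)
    if field != "POS" and field not in field2feats:
        col2info.append((field, 0, "NIL"))   # reserve slot 0 for "no value"
        field2feats[field] = ["NIL"]
    offset = len(field2feats[field])
    field2feats[field].append(feature)
    col2info.append((field, offset, feature))
    feat2offset[feature] = offset

print(col2info)
# [('POS', 0, 'POS_noun'), ('POS', 1, 'POS_verb'),
#  ('Case', 0, 'NIL'), ('Case', 1, 'Case_nom'), ('Case', 2, 'Case_acc'),
#  ('Number', 0, 'NIL'), ('Number', 1, 'Number_sing')]
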
@@ -147,7 +163,7 @@ cdef class Morphology:
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
         self.reverse_index = {}
-        self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
+        self._feat_map = MorphologyClassMap(FEATURES)
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
             attrs = _normalize_props(attrs)
             self.add({self._feat_map.id2feat[feat] for feat in attrs
@@ -326,7 +342,7 @@ cdef class Morphology:

     @classmethod
     def create_class_map(cls):
-        return MorphologyClassMap(FEATURES, FIELDS)
+        return MorphologyClassMap(FEATURES)


 cpdef univ_pos_t get_int_tag(pos_):
@@ -770,8 +786,8 @@ FIELDS = {
     'Tense': Field_Tense,
     'Typo': Field_Typo,
     'VerbForm': Field_VerbForm,
+    'VerbType': Field_VerbType,
     'Voice': Field_Voice,
-    'VerbType': Field_VerbType
 }

 LOWER_FIELDS = {
@@ -803,8 +819,8 @@ LOWER_FIELDS = {
     'part_form': Field_PartForm,
     'part_type': Field_PartType,
     'person': Field_Person,
-    'polite': Field_Polite,
     'polarity': Field_Polarity,
+    'polite': Field_Polite,
     'poss': Field_Poss,
     'prefix': Field_Prefix,
     'prep_case': Field_PrepCase,
@@ -817,8 +833,8 @@ LOWER_FIELDS = {
     'tense': Field_Tense,
     'typo': Field_Typo,
     'verb_form': Field_VerbForm,
+    'verb_type': Field_VerbType,
     'voice': Field_Voice,
-    'verb_type': Field_VerbType
 }

@@ -849,7 +865,7 @@ FEATURES = [
     "AdpType_prep",
     "AdpType_post",
     "AdpType_voc",
-    "AdvType_adadj,"
+    "AdvType_adadj",
     "AdvType_cau",
     "AdvType_deg",
     "AdvType_ex",
@@ -86,20 +86,15 @@ class Morphologizer(Pipe):
                     if doc_guesses[j, k] == 0:
                         doc_feat_ids[j, k] = 0
                     else:
-                        doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
+                        doc_feat_ids[j, k] = offset + doc_guesses[j, k]
                 # Get the set of feature names.
-                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
-                         if f != 0}
+                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
+                if "NIL" in feats:
+                    feats.remove("NIL")
                 # Now add the analysis, and set the hash.
-                try:
-                    doc.c[j].morph = self.vocab.morphology.add(feats)
-                    if doc[j].morph.pos != 0:
-                        doc.c[j].pos = doc[j].morph.pos
-                except:
-                    print(offsets)
-                    print(doc_guesses[j])
-                    print(doc_feat_ids[j])
-                    raise
+                doc.c[j].morph = self.vocab.morphology.add(feats)
+                if doc[j].morph.pos != 0:
+                    doc.c[j].pos = doc[j].morph.pos

     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if losses is not None and self.name not in losses:
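
In the new column layout every non-POS field reserves its first slot for NIL, so a per-field prediction can be turned into a global column by adding the field's column offset directly (the old "- 1" is gone), and NIL hits are simply dropped from the resulting feature set. Here is a minimal sketch of that decoding step with an invented two-field layout; it is not spaCy code, just the idea.

# Two fields, "Case" and "Number", each with NIL at within-field offset 0.
col2info = [
    ("Case", 0, "NIL"), ("Case", 1, "Case_nom"), ("Case", 2, "Case_acc"),
    ("Number", 0, "NIL"), ("Number", 1, "Number_sing"), ("Number", 2, "Number_plur"),
]
field2col = {"Case": 0, "Number": 3}   # first global column of each field

# Per-field predicted offsets for one token; 0 means "no value" (the NIL slot).
guesses = {"Case": 2, "Number": 0}

columns = [field2col[field] + guess for field, guess in guesses.items()]
feats = {col2info[col][2] for col in columns}
feats.discard("NIL")   # NIL slots carry no feature
print(feats)           # {'Case_acc'}
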
@@ -126,23 +121,25 @@ class Morphologizer(Pipe):
         # Do this on CPU, as we can't vectorize easily.
         target = numpy.zeros(scores.shape, dtype='f')
         field_sizes = self.model.softmax.out_sizes
-        for gold in golds:
-            for features in gold.morphology:
+        for doc, gold in zip(docs, golds):
+            for t, features in enumerate(gold.morphology):
                 if features is None:
                     target[idx] = scores[idx]
                 else:
                     gold_fields = {}
                     for feature in features:
-                        field = self.get_field(feature)
-                        column = self.get_column(feature)
-                        gold_fields[field] = column
-                    col_offset = 0
-                    for field, field_size in enumerate(field_sizes):
-                        if field in gold_fields:
-                            target[idx, col_offset + gold_fields[field]] = 1.
+                        field = self._class_map.feat2field[feature]
+                        gold_fields[field] = self._class_map.feat2offset[feature]
+                    for field in self._class_map.fields:
+                        field_id = self._class_map.field2id[field]
+                        col_offset = self._class_map.field2col[field]
+                        if field_id in gold_fields:
+                            target[idx, col_offset + gold_fields[field_id]] = 1.
                         else:
                             target[idx, col_offset] = 1.
-                        col_offset += field_size
+                    #print(doc[t])
+                    #for col, info in enumerate(self._class_map.col2info):
+                    #    print(col, info, scores[idx, col], target[idx, col])
                 idx += 1
         target = self.model.ops.asarray(target, dtype='f')
         scores = self.model.ops.asarray(scores, dtype='f')
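
The loss construction above builds a one-hot target over the same column layout: for every token and every field, exactly one column in that field's block is set to 1., either the gold feature's within-field offset or the field's NIL column when the field is not annotated. A self-contained numpy sketch of that idea, again with made-up offsets rather than the real class map:

import numpy

field2col = {"Case": 0, "Number": 3}          # first global column per field
feat2field = {"Case_nom": "Case", "Number_plur": "Number"}
feat2offset = {"Case_nom": 1, "Number_plur": 2}
n_columns = 6

def make_target(gold_feats_per_token):
    """One active column per field per token: gold offset, or NIL (offset 0)."""
    target = numpy.zeros((len(gold_feats_per_token), n_columns), dtype="f")
    for idx, features in enumerate(gold_feats_per_token):
        gold_fields = {feat2field[f]: feat2offset[f] for f in features}
        for field, col_offset in field2col.items():
            target[idx, col_offset + gold_fields.get(field, 0)] = 1.0
    return target

print(make_target([["Case_nom"], ["Case_nom", "Number_plur"], []]))
# Row 0 marks Case_nom and Number's NIL column; row 2 marks both NIL columns.
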
@@ -418,6 +418,8 @@ class Tagger(Pipe):
                     vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+                    if lemma != 0 and lemma != doc.c[j].lex.orth:
+                        doc.c[j].lemma = lemma
                 else:
                     doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
                 idx += 1
             if tensors is not None and len(tensors):
                 if isinstance(doc.tensor, numpy.ndarray) \
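
The two added lines read as a save-and-restore of a preset lemma: assign_tag_id applies the tag map (which can set a default lemma as a side effect), and afterwards a lemma that was already set to something other than the word form itself is written back. The hunk does not show where the local lemma variable is captured, so the sketch below is only an illustration of that pattern, with invented stand-ins for the token struct and the tag-assignment side effect.

class Token:
    """Stand-in for the token struct; 0 means 'unset'."""
    def __init__(self, orth, lemma=0):
        self.orth = orth
        self.lemma = lemma
        self.tag = 0

def assign_tag(token, tag):
    # Stand-in for vocab.morphology.assign_tag_id: applying the tag map
    # resets the lemma to a default (here: the word form itself).
    token.tag = tag
    token.lemma = token.orth

token = Token(orth=1001, lemma=2002)   # lemma preset, e.g. by a rule or the user
lemma = token.lemma                    # remember it before tagging
assign_tag(token, tag=7)
if lemma != 0 and lemma != token.orth:
    token.lemma = lemma                # don't clobber the preset lemma
assert token.lemma == 2002
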