mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 19:08:06 +03:00
Refactor morphologizer
This commit is contained in:
parent
41a3016019
commit
0f12082465
|
@ -561,7 +561,7 @@ def build_morphologizer_model(class_nums, **cfg):
|
||||||
token_vector_width = util.env_opt("token_vector_width", 128)
|
token_vector_width = util.env_opt("token_vector_width", 128)
|
||||||
pretrained_vectors = cfg.get("pretrained_vectors")
|
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||||
char_embed = cfg.get("char_embed", True)
|
char_embed = cfg.get("char_embed", True)
|
||||||
with Model.define_operators({">>": chain, "+": add}):
|
with Model.define_operators({">>": chain, "+": add, "**": clone}):
|
||||||
if "tok2vec" in cfg:
|
if "tok2vec" in cfg:
|
||||||
tok2vec = cfg["tok2vec"]
|
tok2vec = cfg["tok2vec"]
|
||||||
else:
|
else:
|
||||||
|
@ -571,7 +571,9 @@ def build_morphologizer_model(class_nums, **cfg):
|
||||||
char_embed=char_embed,
|
char_embed=char_embed,
|
||||||
pretrained_vectors=pretrained_vectors,
|
pretrained_vectors=pretrained_vectors,
|
||||||
)
|
)
|
||||||
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
|
softmax = with_flatten(
|
||||||
|
MultiSoftmax(class_nums, token_vector_width)
|
||||||
|
)
|
||||||
softmax.out_sizes = class_nums
|
softmax.out_sizes = class_nums
|
||||||
model = tok2vec >> softmax
|
model = tok2vec >> softmax
|
||||||
model.nI = None
|
model.nI = None
|
||||||
|
|
|
@ -46,8 +46,8 @@ cdef enum univ_field_t:
|
||||||
Field_PartForm
|
Field_PartForm
|
||||||
Field_PartType
|
Field_PartType
|
||||||
Field_Person
|
Field_Person
|
||||||
Field_Polite
|
|
||||||
Field_Polarity
|
Field_Polarity
|
||||||
|
Field_Polite
|
||||||
Field_Poss
|
Field_Poss
|
||||||
Field_Prefix
|
Field_Prefix
|
||||||
Field_PrepCase
|
Field_PrepCase
|
||||||
|
@ -60,8 +60,8 @@ cdef enum univ_field_t:
|
||||||
Field_Tense
|
Field_Tense
|
||||||
Field_Typo
|
Field_Typo
|
||||||
Field_VerbForm
|
Field_VerbForm
|
||||||
Field_Voice
|
|
||||||
Field_VerbType
|
Field_VerbType
|
||||||
|
Field_Voice
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
|
@ -94,20 +94,36 @@ def _normalize_props(props):
|
||||||
|
|
||||||
|
|
||||||
class MorphologyClassMap(object):
|
class MorphologyClassMap(object):
|
||||||
def __init__(self, features, fields):
|
def __init__(self, features):
|
||||||
self.features = tuple(features)
|
self.features = tuple(features)
|
||||||
self.fields = tuple(fields)
|
self.fields = []
|
||||||
|
self.feat2field = {}
|
||||||
|
seen_fields = set()
|
||||||
|
for feature in features:
|
||||||
|
field = feature.split("_", 1)[0]
|
||||||
|
if field not in seen_fields:
|
||||||
|
self.fields.append(field)
|
||||||
|
seen_fields.add(field)
|
||||||
|
self.feat2field[feature] = FIELDS[field]
|
||||||
self.id2feat = {get_string_id(name): name for name in features}
|
self.id2feat = {get_string_id(name): name for name in features}
|
||||||
self.feat2field = {feature: fields[feature.split('_', 1)[0]] for feature in features}
|
self.field2feats = {"POS": []}
|
||||||
self.field2feats = {}
|
|
||||||
self.col2info = []
|
self.col2info = []
|
||||||
self.attr2field = dict(LOWER_FIELDS.items())
|
self.attr2field = dict(LOWER_FIELDS.items())
|
||||||
|
self.feat2offset = {}
|
||||||
|
self.field2col = {}
|
||||||
|
self.field2id = dict(FIELDS.items())
|
||||||
|
self.fieldid2field = {field_id: field for field, field_id in FIELDS.items()}
|
||||||
for feature in features:
|
for feature in features:
|
||||||
field = self.feat2field[feature]
|
field = self.fields[self.feat2field[feature]]
|
||||||
if field not in self.field2feats:
|
if field not in self.field2col:
|
||||||
self.col2info.append((field, 0, 'NIL'))
|
self.field2col[field] = len(self.col2info)
|
||||||
self.field2feats.setdefault(field, []).append(feature)
|
if field != "POS" and field not in self.field2feats:
|
||||||
self.col2info.append((field, len(self.field2feats[field]), feature))
|
self.col2info.append((field, 0, "NIL"))
|
||||||
|
self.field2feats.setdefault(field, ["NIL"])
|
||||||
|
offset = len(self.field2feats[field])
|
||||||
|
self.field2feats[field].append(feature)
|
||||||
|
self.col2info.append((field, offset, feature))
|
||||||
|
self.feat2offset[feature] = offset
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def field_sizes(self):
|
def field_sizes(self):
|
||||||
|
@ -147,7 +163,7 @@ cdef class Morphology:
|
||||||
self.lemmatizer = lemmatizer
|
self.lemmatizer = lemmatizer
|
||||||
self.n_tags = len(tag_map)
|
self.n_tags = len(tag_map)
|
||||||
self.reverse_index = {}
|
self.reverse_index = {}
|
||||||
self._feat_map = MorphologyClassMap(FEATURES, FIELDS)
|
self._feat_map = MorphologyClassMap(FEATURES)
|
||||||
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
|
||||||
attrs = _normalize_props(attrs)
|
attrs = _normalize_props(attrs)
|
||||||
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
self.add({self._feat_map.id2feat[feat] for feat in attrs
|
||||||
|
@ -326,7 +342,7 @@ cdef class Morphology:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_class_map(cls):
|
def create_class_map(cls):
|
||||||
return MorphologyClassMap(FEATURES, FIELDS)
|
return MorphologyClassMap(FEATURES)
|
||||||
|
|
||||||
|
|
||||||
cpdef univ_pos_t get_int_tag(pos_):
|
cpdef univ_pos_t get_int_tag(pos_):
|
||||||
|
@ -770,8 +786,8 @@ FIELDS = {
|
||||||
'Tense': Field_Tense,
|
'Tense': Field_Tense,
|
||||||
'Typo': Field_Typo,
|
'Typo': Field_Typo,
|
||||||
'VerbForm': Field_VerbForm,
|
'VerbForm': Field_VerbForm,
|
||||||
|
'VerbType': Field_VerbType,
|
||||||
'Voice': Field_Voice,
|
'Voice': Field_Voice,
|
||||||
'VerbType': Field_VerbType
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LOWER_FIELDS = {
|
LOWER_FIELDS = {
|
||||||
|
@ -803,8 +819,8 @@ LOWER_FIELDS = {
|
||||||
'part_form': Field_PartForm,
|
'part_form': Field_PartForm,
|
||||||
'part_type': Field_PartType,
|
'part_type': Field_PartType,
|
||||||
'person': Field_Person,
|
'person': Field_Person,
|
||||||
'polite': Field_Polite,
|
|
||||||
'polarity': Field_Polarity,
|
'polarity': Field_Polarity,
|
||||||
|
'polite': Field_Polite,
|
||||||
'poss': Field_Poss,
|
'poss': Field_Poss,
|
||||||
'prefix': Field_Prefix,
|
'prefix': Field_Prefix,
|
||||||
'prep_case': Field_PrepCase,
|
'prep_case': Field_PrepCase,
|
||||||
|
@ -817,8 +833,8 @@ LOWER_FIELDS = {
|
||||||
'tense': Field_Tense,
|
'tense': Field_Tense,
|
||||||
'typo': Field_Typo,
|
'typo': Field_Typo,
|
||||||
'verb_form': Field_VerbForm,
|
'verb_form': Field_VerbForm,
|
||||||
|
'verb_type': Field_VerbType,
|
||||||
'voice': Field_Voice,
|
'voice': Field_Voice,
|
||||||
'verb_type': Field_VerbType
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -849,7 +865,7 @@ FEATURES = [
|
||||||
"AdpType_prep",
|
"AdpType_prep",
|
||||||
"AdpType_post",
|
"AdpType_post",
|
||||||
"AdpType_voc",
|
"AdpType_voc",
|
||||||
"AdvType_adadj,"
|
"AdvType_adadj",
|
||||||
"AdvType_cau",
|
"AdvType_cau",
|
||||||
"AdvType_deg",
|
"AdvType_deg",
|
||||||
"AdvType_ex",
|
"AdvType_ex",
|
||||||
|
|
|
@ -86,20 +86,15 @@ class Morphologizer(Pipe):
|
||||||
if doc_guesses[j, k] == 0:
|
if doc_guesses[j, k] == 0:
|
||||||
doc_feat_ids[j, k] = 0
|
doc_feat_ids[j, k] = 0
|
||||||
else:
|
else:
|
||||||
doc_feat_ids[j, k] = offset + (doc_guesses[j, k]-1)
|
doc_feat_ids[j, k] = offset + doc_guesses[j, k]
|
||||||
# Get the set of feature names.
|
# Get the set of feature names.
|
||||||
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]
|
feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
|
||||||
if f != 0}
|
if "NIL" in feats:
|
||||||
|
feats.remove("NIL")
|
||||||
# Now add the analysis, and set the hash.
|
# Now add the analysis, and set the hash.
|
||||||
try:
|
|
||||||
doc.c[j].morph = self.vocab.morphology.add(feats)
|
doc.c[j].morph = self.vocab.morphology.add(feats)
|
||||||
if doc[j].morph.pos != 0:
|
if doc[j].morph.pos != 0:
|
||||||
doc.c[j].pos = doc[j].morph.pos
|
doc.c[j].pos = doc[j].morph.pos
|
||||||
except:
|
|
||||||
print(offsets)
|
|
||||||
print(doc_guesses[j])
|
|
||||||
print(doc_feat_ids[j])
|
|
||||||
raise
|
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
|
@ -126,23 +121,25 @@ class Morphologizer(Pipe):
|
||||||
# Do this on CPU, as we can't vectorize easily.
|
# Do this on CPU, as we can't vectorize easily.
|
||||||
target = numpy.zeros(scores.shape, dtype='f')
|
target = numpy.zeros(scores.shape, dtype='f')
|
||||||
field_sizes = self.model.softmax.out_sizes
|
field_sizes = self.model.softmax.out_sizes
|
||||||
for gold in golds:
|
for doc, gold in zip(docs, golds):
|
||||||
for features in gold.morphology:
|
for t, features in enumerate(gold.morphology):
|
||||||
if features is None:
|
if features is None:
|
||||||
target[idx] = scores[idx]
|
target[idx] = scores[idx]
|
||||||
else:
|
else:
|
||||||
gold_fields = {}
|
gold_fields = {}
|
||||||
for feature in features:
|
for feature in features:
|
||||||
field = self.get_field(feature)
|
field = self._class_map.feat2field[feature]
|
||||||
column = self.get_column(feature)
|
gold_fields[field] = self._class_map.feat2offset[feature]
|
||||||
gold_fields[field] = column
|
for field in self._class_map.fields:
|
||||||
col_offset = 0
|
field_id = self._class_map.field2id[field]
|
||||||
for field, field_size in enumerate(field_sizes):
|
col_offset = self._class_map.field2col[field]
|
||||||
if field in gold_fields:
|
if field_id in gold_fields:
|
||||||
target[idx, col_offset + gold_fields[field]] = 1.
|
target[idx, col_offset + gold_fields[field_id]] = 1.
|
||||||
else:
|
else:
|
||||||
target[idx, col_offset] = 1.
|
target[idx, col_offset] = 1.
|
||||||
col_offset += field_size
|
#print(doc[t])
|
||||||
|
#for col, info in enumerate(self._class_map.col2info):
|
||||||
|
# print(col, info, scores[idx, col], target[idx, col])
|
||||||
idx += 1
|
idx += 1
|
||||||
target = self.model.ops.asarray(target, dtype='f')
|
target = self.model.ops.asarray(target, dtype='f')
|
||||||
scores = self.model.ops.asarray(scores, dtype='f')
|
scores = self.model.ops.asarray(scores, dtype='f')
|
||||||
|
|
|
@ -418,6 +418,8 @@ class Tagger(Pipe):
|
||||||
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
||||||
if lemma != 0 and lemma != doc.c[j].lex.orth:
|
if lemma != 0 and lemma != doc.c[j].lex.orth:
|
||||||
doc.c[j].lemma = lemma
|
doc.c[j].lemma = lemma
|
||||||
|
else:
|
||||||
|
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
|
||||||
idx += 1
|
idx += 1
|
||||||
if tensors is not None and len(tensors):
|
if tensors is not None and len(tensors):
|
||||||
if isinstance(doc.tensor, numpy.ndarray) \
|
if isinstance(doc.tensor, numpy.ndarray) \
|
||||||
|
|
Loading…
Reference in New Issue
Block a user