Mirror of https://github.com/explosion/spaCy.git
Fix parsing and tok2vec models

commit 24ff6b0ad9 (parent e88a42e460)

spacy/_ml.py (17 changed lines)
@@ -229,20 +229,18 @@ def drop_layer(layer, factor=2.):
 def Tok2Vec(width, embed_size, preprocess=None):
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
-        norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower')
-        prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix')
-        suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix')
-        shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape')
+        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name='embed_norm')
+        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX), name='embed_prefix')
+        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX), name='embed_suffix')
+        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE), name='embed_shape')
 
         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3))
         tok2vec = (
             with_flatten(
                 asarray(Model.ops, dtype='uint64')
                 >> uniqued(embed, column=5)
-                >> drop_layer(
-                    Residual(
-                        (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
-                    )
+                >> Residual(
+                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                 ) ** 4, pad=4
             )
         )
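Note on the Tok2Vec hunk above: the newer HashEmbed takes a column argument, so the separate get_col(...) step is dropped, and removing the drop_layer wrapper means the residual convolutional block is always applied rather than being stochastically skipped. As a rough, illustrative sketch of the hashed-embedding idea (plain NumPy; the constants and helper names here are assumptions, not the Thinc implementation):

    import numpy as np

    def hash_embed(ids, table, seeds=(1, 2, 3, 4)):
        # table: (n_vectors, width) parameters; ids: 1-D array of feature IDs.
        # Hash each ID with a few seeds and sum the selected rows -- the rough
        # idea behind HashEmbed, which never needs one row per distinct ID.
        ids = ids.astype('uint64')
        n_vectors = np.uint64(table.shape[0])
        summed = 0
        for seed in seeds:
            rows = (ids * np.uint64(2654435761) + np.uint64(seed)) % n_vectors
            summed = summed + table[rows]
        return summed

    def embed_tokens(features, tables, columns):
        # features: (n_tokens, n_columns) attribute array (ID, NORM, PREFIX, ...).
        # Selecting features[:, col] here plays the role of the new column=
        # argument, replacing the old get_col(col) >> HashEmbed(...) chain.
        return np.hstack([hash_embed(features[:, col], tables[name])
                          for name, col in sorted(columns.items())])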
@@ -372,6 +370,7 @@ def fine_tune(embedding, combine=None):
             "fine_tune currently only supports addition. Set combine=None")
     def fine_tune_fwd(docs_tokvecs, drop=0.):
         docs, tokvecs = docs_tokvecs
+
         lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
 
         vecs, bp_vecs = embedding.begin_update(docs, drop=drop)
@@ -556,7 +555,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
 
     cnn_model = (
         # TODO Make concatenate support lists
         concatenate_lists(trained_vectors, static_vectors)
         >> with_flatten(
             LN(Maxout(width, width*2))
             >> Residual(
@@ -72,8 +72,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                                    util.env_opt('batch_compound', 1.001))
 
     if resume:
-        prints(output_path / 'model19.pickle', title="Resuming training")
-        nlp = dill.load((output_path / 'model19.pickle').open('rb'))
+        prints(output_path / 'model9.pickle', title="Resuming training")
+        nlp = dill.load((output_path / 'model9.pickle').open('rb'))
     else:
         nlp = lang_class(pipeline=pipeline)
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
@@ -87,7 +87,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         if resume:
             i += 20
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-            train_docs = corpus.train_docs(nlp, projectivize=True,
+            train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                            gold_preproc=gold_preproc, max_length=0)
             losses = {}
             for batch in minibatch(train_docs, size=batch_sizes):
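Note: the only change in the hunk above is the explicit noise_level=0.0 passed to corpus.train_docs, which keeps the training text uncorrupted. To illustrate what a noise_level knob of this kind controls (an illustrative sketch only, not spaCy's actual corruption rules):

    import random

    def add_noise(text, noise_level=0.0):
        # With probability noise_level per character, corrupt that character --
        # here by deleting it or replacing it with a space. Real augmentation
        # schemes differ; this only shows the role of the parameter.
        if noise_level <= 0.0:
            return text
        chars = []
        for ch in text:
            if random.random() < noise_level:
                chars.append(random.choice(['', ' ']))
            else:
                chars.append(ch)
        return ''.join(chars)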
@@ -393,8 +393,7 @@ cdef class Parser:
 
         tokvecs = self.model[0].ops.flatten(tokvecses)
         if USE_FINE_TUNE:
-            # TODO: This is incorrect! Unhack when training next model
-            tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
+            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
 
         nr_state = len(docs)
         nr_class = self.moves.n_moves
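Note on this hunk and the update() hunk below: both switch from adding the fine-tuned vectors onto tokvecs (+=) to assigning them (=), because the fine-tuning layer built in _ml.py is now expected to blend the pretrained tok2vec input with its own output. A minimal self-contained sketch of that kind of blending (the mix weights are an assumption for illustration, not the exact spaCy code):

    import numpy as np

    def fine_tune_mix(new_vecs, pretrained_tokvecs, mix=(0.5, 0.5)):
        # new_vecs, pretrained_tokvecs: per-document lists of (n_tokens, width)
        # arrays. The blend happens inside the fine-tune layer, which is why the
        # parser can assign its flattened output instead of adding it on top.
        return [mix[0] * v + mix[1] * t
                for v, t in zip(new_vecs, pretrained_tokvecs)]

    # e.g. two documents of 3 and 2 tokens, width 4:
    mixed = fine_tune_mix([np.ones((3, 4)), np.ones((2, 4))],
                          [np.zeros((3, 4)), np.zeros((2, 4))])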
@@ -533,7 +532,7 @@ cdef class Parser:
             golds = [golds]
         if USE_FINE_TUNE:
             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs += self.model[0].ops.flatten(my_tokvecs)
+            tokvecs = self.model[0].ops.flatten(my_tokvecs)
 
         cuda_stream = get_cuda_stream()
 
@@ -706,7 +705,7 @@ cdef class Parser:
                                      lower, stream, drop=dropout)
         return state2vec, upper
 
-    nr_feature = 13
+    nr_feature = 8
 
     def get_token_ids(self, states):
         cdef StateClass state
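Note: nr_feature is the number of context token positions the parser extracts per state (it sizes the buffer that get_token_ids fills and, with it, the precomputed hidden layer), so this hunk narrows that feature window from 13 tokens to 8. A rough sketch of how such a constant is used, with a stand-in state object rather than the real Cython StateClass:

    import numpy as np

    class DemoState:
        # Stand-in for the parser state; it only exposes candidate token indices.
        def __init__(self, token_ids):
            self.token_ids = token_ids

    def get_token_ids(states, nr_feature=8):
        # One fixed-width row of token indices per state; the real parser picks
        # specific stack/buffer positions rather than the first nr_feature tokens.
        ids = np.zeros((len(states), nr_feature), dtype='int32')
        for i, state in enumerate(states):
            window = state.token_ids[:nr_feature]
            ids[i, :len(window)] = window
        return ids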