Merge branch 'develop' into feature/lemmatizer

This commit is contained in:
ines 2017-10-11 11:56:35 +02:00
commit c1d6d43c83
7 changed files with 34 additions and 19 deletions

View File

@@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):
def Tok2Vec(width, embed_size, **kwargs): def Tok2Vec(width, embed_size, **kwargs):
pretrained_dims = kwargs.get('pretrained_dims', 0) pretrained_dims = kwargs.get('pretrained_dims', 0)
cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
'*': reapply}): '*': reapply}):

View File

@@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
if not isinstance(meta, dict): if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)), prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1) title="Not a valid meta.json format", exits=1)
meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed')
pipeline = ['tagger', 'parser', 'ner'] pipeline = ['tagger', 'parser', 'ner']
if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
@@ -89,6 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
lang_class = util.get_lang_class(lang) lang_class = util.get_lang_class(lang)
nlp = lang_class() nlp = lang_class()
meta['pipeline'] = pipeline
nlp.meta.update(meta)
if vectors: if vectors:
util.load_model(vectors, vocab=nlp.vocab) util.load_model(vectors, vocab=nlp.vocab)
for name in pipeline: for name in pipeline:

View File

@@ -213,7 +213,7 @@ class GoldCorpus(object):
train_tuples = self.train_tuples train_tuples = self.train_tuples
if projectivize: if projectivize:
train_tuples = nonproj.preprocess_training_data( train_tuples = nonproj.preprocess_training_data(
self.train_tuples) self.train_tuples, label_freq_cutoff=100)
random.shuffle(train_tuples) random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length, max_length=max_length,

View File

@@ -239,13 +239,13 @@ cdef class Parser:
""" """
@classmethod @classmethod
def Model(cls, nr_class, **cfg): def Model(cls, nr_class, **cfg):
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0)) depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 64))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3)) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
if hist_size >= 1 and depth == 0: if hist_size >= 1 and depth == 0:
raise ValueError("Inconsistent hyper-params: " raise ValueError("Inconsistent hyper-params: "
"history_feats >= 1 but parser_hidden_depth==0") "history_feats >= 1 but parser_hidden_depth==0")
@@ -800,6 +800,15 @@ cdef class Parser:
if self.model not in (True, False, None) and resized: if self.model not in (True, False, None) and resized:
# Weights are stored in (nr_out, nr_in) format, so we're basically # Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here. # just adding rows here.
if self.model[-1].is_noop:
smaller = self.model[1]
dims = dict(self.model[1]._dims)
dims['nO'] = self.moves.n_moves
larger = self.model[1].__class__(**dims)
copy_array(larger.W[:, :smaller.nO], smaller.W)
copy_array(larger.b[:smaller.nO], smaller.b)
self.model = (self.model[0], larger, self.model[2])
else:
smaller = self.model[-1]._layers[-1] smaller = self.model[-1]._layers[-1]
larger = Affine(self.moves.n_moves, smaller.nI) larger = Affine(self.moves.n_moves, smaller.nI)
copy_array(larger.W[:smaller.nO], smaller.W) copy_array(larger.W[:smaller.nO], smaller.W)
@@ -809,7 +818,7 @@ cdef class Parser:
def begin_training(self, gold_tuples, pipeline=None, **cfg): def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg: if 'model' in cfg:
self.model = cfg['model'] self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples) gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
actions = self.moves.get_actions(gold_parses=gold_tuples) actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items(): for action, labels in actions.items():
for label in labels: for label in labels:

View File

@@ -22,14 +22,14 @@ def vocab():
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
parser = NeuralDependencyParser(vocab) parser = NeuralDependencyParser(vocab)
parser.cfg['token_vector_width'] = 4 parser.cfg['token_vector_width'] = 8
parser.cfg['hidden_width'] = 6 parser.cfg['hidden_width'] = 30
parser.cfg['hist_size'] = 0 parser.cfg['hist_size'] = 0
parser.add_label('left') parser.add_label('left')
parser.begin_training([], **parser.cfg) parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001) sgd = Adam(NumpyOps(), 0.001)
for i in range(30): for i in range(10):
losses = {} losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd']) doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3], gold = GoldParse(doc, heads=[1, 1, 3, 3],
@@ -37,6 +37,8 @@ def parser(vocab):
parser.update([doc], [gold], sgd=sgd, losses=losses) parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser return parser
def test_init_parser(parser):
pass
def test_add_label(parser): def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])

View File

@@ -64,7 +64,7 @@ def test_sents_1_3(parser):
doc[1].sent_start = True doc[1].sent_start = True
doc[3].sent_start = True doc[3].sent_start = True
doc = parser(doc) doc = parser(doc)
assert len(list(doc.sents)) == 4 assert len(list(doc.sents)) >= 3
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc[1].sent_start = True doc[1].sent_start = True
doc[2].sent_start = False doc[2].sent_start = False