From 3065f12ef206d13db3544213266973dcc2b08aa3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 10 Oct 2017 22:57:31 +0200 Subject: [PATCH 01/13] Make add parser label work for hidden_depth=0 --- spacy/syntax/nn_parser.pyx | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a8a1d4334..939414bd3 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -800,11 +800,20 @@ cdef class Parser: if self.model not in (True, False, None) and resized: # Weights are stored in (nr_out, nr_in) format, so we're basically # just adding rows here. - smaller = self.model[-1]._layers[-1] - larger = Affine(self.moves.n_moves, smaller.nI) - copy_array(larger.W[:smaller.nO], smaller.W) - copy_array(larger.b[:smaller.nO], smaller.b) - self.model[-1]._layers[-1] = larger + if self.model[-1].is_noop: + smaller = self.model[1] + dims = dict(self.model[1]._dims) + dims['nO'] = self.moves.n_moves + larger = self.model[1].__class__(**dims) + copy_array(larger.W[:, :smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model = (self.model[0], larger, self.model[2]) + else: + smaller = self.model[-1]._layers[-1] + larger = Affine(self.moves.n_moves, smaller.nI) + copy_array(larger.W[:smaller.nO], smaller.W) + copy_array(larger.b[:smaller.nO], smaller.b) + self.model[-1]._layers[-1] = larger def begin_training(self, gold_tuples, pipeline=None, **cfg): if 'model' in cfg: From d84136b4a9eb716be5771ed5634be6fef4c740ef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 10 Oct 2017 22:57:41 +0200 Subject: [PATCH 02/13] Update add label test --- spacy/tests/parser/test_add_label.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index b89cca113..3fbfc96a6 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py 
@@ -22,14 +22,14 @@ def vocab(): @pytest.fixture def parser(vocab): parser = NeuralDependencyParser(vocab) - parser.cfg['token_vector_width'] = 4 - parser.cfg['hidden_width'] = 6 + parser.cfg['token_vector_width'] = 8 + parser.cfg['hidden_width'] = 30 parser.cfg['hist_size'] = 0 parser.add_label('left') parser.begin_training([], **parser.cfg) sgd = Adam(NumpyOps(), 0.001) - for i in range(30): + for i in range(10): losses = {} doc = Doc(vocab, words=['a', 'b', 'c', 'd']) gold = GoldParse(doc, heads=[1, 1, 3, 3], @@ -37,6 +37,8 @@ def parser(vocab): parser.update([doc], [gold], sgd=sgd, losses=losses) return parser +def test_init_parser(parser): + pass def test_add_label(parser): doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) From 2c118ab3a6b516fae87280dac69cb9c5d7caa5a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:21:23 +0200 Subject: [PATCH 03/13] Add tests for Doc creation --- spacy/tests/doc/test_creation.py | 37 ++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 spacy/tests/doc/test_creation.py diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py new file mode 100644 index 000000000..edadbf086 --- /dev/null +++ b/spacy/tests/doc/test_creation.py @@ -0,0 +1,37 @@ +'''Test Doc sets up tokens correctly.''' +from __future__ import unicode_literals +import pytest + +from ...vocab import Vocab +from ...tokens.doc import Doc +from ...lemmatizerlookup import Lemmatizer + + +@pytest.fixture +def lemmatizer(): + return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) + + +@pytest.fixture +def vocab(lemmatizer): + return Vocab(lemmatizer=lemmatizer) + + +def test_empty_doc(vocab): + doc = Doc(vocab) + assert len(doc) == 0 + + +def test_single_word(vocab): + doc = Doc(vocab, words=['a']) + assert doc.text == 'a ' + doc = Doc(vocab, words=['a'], spaces=[False]) + assert doc.text == 'a' + + +def test_lookup_lemmatization(vocab): + doc = Doc(vocab, words=['dogs', 
'dogses']) + assert doc[0].text == 'dogs' + assert doc[0].lemma_ == 'dog' + assert doc[1].text == 'dogses' + assert doc[1].lemma_ == 'dogses' From d528b6e36dd13d70238b085191f844728d8a7535 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:22:49 +0200 Subject: [PATCH 04/13] Add assign_untagged method in Morphology --- spacy/morphology.pxd | 2 ++ spacy/morphology.pyx | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 922843d6d..be6711bfd 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -35,6 +35,8 @@ cdef class Morphology: cdef RichTagC* rich_tags cdef PreshMapArray _cache + cdef int assign_untagged(self, TokenC* token) except -1 + cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 5ee11c151..5a4399698 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -42,7 +42,7 @@ cdef class Morphology: self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} - self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) + self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) @@ -52,6 +52,10 @@ cdef class Morphology: self.rich_tags[i].morph = 0 self.rich_tags[i].pos = attrs[POS] self.reverse_index[self.rich_tags[i].name] = i + # Add a 'null' tag, which we can reference when assign morphology to + # untagged tokens. 
+ self.rich_tags[self.n_tags].id = self.n_tags + self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: @@ -62,6 +66,10 @@ cdef class Morphology: return (Morphology, (self.strings, self.tag_map, self.lemmatizer, self.exc), None, None) + cdef int assign_untagged(self, TokenC* token) except -1: + '''Set morphological attributes on a token without a POS tag.''' + token.lemma = self.lemmatize(0, token.lex.orth, {}) + cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring): tag = self.strings.add(tag) @@ -72,7 +80,7 @@ cdef class Morphology: token.tag = tag cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id >= self.n_tags: + if tag_id > self.n_tags: raise ValueError("Unknown tag ID: %s" % tag_id) # TODO: It's pretty arbitrary to put this logic here. I guess the justification # is that this is where the specific word and the tag interact. Still, @@ -151,8 +159,6 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - if univ_pos not in (NOUN, VERB, ADJ, PUNCT): - return self.strings.add(py_string.lower()) cdef set lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) From c15d8278cb3c382a7453b1b33c10700a3f4f0766 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:23:23 +0200 Subject: [PATCH 05/13] Avoid lemmatizing inappropriate tags in English lemmatizer --- spacy/lemmatizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 312c8db72..ff7666c37 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -24,6 +24,8 @@ class Lemmatizer(object): univ_pos = 'adj' elif univ_pos == PUNCT: univ_pos = 'punct' + else: + return set([string.lower()]) # See Issue #435 for example of where this logic is requied. 
if self.is_base_form(univ_pos, morphology): return set([string.lower()]) From 3b527fa52bdd6f29131f3bfb7deb32816c2de4f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:23:57 +0200 Subject: [PATCH 06/13] Call morphology.assign_untagged when pushing token to Doc --- spacy/tokens/doc.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index df75ab3ec..400ca0f2a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -512,6 +512,8 @@ cdef class Doc: assert t.lex.orth != 0 t.spacy = has_space self.length += 1 + # Set morphological attributes, e.g. by lemma, if possible + self.vocab.morphology.assign_untagged(t) self._py_tokens.append(None) return t.idx + t.lex.length + t.spacy From fd47f8e89f55703ad1c527124d631ab8543e6213 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 08:38:34 +0200 Subject: [PATCH 07/13] Fix failing test --- spacy/tests/parser/test_preset_sbd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 77326f797..f10b96192 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -64,7 +64,7 @@ def test_sents_1_3(parser): doc[1].sent_start = True doc[3].sent_start = True doc = parser(doc) - assert len(list(doc.sents)) == 4 + assert len(list(doc.sents)) >= 3 doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) doc[1].sent_start = True doc[2].sent_start = False From 74c2c6a58cabdb31b77df3b24f6068355d9738bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 08:49:12 +0200 Subject: [PATCH 08/13] Add default name and lang to meta --- spacy/cli/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 05d035769..a8b45e8fa 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, 
n_iter=10, n_sents=0, if not isinstance(meta, dict): prints("Expected dict but got: {}".format(type(meta)), title="Not a valid meta.json format", exits=1) + meta.setdefault('lang', lang) + meta.setdefault('name', 'unnamed') pipeline = ['tagger', 'parser', 'ner'] if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') From acba2e1051a0734d7d6ae2cc11211096039446bd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 08:55:52 +0200 Subject: [PATCH 09/13] Fix metadata in training --- spacy/cli/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index a8b45e8fa..3dae3f68b 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -91,6 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, lang_class = util.get_lang_class(lang) nlp = lang_class() + meta['pipeline'] = pipeline + nlp.meta.update(meta) if vectors: util.load_model(vectors, vocab=nlp.vocab) for name in pipeline: From 188f62004694d89a040f5409164258a150abc2b1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 09:43:48 +0200 Subject: [PATCH 10/13] Improve parser defaults --- spacy/syntax/nn_parser.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 939414bd3..ce9ee39fa 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -239,13 +239,13 @@ cdef class Parser: """ @classmethod def Model(cls, nr_class, **cfg): - depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0)) - token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128)) - hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) - parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3)) + depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) + token_vector_width = util.env_opt('token_vector_width', 
cfg.get('token_vector_width', 64)) + hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64)) + parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2)) embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000)) - hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) - hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) + hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4)) + hist_width = util.env_opt('history_width', cfg.get('hist_width', 16)) if hist_size >= 1 and depth == 0: raise ValueError("Inconsistent hyper-params: " "history_feats >= 1 but parser_hidden_depth==0") From 76fe24f44d1238e3755c07cd377eddde2b74a913 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 09:44:17 +0200 Subject: [PATCH 11/13] Improve embedding defaults --- spacy/_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 62e0ceb9a..b07e179f0 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -311,7 +311,7 @@ def link_vectors_to_models(vocab): def Tok2Vec(width, embed_size, **kwargs): pretrained_dims = kwargs.get('pretrained_dims', 0) - cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3) + cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2) cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add, '*': reapply}): From 6e552c9d83ed2010e8de2291680bc8527b58fec4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 02:46:44 -0500 Subject: [PATCH 12/13] Prune number of non-projective labels more aggressively --- spacy/gold.pyx | 2 +- spacy/syntax/nn_parser.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 2512c179f..5729af667 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -213,7 +213,7 @@ class GoldCorpus(object): train_tuples = self.train_tuples if projectivize:
train_tuples = nonproj.preprocess_training_data( - self.train_tuples) + self.train_tuples, label_freq_cutoff=100) random.shuffle(train_tuples) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, max_length=max_length, diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index a8a1d4334..9288b523f 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -809,7 +809,7 @@ cdef class Parser: def begin_training(self, gold_tuples, pipeline=None, **cfg): if 'model' in cfg: self.model = cfg['model'] - gold_tuples = nonproj.preprocess_training_data(gold_tuples) + gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100) actions = self.moves.get_actions(gold_parses=gold_tuples) for action, labels in actions.items(): for label in labels: From 17c467e0ab143eb89c45917740b5d32be303f56a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 11 Oct 2017 03:33:06 -0500 Subject: [PATCH 13/13] Avoid clobbering existing lemmas --- spacy/morphology.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 5a4399698..b8dbb83ba 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -55,7 +55,7 @@ cdef class Morphology: # Add a 'null' tag, which we can reference when assign morphology to # untagged tokens. self.rich_tags[self.n_tags].id = self.n_tags - + self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: @@ -68,7 +68,8 @@ cdef class Morphology: cdef int assign_untagged(self, TokenC* token) except -1: '''Set morphological attributes on a token without a POS tag.''' - token.lemma = self.lemmatize(0, token.lex.orth, {}) + if token.lemma == 0: + token.lemma = self.lemmatize(0, token.lex.orth, {}) cdef int assign_tag(self, TokenC* token, tag) except -1: if isinstance(tag, basestring):