Merge branch 'develop' into feature/dot-underscore

2025-10-25 05:01:02 +03:00 · 2017-10-11 11:57:05 +02:00 · 2017-10-11 11:57:05 +02:00 · e0ff145a8b
commit e0ff145a8b
parent f4ae6763b9 17c467e0ab
11 changed files with 87 additions and 22 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -311,7 +311,7 @@ def link_vectors_to_models(vocab):

 def Tok2Vec(width, embed_size, **kwargs):
    pretrained_dims = kwargs.get('pretrained_dims', 0)
-    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
+    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add,
                                 '*': reapply}):
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -68,6 +68,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
    if not isinstance(meta, dict):
        prints("Expected dict but got: {}".format(type(meta)),
               title="Not a valid meta.json format", exits=1)
+    meta.setdefault('lang', lang)
+    meta.setdefault('name', 'unnamed')

    pipeline = ['tagger', 'parser', 'ner']
    if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger')
@ -89,6 +91,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,

    lang_class = util.get_lang_class(lang)
    nlp = lang_class()
+    meta['pipeline'] = pipeline
+    nlp.meta.update(meta)
    if vectors:
        util.load_model(vectors, vocab=nlp.vocab)
    for name in pipeline:
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -213,7 +213,7 @@ class GoldCorpus(object):
        train_tuples = self.train_tuples
        if projectivize:
            train_tuples = nonproj.preprocess_training_data(
-                               self.train_tuples)
+                               self.train_tuples, label_freq_cutoff=100)
        random.shuffle(train_tuples)
        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
                                        max_length=max_length,
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -24,6 +24,8 @@ class Lemmatizer(object):
            univ_pos = 'adj'
        elif univ_pos == PUNCT:
            univ_pos = 'punct'
+        else:
+            return set([string.lower()])
        # See Issue #435 for example of where this logic is requied.
        if self.is_base_form(univ_pos, morphology):
            return set([string.lower()])
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -35,6 +35,8 @@ cdef class Morphology:
    cdef RichTagC* rich_tags
    cdef PreshMapArray _cache

+    cdef int assign_untagged(self, TokenC* token) except -1
+
    cdef int assign_tag(self, TokenC* token, tag) except -1

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -42,7 +42,7 @@ cdef class Morphology:
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.reverse_index = {}

-        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
+        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            self.tag_map[tag_str] = dict(attrs)
            attrs = _normalize_props(attrs)
@ -52,6 +52,10 @@ cdef class Morphology:
            self.rich_tags[i].morph = 0
            self.rich_tags[i].pos = attrs[POS]
            self.reverse_index[self.rich_tags[i].name] = i
+        # Add a 'null' tag, which we can reference when assigning morphology to
+        # untagged tokens.
+        self.rich_tags[self.n_tags].id = self.n_tags
+
        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
@ -62,6 +66,11 @@ cdef class Morphology:
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                             self.exc), None, None)

+    cdef int assign_untagged(self, TokenC* token) except -1:
+        '''Set morphological attributes on a token without a POS tag.'''
+        if token.lemma == 0:
+            token.lemma = self.lemmatize(0, token.lex.orth, {})
+
    cdef int assign_tag(self, TokenC* token, tag) except -1:
        if isinstance(tag, basestring):
            tag = self.strings.add(tag)
@ -72,7 +81,7 @@ cdef class Morphology:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
-        if tag_id >= self.n_tags:
+        if tag_id > self.n_tags:
            raise ValueError("Unknown tag ID: %s" % tag_id)
        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
        # is that this is where the specific word and the tag interact. Still,
@ -151,8 +160,6 @@ cdef class Morphology:
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
-        if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
-            return self.strings.add(py_string.lower())
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -239,13 +239,13 @@ cdef class Parser:
    """
    @classmethod
    def Model(cls, nr_class, **cfg):
-        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 0))
-        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128))
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 3))
+        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
+        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 64))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
-        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
-        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
+        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 4))
+        hist_width = util.env_opt('history_width', cfg.get('hist_width', 16))
        if hist_size >= 1 and depth == 0:
            raise ValueError("Inconsistent hyper-params: "
                "history_feats >= 1 but parser_hidden_depth==0")
@ -800,6 +800,15 @@ cdef class Parser:
        if self.model not in (True, False, None) and resized:
            # Weights are stored in (nr_out, nr_in) format, so we're basically
            # just adding rows here.
+            if self.model[-1].is_noop:
+                smaller = self.model[1]
+                dims = dict(self.model[1]._dims)
+                dims['nO'] = self.moves.n_moves
+                larger = self.model[1].__class__(**dims)
+                copy_array(larger.W[:, :smaller.nO], smaller.W)
+                copy_array(larger.b[:smaller.nO], smaller.b)
+                self.model = (self.model[0], larger, self.model[2])
+            else:
                smaller = self.model[-1]._layers[-1]
                larger = Affine(self.moves.n_moves, smaller.nI)
                copy_array(larger.W[:smaller.nO], smaller.W)
@ -809,7 +818,7 @@ cdef class Parser:
    def begin_training(self, gold_tuples, pipeline=None, **cfg):
        if 'model' in cfg:
            self.model = cfg['model']
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples)
+        gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
        actions = self.moves.get_actions(gold_parses=gold_tuples)
        for action, labels in actions.items():
            for label in labels:
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@ -0,0 +1,37 @@
+'''Test Doc sets up tokens correctly.'''
+from __future__ import unicode_literals
+import pytest
+
+from ...vocab import Vocab
+from ...tokens.doc import Doc
+from ...lemmatizerlookup import Lemmatizer
+
+
+@pytest.fixture
+def lemmatizer():
+    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+
+
+@pytest.fixture
+def vocab(lemmatizer):
+    return Vocab(lemmatizer=lemmatizer)
+
+
+def test_empty_doc(vocab):
+    doc = Doc(vocab)
+    assert len(doc) == 0
+
+
+def test_single_word(vocab):
+    doc = Doc(vocab, words=['a'])
+    assert doc.text == 'a '
+    doc = Doc(vocab, words=['a'], spaces=[False])
+    assert doc.text == 'a'
+
+
+def test_lookup_lemmatization(vocab):
+    doc = Doc(vocab, words=['dogs', 'dogses'])
+    assert doc[0].text == 'dogs'
+    assert doc[0].lemma_ == 'dog'
+    assert doc[1].text == 'dogses'
+    assert doc[1].lemma_ == 'dogses'
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@ -22,14 +22,14 @@ def vocab():
@pytest.fixture
 def parser(vocab):
    parser = NeuralDependencyParser(vocab)
-    parser.cfg['token_vector_width'] = 4
-    parser.cfg['hidden_width'] = 6
+    parser.cfg['token_vector_width'] = 8
+    parser.cfg['hidden_width'] = 30
    parser.cfg['hist_size'] = 0
    parser.add_label('left')
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

-    for i in range(30):
+    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
@ -37,6 +37,8 @@ def parser(vocab):
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser

+def test_init_parser(parser):
+    pass

 def test_add_label(parser):
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@ -64,7 +64,7 @@ def test_sents_1_3(parser):
    doc[1].sent_start = True
    doc[3].sent_start = True
    doc = parser(doc)
-    assert len(list(doc.sents)) == 4
+    assert len(list(doc.sents)) >= 3
    doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
    doc[1].sent_start = True
    doc[2].sent_start = False
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -533,6 +533,8 @@ cdef class Doc:
        assert t.lex.orth != 0
        t.spacy = has_space
        self.length += 1
+        # Set morphological attributes, e.g. by lemma, if possible
+        self.vocab.morphology.assign_untagged(t)
        self._py_tokens.append(None)
        return t.idx + t.lex.length + t.spacy