Merge remote-tracking branch 'origin/develop' into feature/parser-history-model

2025-07-25 07:29:45 +03:00 · 2017-10-03 16:56:42 -05:00 · 2017-10-03 16:56:42 -05:00 · 246612cb53
commit 246612cb53
parent dc3c791947 f24c2e3a8a
12 changed files with 65 additions and 49 deletions
--- a/examples/training/train_ner_standalone.py
+++ b/examples/training/train_ner_standalone.py
@ -20,9 +20,10 @@ import plac
 from pathlib import Path
 import random
 import json
+import tqdm
+
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
-import tqdm

 from spacy.vocab import Vocab
 from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
@ -35,6 +36,7 @@ from spacy.gold import minibatch
 from spacy.scorer import Scorer
 import spacy.util

+
 try:
    unicode
 except NameError:
@ -55,20 +57,17 @@ def init_vocab():


 class Pipeline(object):
-    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
+    def __init__(self, vocab=None, tokenizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
-        if tensorizer is None:
-            tensorizer = TokenVectorEncoder(vocab)
        if entity is None:
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
-        self.tensorizer = tensorizer
        self.entity = entity
-        self.pipeline = [tensorizer, self.entity]
+        self.pipeline = [self.entity]

    def begin_training(self):
        for model in self.pipeline:
@ -102,10 +101,8 @@ class Pipeline(object):
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]

-        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
-        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
-                                      sgd=sgd, losses=losses)
-        bp_tensors(d_tensors, sgd=sgd)
+        self.entity.update(docs, golds, drop=drop,
+                           sgd=sgd, losses=losses)
        return losses

    def evaluate(self, examples):
@ -123,7 +120,6 @@ class Pipeline(object):
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        self.vocab.to_disk(path / 'vocab')
-        self.tensorizer.to_disk(path / 'tensorizer')
        self.entity.to_disk(path / 'ner')

    def from_disk(self, path):
@ -133,7 +129,6 @@ class Pipeline(object):
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
-        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
        self.entity = self.entity.from_disk(path / 'ner')


--- a/fabfile.py
+++ b/fabfile.py
@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV)
 def env(lang='python2.7'):
    if path.exists(VENV_DIR):
        local('rm -rf {env}'.format(env=VENV_DIR))
+    local('pip install virtualenv')
    local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR))


@ -32,6 +33,10 @@ def make():
            local('pip install -r requirements.txt')
            local('python setup.py build_ext --inplace')

+def sdist():
+    with virtualenv(VENV_DIR):
+        with lcd(path.dirname(__file__)):
+            local('python setup.py sdist')

 def clean():
    with lcd(path.dirname(__file__)):
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.2,<6.9.0
+thinc>=6.9.0,<6.10.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
--- a/setup.py
+++ b/setup.py
@ -195,7 +195,7 @@ def setup_package():
                'murmurhash>=0.28,<0.29',
                'cymem>=1.30,<1.32',
                'preshed>=1.0.0,<2.0.0',
-                'thinc>=6.8.2,<6.9.0',
+                'thinc>=6.9.0,<6.10.0',
                'plac<1.0.0,>=0.9.6',
                'six',
                'pathlib',
--- a/spacy/init.py
+++ b/spacy/init.py
@ -4,11 +4,13 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
+#from .about import __version__
 from .about import __version__
 from . import util


 def load(name, **overrides):
+    from .deprecated import resolve_load_name
    name = resolve_load_name(name, **overrides)
    return util.load_model(name, **overrides)

--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -1,29 +1,27 @@
 import ujson
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.i2v import HashEmbed, StaticVectors
+from thinc.t2t import ExtractWindow, ParametricAttention
+from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
+from thinc.misc import Residual
+from thinc.misc import BatchNorm as BN
+from thinc.misc import LayerNorm as LN
+
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.neural import Model, Maxout, Softmax, Affine
-from thinc.neural._classes.hash_embed import HashEmbed
+from thinc.api import FeatureExtracter, with_getitem
+from thinc.api import uniqued, wrap, flatten_add_lengths, noop
+
+from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
-import thinc.extra.load_nlp
+
 import random
 import cytoolz

-from thinc.neural._classes.convolution import ExtractWindow
-from thinc.neural._classes.static_vectors import StaticVectors
-from thinc.neural._classes.batchnorm import BatchNorm as BN
-from thinc.neural._classes.layernorm import LayerNorm as LN
-from thinc.neural._classes.resnet import Residual
-from thinc.neural import ReLu
-from thinc.neural._classes.selu import SELU
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
-from thinc.api import FeatureExtracter, with_getitem
-from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool
-from thinc.neural._classes.attention import ParametricAttention
-from thinc.linear.linear import LinearModel
-from thinc.api import uniqued, wrap, flatten_add_lengths, noop
-
+import thinc.extra.load_nlp

 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER
 from .tokens.doc import Doc
@ -32,6 +30,10 @@ from . import util
 import numpy
 import io

+# TODO: Unset this once we no longer want to support previous models.
+import thinc.neural._classes.layernorm
+thinc.neural._classes.layernorm.set_compat_six_eight(True)
+
 VECTORS_KEY = 'spacy_pretrained_vectors'

@layerize
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@ -32,18 +32,25 @@ numpy.random.seed(0)
    model=("Model name or path", "positional", None, str),
    data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    gpu_id=("Use GPU", "option", "g", int),
 )
-def evaluate(cmd, model, data_path, gold_preproc=False):
+def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
-    util.set_env_log(True)
+    util.use_gpu(gpu_id)
+    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    if not data_path.exists():
        prints(data_path, title="Evaluation data not found", exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
-    scorer = nlp.evaluate(list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)))
+    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+    begin = timer()
+    scorer = nlp.evaluate(dev_docs, verbose=False)
+    end = timer()
+    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+    print('Time', end-begin, 'words', nwords, 'w.p.s', nwords/(end-begin))
    print_results(scorer)


--- a/spacy/language.py
+++ b/spacy/language.py
@ -388,7 +388,7 @@ class Language(object):
        self._optimizer.device = device
        return self._optimizer

-    def evaluate(self, docs_golds):
+    def evaluate(self, docs_golds, verbose=False):
        scorer = Scorer()
        docs, golds = zip(*docs_golds)
        docs = list(docs)
@ -401,7 +401,9 @@ class Language(object):
                docs = list(pipe.pipe(docs))
        assert len(docs) == len(golds)
        for doc, gold in zip(docs, golds):
-            scorer.score(doc, gold)
+            if verbose:
+                print(doc)
+            scorer.score(doc, gold, verbose=verbose)
        return scorer

    @contextmanager
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -4,7 +4,6 @@
 from __future__ import unicode_literals

 from thinc.api import chain, layerize, with_getitem
-from thinc.neural import Model, Softmax
 import numpy
 cimport numpy as np
 import cytoolz
@ -14,17 +13,18 @@ import ujson
 import msgpack

 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.neural import Model, Maxout, Softmax, Affine
-from thinc.neural._classes.hash_embed import HashEmbed
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.i2v import HashEmbed
+from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
+from thinc.t2t import ExtractWindow, ParametricAttention
+from thinc.misc import Residual
+from thinc.misc import BatchNorm as BN
+from thinc.misc import LayerNorm as LN
+
 from thinc.neural.util import to_categorical

-from thinc.neural.pooling import Pooling, max_pool, mean_pool
 from thinc.neural._classes.difference import Siamese, CauchySimilarity

-from thinc.neural._classes.convolution import ExtractWindow
-from thinc.neural._classes.resnet import Residual
-from thinc.neural._classes.batchnorm import BatchNorm as BN
-
 from .tokens.doc cimport Doc
 from .syntax.parser cimport Parser as LinearParser
 from .syntax.nn_parser cimport Parser as NeuralParser
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -38,10 +38,9 @@ from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get

 from thinc.api import layerize, chain, noop, clone, with_flatten
-from thinc.neural import Model, Affine, ReLu, Maxout
-from thinc.neural._classes.batchnorm import BatchNorm as BN
-from thinc.neural._classes.selu import SELU
-from thinc.neural._classes.layernorm import LayerNorm
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.misc import LayerNorm
+
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module

--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -9,7 +9,8 @@ from .util import get_doc

 from pathlib import Path
 import pytest
-from thinc.neural import Maxout, Softmax
+from thinc.neural._classes.maxout import Maxout
+from thinc.neural._classes.softmax import Softmax
 from thinc.api import chain


--- a/spacy/util.py
+++ b/spacy/util.py
@ -563,7 +563,10 @@ def minify_html(html):


 def use_gpu(gpu_id):
-    import cupy.cuda.device
+    try:
+        import cupy.cuda.device
+    except ImportError:
+        return None
    from thinc.neural.ops import CupyOps
    device = cupy.cuda.device.Device(gpu_id)
    device.use()