From e49cd7aeaf81ed12490d82b8a65ca93088ec916e Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 3 Oct 2017 15:22:19 +0200 Subject: [PATCH 01/15] Move import into load to avoid circular imports --- spacy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1cb7c0cbd..9acc566ad 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain -from .deprecated import resolve_load_name from .about import __version__ from . import util def load(name, **overrides): + from .deprecated import resolve_load_name name = resolve_load_name(name, **overrides) return util.load_model(name, **overrides) From 02586a52431865a165439098bff8482cae96397a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 09:14:34 -0500 Subject: [PATCH 02/15] Add timing to spacy evaluate command --- spacy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1cb7c0cbd..58a2f10a6 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name -from .about import __version__ +#from .about import __version__ from . 
import util From 96da86b3e5d3a515f0f8db57ef1704750233ff38 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 09:14:57 -0500 Subject: [PATCH 03/15] Add support for verbose flag to Language --- spacy/language.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 701b5c140..c49c64b1d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -388,7 +388,7 @@ class Language(object): self._optimizer.device = device return self._optimizer - def evaluate(self, docs_golds): + def evaluate(self, docs_golds, verbose=False): scorer = Scorer() docs, golds = zip(*docs_golds) docs = list(docs) @@ -401,7 +401,9 @@ class Language(object): docs = list(pipe.pipe(docs)) assert len(docs) == len(golds) for doc, gold in zip(docs, golds): - scorer.score(doc, gold) + if verbose: + print(doc) + scorer.score(doc, gold, verbose=verbose) return scorer @contextmanager From a44c4c3a5b91dcf85681df57865942a888485a65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 09:15:35 -0500 Subject: [PATCH 04/15] Add timer to evaluate --- spacy/cli/evaluate.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 209660529..f409821b1 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -32,18 +32,25 @@ numpy.random.seed(0) model=("Model name or path", "positional", None, str), data_path=("Location of JSON-formatted evaluation data", "positional", None, str), gold_preproc=("Use gold preprocessing", "flag", "G", bool), + gpu_id=("Use GPU", "option", "g", int), ) -def evaluate(cmd, model, data_path, gold_preproc=False): +def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False): """ Train a model. Expects data in spaCy's JSON format. 
""" + util.use_gpu(gpu_id) util.set_env_log(True) data_path = util.ensure_path(data_path) if not data_path.exists(): prints(data_path, title="Evaluation data not found", exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) - scorer = nlp.evaluate(list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))) + dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) + begin = timer() + scorer = nlp.evaluate(dev_docs, verbose=False) + end = timer() + nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) + print('Time', end-begin, 'words', nwords, 'w.p.s', nwords/(end-begin)) print_results(scorer) From 338e1fda0effda0b749926c38dec6f19a2dd6b6f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 09:41:05 -0500 Subject: [PATCH 05/15] Unbreak merge artefact --- spacy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 25af17361..ba2479106 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain -<<<<<<< HEAD from .deprecated import resolve_load_name #from .about import __version__ from .about import __version__ From e514d6aa0a0fe82a2ebc9cf4d867532769dcb26a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 18:39:57 +0200 Subject: [PATCH 06/15] Import thinc modules more explicitly, to avoid cycles --- spacy/_ml.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 62fc7543f..77d6e0615 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,10 +1,12 @@ import ujson from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural import Model, Maxout, Softmax, Affine +from thinc.neural._classes.model import Model +from thinc.neural._classes.maxout import Maxout +from thinc.neural._classes.softmax import Softmax +from thinc.neural._classes.affine import 
Affine from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module -import thinc.extra.load_nlp import random import cytoolz @@ -13,7 +15,7 @@ from thinc.neural._classes.static_vectors import StaticVectors from thinc.neural._classes.batchnorm import BatchNorm as BN from thinc.neural._classes.layernorm import LayerNorm as LN from thinc.neural._classes.resnet import Residual -from thinc.neural import ReLu +from thinc.neural._classes.relu import ReLu from thinc.neural._classes.selu import SELU from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient @@ -23,6 +25,7 @@ from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel from thinc.api import uniqued, wrap, flatten_add_lengths, noop +import thinc.extra.load_nlp from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER From cbb1fbef80a15fe2f8415cd698d9f8b78c48ef04 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 18:38:55 +0200 Subject: [PATCH 07/15] Update train_ner_standalone example --- examples/training/train_ner_standalone.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/examples/training/train_ner_standalone.py b/examples/training/train_ner_standalone.py index 6cca56c69..e4fb1d1e8 100644 --- a/examples/training/train_ner_standalone.py +++ b/examples/training/train_ner_standalone.py @@ -20,9 +20,10 @@ import plac from pathlib import Path import random import json +import tqdm + from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps -import tqdm from spacy.vocab import Vocab from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer @@ -35,6 +36,7 @@ from spacy.gold import minibatch from spacy.scorer import Scorer import spacy.util + try: unicode 
except NameError: @@ -55,20 +57,17 @@ def init_vocab(): class Pipeline(object): - def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None): + def __init__(self, vocab=None, tokenizer=None, entity=None): if vocab is None: vocab = init_vocab() if tokenizer is None: tokenizer = Tokenizer(vocab, {}, None, None, None) - if tensorizer is None: - tensorizer = TokenVectorEncoder(vocab) if entity is None: entity = NeuralEntityRecognizer(vocab) self.vocab = vocab self.tokenizer = tokenizer - self.tensorizer = tensorizer self.entity = entity - self.pipeline = [tensorizer, self.entity] + self.pipeline = [self.entity] def begin_training(self): for model in self.pipeline: @@ -102,10 +101,8 @@ class Pipeline(object): golds = [self.make_gold(input_, annot) for input_, annot in zip(inputs, annots)] - tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop) - d_tensors = self.entity.update((docs, tensors), golds, drop=drop, - sgd=sgd, losses=losses) - bp_tensors(d_tensors, sgd=sgd) + self.entity.update(docs, golds, drop=drop, + sgd=sgd, losses=losses) return losses def evaluate(self, examples): @@ -123,7 +120,6 @@ class Pipeline(object): elif not path.is_dir(): raise IOError("Can't save pipeline to %s\nNot a directory" % path) self.vocab.to_disk(path / 'vocab') - self.tensorizer.to_disk(path / 'tensorizer') self.entity.to_disk(path / 'ner') def from_disk(self, path): @@ -133,7 +129,6 @@ class Pipeline(object): if not path.is_dir(): raise IOError("Cannot load pipeline from %s\nNot a directory" % path) self.vocab = self.vocab.from_disk(path / 'vocab') - self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer') self.entity = self.entity.from_disk(path / 'ner') From 4a59f6358cfb1926f363b5c094ce5d9d47608928 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 19:21:26 +0200 Subject: [PATCH 08/15] Fix thinc imports --- spacy/pipeline.pyx | 6 ++++-- spacy/syntax/nn_parser.pyx | 5 ++++- spacy/tests/test_misc.py | 3 ++- 3 files changed, 
10 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 1a12107b7..f6ee257d8 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -4,7 +4,6 @@ from __future__ import unicode_literals from thinc.api import chain, layerize, with_getitem -from thinc.neural import Model, Softmax import numpy cimport numpy as np import cytoolz @@ -14,7 +13,10 @@ import ujson import msgpack from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural import Model, Maxout, Softmax, Affine +from thinc.neural._classes.model import Model +from thinc.neural._classes.maxout import Maxout +from thinc.neural._classes.softmax import Softmax +from thinc.neural._classes.affine import Affine from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.util import to_categorical diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 1efdc4474..4043d6dd3 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -38,7 +38,10 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone, with_flatten -from thinc.neural import Model, Affine, ReLu, Maxout +from thinc.neural._classes.model import Model +from thinc.neural._classes.affine import Affine +from thinc.neural._classes.relu import ReLu +from thinc.neural._classes.maxout import Maxout from thinc.neural._classes.batchnorm import BatchNorm as BN from thinc.neural._classes.selu import SELU from thinc.neural._classes.layernorm import LayerNorm diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 80b859c70..762ea4c08 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -9,7 +9,8 @@ from .util import get_doc from pathlib import Path import pytest -from thinc.neural import Maxout, Softmax +from thinc.neural._classes.maxout import Maxout +from thinc.neural._classes.softmax import Softmax from thinc.api import chain From 
5454b20cd7dbb41da577578e55274e556db00a4c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 20:07:17 +0200 Subject: [PATCH 09/15] Update thinc imports for 6.9 --- spacy/_ml.py | 31 +++++++++++++------------------ spacy/pipeline.pyx | 18 ++++++++---------- spacy/syntax/nn_parser.pyx | 10 +++------- 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 77d6e0615..47f5c545e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -1,33 +1,28 @@ import ujson +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +from thinc.i2v import HashEmbed, StaticVectors +from thinc.t2t import ExtractWindow, ParametricAttention +from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool +from thinc.misc import Residual +from thinc.misc import BatchNorm as BN +from thinc.misc import LayerNorm as LN + from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural._classes.model import Model -from thinc.neural._classes.maxout import Maxout -from thinc.neural._classes.softmax import Softmax -from thinc.neural._classes.affine import Affine -from thinc.neural._classes.hash_embed import HashEmbed +from thinc.api import FeatureExtracter, with_getitem +from thinc.api import uniqued, wrap, flatten_add_lengths, noop + +from thinc.linear.linear import LinearModel from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module + import random import cytoolz -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.static_vectors import StaticVectors -from thinc.neural._classes.batchnorm import BatchNorm as BN -from thinc.neural._classes.layernorm import LayerNorm as LN -from thinc.neural._classes.resnet import Residual -from thinc.neural._classes.relu import ReLu -from thinc.neural._classes.selu import SELU from thinc import describe from thinc.describe import Dimension, Synapses, Biases, Gradient from thinc.neural._classes.affine 
import _set_dimensions_if_needed -from thinc.api import FeatureExtracter, with_getitem -from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool -from thinc.neural._classes.attention import ParametricAttention -from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap, flatten_add_lengths, noop import thinc.extra.load_nlp - from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP, CLUSTER from .tokens.doc import Doc from . import util diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index f6ee257d8..8d935335c 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -13,20 +13,18 @@ import ujson import msgpack from thinc.api import add, layerize, chain, clone, concatenate, with_flatten -from thinc.neural._classes.model import Model -from thinc.neural._classes.maxout import Maxout -from thinc.neural._classes.softmax import Softmax -from thinc.neural._classes.affine import Affine -from thinc.neural._classes.hash_embed import HashEmbed +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +from thinc.i2v import HashEmbed +from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool +from thinc.t2t import ExtractWindow, ParametricAttention +from thinc.misc import Residual +from thinc.misc import BatchNorm as BN +from thinc.misc import LayerNorm as LN + from thinc.neural.util import to_categorical -from thinc.neural.pooling import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.resnet import Residual -from thinc.neural._classes.batchnorm import BatchNorm as BN - from .tokens.doc cimport Doc from .syntax.parser cimport Parser as LinearParser from .syntax.nn_parser cimport Parser as NeuralParser diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 4043d6dd3..459c94463 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx 
@@ -38,13 +38,9 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone, with_flatten -from thinc.neural._classes.model import Model -from thinc.neural._classes.affine import Affine -from thinc.neural._classes.relu import ReLu -from thinc.neural._classes.maxout import Maxout -from thinc.neural._classes.batchnorm import BatchNorm as BN -from thinc.neural._classes.selu import SELU -from thinc.neural._classes.layernorm import LayerNorm +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU +from thinc.misc import LayerNorm + from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module From 5cbefcba1743701a4d895123178a885454cf6c45 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 20:29:58 +0200 Subject: [PATCH 10/15] Set backwards compatibility flag --- spacy/_ml.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index 47f5c545e..3b96a69b5 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -30,6 +30,10 @@ from . import util import numpy import io +# TODO: Unset this once we no longer want to support previous models. 
+import thinc.neural._classes.layernorm +thinc.neural._classes.layernorm.set_compat_six_eight(True) + VECTORS_KEY = 'spacy_pretrained_vectors' @layerize From 252299ca2a518ea2d6e1e04208bce516e9d1ef59 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 21:29:43 +0200 Subject: [PATCH 11/15] Add sdist command --- fabfile.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fabfile.py b/fabfile.py index cfa80ead5..02a2110d9 100644 --- a/fabfile.py +++ b/fabfile.py @@ -32,6 +32,10 @@ def make(): local('pip install -r requirements.txt') local('python setup.py build_ext --inplace') +def sdist(): + with virtualenv(VENV_DIR): + with lcd(path.dirname(__file__)): + local('python setup.py sdist') def clean(): with lcd(path.dirname(__file__)): From c69b0836a0f8a10f9bc56517ffb0d8abe1918b10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 21:31:41 +0200 Subject: [PATCH 12/15] Fix fabfile --- fabfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fabfile.py b/fabfile.py index 02a2110d9..2894fe477 100644 --- a/fabfile.py +++ b/fabfile.py @@ -14,6 +14,7 @@ VENV_DIR = path.join(PWD, ENV) def env(lang='python2.7'): if path.exists(VENV_DIR): local('rm -rf {env}'.format(env=VENV_DIR)) + local('pip install virtualenv') local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) From 2eb0fe4957f4b827e85e41b811f832c1970567d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 21:40:04 +0200 Subject: [PATCH 13/15] Fix setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8943d7a2e..23b4f9581 100755 --- a/setup.py +++ b/setup.py @@ -195,7 +195,7 @@ def setup_package(): 'murmurhash>=0.28,<0.29', 'cymem>=1.30,<1.32', 'preshed>=1.0.0,<2.0.0', - 'thinc>=6.8.2,<6.9.0', + 'thinc>=6.9.0,<6.10.0', 'plac<1.0.0,>=0.9.6', 'six', 'pathlib', From 32b9f3d1a671f2a8aeb6f07cf897ec069cf4e06a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 
22:17:31 +0200 Subject: [PATCH 14/15] Require new thinc --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7fa5d72d3..0b46b38d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pathlib numpy>=1.7 cymem>=1.30,<1.32 preshed>=1.0.0,<2.0.0 -thinc>=6.8.2,<6.9.0 +thinc>=6.9.0,<6.10.0 murmurhash>=0.28,<0.29 plac<1.0.0,>=0.9.6 six From f24c2e3a8af785dd11b8d0a994732174290d688b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 3 Oct 2017 22:47:31 +0200 Subject: [PATCH 15/15] Fix evaluate for non-GPU --- spacy/cli/evaluate.py | 2 +- spacy/util.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index f409821b1..d9be95fae 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -39,7 +39,7 @@ def evaluate(cmd, model, data_path, gpu_id=-1, gold_preproc=False): Train a model. Expects data in spaCy's JSON format. """ util.use_gpu(gpu_id) - util.set_env_log(True) + util.set_env_log(False) data_path = util.ensure_path(data_path) if not data_path.exists(): prints(data_path, title="Evaluation data not found", exits=1) diff --git a/spacy/util.py b/spacy/util.py index 911970831..e1a721a12 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -563,7 +563,10 @@ def minify_html(html): def use_gpu(gpu_id): - import cupy.cuda.device + try: + import cupy.cuda.device + except ImportError: + return None from thinc.neural.ops import CupyOps device = cupy.cuda.device.Device(gpu_id) device.use()