From 39f390dba7e4a5e7ac224b27731e8c463cb92f7d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 25 Sep 2017 16:20:49 +0200 Subject: [PATCH 1/6] Add docstrings for Pipe API --- spacy/pipeline.pyx | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index dcc06cdf7..fef925d85 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -88,17 +88,30 @@ class BaseThincComponent(object): @classmethod def Model(cls, *shape, **kwargs): + '''Initialize a model for the pipe.''' raise NotImplementedError def __init__(self, vocab, model=True, **cfg): + '''Create a new pipe instance.''' raise NotImplementedError def __call__(self, doc): + '''Apply the pipe to one document. The document is + modified in-place, and returned. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + ''' scores = self.predict([doc]) self.set_annotations([doc], scores) return doc def pipe(self, stream, batch_size=128, n_threads=-1): + '''Apply the pipe to a stream of documents. + + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + ''' for docs in cytoolz.partition_all(batch_size, stream): docs = list(docs) scores = self.predict(docs) @@ -106,27 +119,42 @@ class BaseThincComponent(object): yield from docs def predict(self, docs): + '''Apply the pipeline's model to a batch of docs, without + modifying them. + ''' raise NotImplementedError def set_annotations(self, docs, scores): + '''Modify a batch of documents, using pre-computed scores.''' raise NotImplementedError - def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): + '''Learn from a batch of documents and gold-standard information, + updating the pipe's model. + + Delegates to predict() and get_loss(). 
+ ''' raise NotImplementedError def get_loss(self, docs, golds, scores): + '''Find the loss and gradient of loss for the batch of + documents and their predicted scores.''' raise NotImplementedError def begin_training(self, gold_tuples=tuple(), pipeline=None): - token_vector_width = pipeline[0].model.nO + '''Initialize the pipe for training, using data exampes if available. + If no model has been initialized yet, the model is added.''' if self.model is True: - self.model = self.Model(1, token_vector_width) + self.model = self.Model(**self.cfg) def use_params(self, params): + '''Modify the pipe's model, to use the given parameter values. + ''' with self.model.use_params(params): yield def to_bytes(self, **exclude): + '''Serialize the pipe to a bytestring.''' serialize = OrderedDict(( ('cfg', lambda: json_dumps(self.cfg)), ('model', lambda: self.model.to_bytes()), @@ -135,6 +163,7 @@ class BaseThincComponent(object): return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): + '''Load the pipe from a bytestring.''' def load_model(b): if self.model is True: self.cfg['pretrained_dims'] = self.vocab.vectors_length @@ -150,6 +179,7 @@ class BaseThincComponent(object): return self def to_disk(self, path, **exclude): + '''Serialize the pipe to disk.''' serialize = OrderedDict(( ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), ('model', lambda p: p.open('wb').write(self.model.to_bytes())), @@ -158,6 +188,7 @@ class BaseThincComponent(object): util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): + '''Load the pipe from disk.''' def load_model(p): if self.model is True: self.cfg['pretrained_dims'] = self.vocab.vectors_length From d2d35b63b770c74f80e560fbb2efc5491064608c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 25 Sep 2017 18:37:13 +0200 Subject: [PATCH 2/6] Fix formatting --- spacy/pipeline.pyx | 50 +++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git 
a/spacy/pipeline.pyx b/spacy/pipeline.pyx index f660f88a6..90ff1ad88 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -48,7 +48,7 @@ from .parts_of_speech import X class SentenceSegmenter(object): - '''A simple spaCy hook, to allow custom sentence boundary detection logic + """A simple spaCy hook, to allow custom sentence boundary detection logic (that doesn't require the dependency parse). To change the sentence boundary detection strategy, pass a generator @@ -57,7 +57,7 @@ class SentenceSegmenter(object): Sentence detection strategies should be generators that take `Doc` objects and yield `Span` objects for each sentence. - ''' + """ name = 'sbd' def __init__(self, vocab, strategy=None): @@ -89,30 +89,30 @@ class BaseThincComponent(object): @classmethod def Model(cls, *shape, **kwargs): - '''Initialize a model for the pipe.''' + """Initialize a model for the pipe.""" raise NotImplementedError def __init__(self, vocab, model=True, **cfg): - '''Create a new pipe instance.''' + """Create a new pipe instance.""" raise NotImplementedError def __call__(self, doc): - '''Apply the pipe to one document. The document is + """Apply the pipe to one document. The document is modified in-place, and returned. - + Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. - ''' + """ scores = self.predict([doc]) self.set_annotations([doc], scores) return doc def pipe(self, stream, batch_size=128, n_threads=-1): - '''Apply the pipe to a stream of documents. + """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. - ''' + """ for docs in cytoolz.partition_all(batch_size, stream): docs = list(docs) scores = self.predict(docs) @@ -120,43 +120,43 @@ class BaseThincComponent(object): yield from docs def predict(self, docs): - '''Apply the pipeline's model to a batch of docs, without + """Apply the pipeline's model to a batch of docs, without modifying them. 
- ''' + """ raise NotImplementedError def set_annotations(self, docs, scores): - '''Modify a batch of documents, using pre-computed scores.''' + """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError def update(self, docs, golds, drop=0., sgd=None, losses=None): - '''Learn from a batch of documents and gold-standard information, + """Learn from a batch of documents and gold-standard information, updating the pipe's model. Delegates to predict() and get_loss(). - ''' + """ raise NotImplementedError def get_loss(self, docs, golds, scores): - '''Find the loss and gradient of loss for the batch of - documents and their predicted scores.''' + """Find the loss and gradient of loss for the batch of + documents and their predicted scores.""" raise NotImplementedError def begin_training(self, gold_tuples=tuple(), pipeline=None): - '''Initialize the pipe for training, using data exampes if available. - If no model has been initialized yet, the model is added.''' + """Initialize the pipe for training, using data examples if available. + If no model has been initialized yet, the model is added.""" if self.model is True: self.model = self.Model(**self.cfg) link_vectors_to_models(self.vocab) def use_params(self, params): - '''Modify the pipe's model, to use the given parameter values. 
+ """ with self.model.use_params(params): yield def to_bytes(self, **exclude): - '''Serialize the pipe to a bytestring.''' + """Serialize the pipe to a bytestring.""" serialize = OrderedDict(( ('cfg', lambda: json_dumps(self.cfg)), ('model', lambda: self.model.to_bytes()), @@ -165,7 +165,7 @@ class BaseThincComponent(object): return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): - '''Load the pipe from a bytestring.''' + """Load the pipe from a bytestring.""" def load_model(b): if self.model is True: self.cfg['pretrained_dims'] = self.vocab.vectors_length @@ -181,7 +181,7 @@ class BaseThincComponent(object): return self def to_disk(self, path, **exclude): - '''Serialize the pipe to disk.''' + """Serialize the pipe to disk.""" serialize = OrderedDict(( ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))), ('vocab', lambda p: self.vocab.to_disk(p)), @@ -190,7 +190,7 @@ class BaseThincComponent(object): util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): - '''Load the pipe from disk.''' + """Load the pipe from disk.""" def load_model(p): if self.model is True: self.cfg['pretrained_dims'] = self.vocab.vectors_length @@ -596,7 +596,7 @@ class SimilarityHook(BaseThincComponent): return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) def __call__(self, doc): - '''Install similarity hook''' + """Install similarity hook""" doc.user_hooks['similarity'] = self.predict return doc From edf7e4881debb5244e445225a98117f9263d9b0d Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 25 Sep 2017 19:00:47 +0200 Subject: [PATCH 3/6] Add meta.json option to cli.train and add relevant properties Add accuracy scores to meta.json instead of accuracy.json and replace all relevant properties like lang, pipeline, spacy_version in existing meta.json. If not present, also add name and version placeholders to make it packagable. 
--- spacy/cli/train.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 96233406d..d71523a9c 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -18,6 +18,7 @@ from ..gold import GoldParse, merge_sents from ..gold import GoldCorpus, minibatch from ..util import prints from .. import util +from .. import about from .. import displacy from ..compat import json_dumps @@ -35,10 +36,11 @@ from ..compat import json_dumps no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool), + meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path) ) def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False, - gold_preproc=False): + gold_preproc=False, meta_path=None): """ Train a model. Expects data in spaCy's JSON format. 
""" @@ -47,13 +49,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) dev_path = util.ensure_path(dev_data) + meta_path = util.ensure_path(meta_path) if not output_path.exists(): output_path.mkdir() if not train_path.exists(): prints(train_path, title="Training data not found", exits=1) if dev_path and not dev_path.exists(): prints(dev_path, title="Development data not found", exits=1) - + if meta_path is not None and not meta_path.exists(): + prints(meta_path, title="meta.json not found", exits=1) + meta = util.read_json(meta_path) if meta_path else {} + if not isinstance(meta, dict): + prints("Expected dict but got: {}".format(type(meta)), + title="Not a valid meta.json format", exits=1) pipeline = ['token_vectors', 'tags', 'dependencies', 'entities'] if no_tagger and 'tags' in pipeline: pipeline.remove('tags') @@ -105,9 +113,16 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, corpus.dev_docs( nlp, gold_preproc=gold_preproc)) - acc_loc =(output_path / ('model%d' % i) / 'accuracy.json') - with acc_loc.open('w') as file_: - file_.write(json_dumps(scorer.scores)) + meta_loc = output_path / ('model%d' % i) / 'meta.json' + meta['accuracy'] = scorer.scores + meta['lang'] = nlp.lang + meta['pipeline'] = pipeline + meta['spacy_version'] = '>=%s' % about.__version__ + meta.setdefault('name', 'model%d' % i) + meta.setdefault('version', '0.0.0') + + with meta_loc.open('w') as file_: + file_.write(json_dumps(meta)) util.set_env_log(True) print_progress(i, losses, scorer.scores) finally: From 02c65155ababfc16d67714fefbe58723cb595f5f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 Sep 2017 12:50:31 +0200 Subject: [PATCH 4/6] Try to fix crazy travis error --- travis.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/travis.sh b/travis.sh index eed6a96f2..78599665c 100755 --- a/travis.sh +++ b/travis.sh @@ -17,6 +17,7 
@@ fi if [ "${VIA}" == "compile" ]; then pip install -r requirements.txt + python setup.py clean --all python setup.py build_ext --inplace pip install -e . fi From ddee15cee957956394677b0dbcfa8ec48a76e6e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 Sep 2017 14:00:25 +0200 Subject: [PATCH 5/6] Try to fix travis --- travis.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/travis.sh b/travis.sh index 78599665c..d1a1b6b29 100755 --- a/travis.sh +++ b/travis.sh @@ -18,7 +18,6 @@ fi if [ "${VIA}" == "compile" ]; then pip install -r requirements.txt python setup.py clean --all - python setup.py build_ext --inplace pip install -e . fi From 8c390e23a2ca8c0b24dece18d2faafcbe8066778 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 Sep 2017 14:14:46 +0200 Subject: [PATCH 6/6] Require older Cython --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6298b1982..5d515f7a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -cython>=0.24 +cython>=0.24,<0.27.0 pathlib numpy>=1.7 cymem>=1.30,<1.32