Update feature/noshare with recent develop changes

Matthew Honnibal 2017-09-26 08:15:14 -05:00
commit defb68e94f
4 changed files with 60 additions and 11 deletions

requirements.txt

@@ -1,4 +1,4 @@
-cython>=0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32

spacy/cli/train.py

@@ -20,6 +20,7 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps
@@ -40,10 +41,11 @@ numpy.random.seed(0)
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+          gold_preproc=False, meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -52,13 +54,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)

     pipeline = ['tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
@@ -112,6 +120,17 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', '0.0.0')
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
+>>>>>>> origin/develop
             util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:
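
The new meta_path option lets a user-supplied meta.json seed each saved model directory: 'accuracy', 'lang', 'pipeline' and 'spacy_version' are always overwritten at save time, while 'name' and 'version' are only filled in when missing. A minimal sketch of how such a file might be prepared and passed in (the file name, field values and CLI invocation below are illustrative assumptions, not taken from the commit):

import json

# Hypothetical meta.json: 'name' and 'version' survive as-is (the command
# only applies setdefault to them); the other keys are rewritten per epoch.
meta = {
    "name": "my_model",
    "version": "1.0.0",
    "description": "An example description, passed through untouched.",
}
with open("meta.json", "w", encoding="utf8") as f:
    json.dump(meta, f, indent=2)

# Then, assuming the usual spaCy CLI entry point:
#   python -m spacy train en /output train.json dev.json -m meta.json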

spacy/pipeline.py

@@ -48,7 +48,7 @@ from .parts_of_speech import X
 class SentenceSegmenter(object):
-    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse).

     To change the sentence boundary detection strategy, pass a generator
@@ -57,7 +57,7 @@ class SentenceSegmenter(object):
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-    '''
+    """
     name = 'sbd'

     def __init__(self, vocab, strategy=None):
@@ -89,17 +89,30 @@ class BaseThincComponent(object):
     @classmethod
     def Model(cls, *shape, **kwargs):
+        """Initialize a model for the pipe."""
         raise NotImplementedError

     def __init__(self, vocab, model=True, **cfg):
+        """Create a new pipe instance."""
         raise NotImplementedError

     def __call__(self, doc):
+        """Apply the pipe to one document. The document is
+        modified in-place, and returned.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
@@ -107,28 +120,43 @@ class BaseThincComponent(object):
             yield from docs

     def predict(self, docs):
+        """Apply the pipeline's model to a batch of docs, without
+        modifying them.
+        """
         raise NotImplementedError

     def set_annotations(self, docs, scores):
+        """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError

-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model.
+
+        Delegates to predict() and get_loss().
+        """
         raise NotImplementedError

     def get_loss(self, docs, golds, scores):
+        """Find the loss and gradient of loss for the batch of
+        documents and their predicted scores."""
         raise NotImplementedError

     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        token_vector_width = pipeline[0].model.nO
+        """Initialize the pipe for training, using data examples if available.
+
+        If no model has been initialized yet, the model is added."""
         if self.model is True:
-            self.model = self.Model(1, token_vector_width)
+            self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)

     def use_params(self, params):
+        """Modify the pipe's model, to use the given parameter values.
+        """
         with self.model.use_params(params):
             yield

     def to_bytes(self, **exclude):
+        """Serialize the pipe to a bytestring."""
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
@@ -137,6 +165,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
+        """Load the pipe from a bytestring."""
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -152,6 +181,7 @@ class BaseThincComponent(object):
         return self

     def to_disk(self, path, **exclude):
+        """Serialize the pipe to disk."""
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -160,6 +190,7 @@ class BaseThincComponent(object):
         util.to_disk(path, serialize, exclude)

     def from_disk(self, path, **exclude):
+        """Load the pipe from disk."""
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -610,7 +641,7 @@ class SimilarityHook(BaseThincComponent):
         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

     def __call__(self, doc):
-        '''Install similarity hook'''
+        """Install similarity hook"""
         doc.user_hooks['similarity'] = self.predict
         return doc
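
Read together, the new docstrings spell out the contract for custom pipes: predict() and set_annotations() do the real work, and __call__ and pipe stay thin wrappers around them. A minimal sketch of a component written against that contract (the class, its name and its scoring logic are invented for illustration, not part of spaCy):

class DocLengthScorer(BaseThincComponent):
    """Hypothetical pipe that scores each Doc by its length. It inherits
    __call__ and pipe from BaseThincComponent, which delegate to
    predict() and set_annotations() as the docstrings above require."""
    name = 'doc_length_scorer'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    def predict(self, docs):
        # Compute scores without modifying the docs.
        return [len(doc) for doc in docs]

    def set_annotations(self, docs, scores):
        # Write the pre-computed scores back onto the docs.
        for doc, score in zip(docs, scores):
            doc.user_data['doc_length'] = score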

travis.sh

@@ -17,7 +17,6 @@ fi

 if [ "${VIA}" == "compile" ]; then
     pip install -r requirements.txt
-    export PYTHONPATH=`pwd`
     python setup.py build_ext --inplace
     pip install -e .
 fi