Update feature/noshare with recent develop changes

Matthew Honnibal 2017-09-26 08:15:14 -05:00
commit defb68e94f
4 changed files with 60 additions and 11 deletions

requirements.txt

@@ -1,4 +1,4 @@
-cython>=0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32

spacy/cli/train.py

@@ -20,6 +20,7 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps
@@ -40,10 +41,11 @@ numpy.random.seed(0)
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+          gold_preproc=False, meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -52,13 +54,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)

     pipeline = ['tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
@@ -112,6 +120,17 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', '0.0.0')
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
+>>>>>>> origin/develop
             util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:
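
The new meta_path option lets a user-supplied meta.json seed each saved model directory: 'accuracy', 'lang', 'pipeline' and 'spacy_version' are always overwritten at save time, while 'name' and 'version' are only filled in when missing. A minimal sketch of how such a file might be prepared and passed in (the file name, field values and CLI invocation below are illustrative assumptions, not taken from the commit):

import json

# Hypothetical meta.json: 'name' and 'version' survive as-is (the command
# only applies setdefault to them); the other keys are rewritten per epoch.
meta = {
    "name": "my_model",
    "version": "1.0.0",
    "description": "An example description, passed through untouched.",
}
with open("meta.json", "w", encoding="utf8") as f:
    json.dump(meta, f, indent=2)

# Then, assuming the usual spaCy CLI entry point:
#   python -m spacy train en /output train.json dev.json -m meta.json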

spacy/pipeline.py

@@ -48,7 +48,7 @@ from .parts_of_speech import X
 class SentenceSegmenter(object):
-    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse).

     To change the sentence boundary detection strategy, pass a generator
@@ -57,7 +57,7 @@ class SentenceSegmenter(object):
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-    '''
+    """
     name = 'sbd'

     def __init__(self, vocab, strategy=None):
@@ -89,17 +89,30 @@ class BaseThincComponent(object):
     @classmethod
     def Model(cls, *shape, **kwargs):
+        """Initialize a model for the pipe."""
         raise NotImplementedError

     def __init__(self, vocab, model=True, **cfg):
+        """Create a new pipe instance."""
         raise NotImplementedError

     def __call__(self, doc):
+        """Apply the pipe to one document. The document is
+        modified in-place, and returned.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
@@ -107,28 +120,43 @@ class BaseThincComponent(object):
             yield from docs

     def predict(self, docs):
+        """Apply the pipeline's model to a batch of docs, without
+        modifying them.
+        """
         raise NotImplementedError

     def set_annotations(self, docs, scores):
+        """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError

-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model.
+
+        Delegates to predict() and get_loss().
+        """
         raise NotImplementedError

     def get_loss(self, docs, golds, scores):
+        """Find the loss and gradient of loss for the batch of
+        documents and their predicted scores."""
         raise NotImplementedError

     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        token_vector_width = pipeline[0].model.nO
+        """Initialize the pipe for training, using data examples if available.
+
+        If no model has been initialized yet, the model is added."""
         if self.model is True:
-            self.model = self.Model(1, token_vector_width)
+            self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)

     def use_params(self, params):
+        """Modify the pipe's model, to use the given parameter values.
+        """
         with self.model.use_params(params):
             yield

     def to_bytes(self, **exclude):
+        """Serialize the pipe to a bytestring."""
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
@@ -137,6 +165,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)

     def from_bytes(self, bytes_data, **exclude):
+        """Load the pipe from a bytestring."""
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -152,6 +181,7 @@ class BaseThincComponent(object):
         return self

     def to_disk(self, path, **exclude):
+        """Serialize the pipe to disk."""
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -160,6 +190,7 @@ class BaseThincComponent(object):
         util.to_disk(path, serialize, exclude)

     def from_disk(self, path, **exclude):
+        """Load the pipe from disk."""
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -610,7 +641,7 @@ class SimilarityHook(BaseThincComponent):
         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

     def __call__(self, doc):
-        '''Install similarity hook'''
+        """Install similarity hook"""
         doc.user_hooks['similarity'] = self.predict
         return doc
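
Read together, the new docstrings spell out the contract for custom pipes: predict() and set_annotations() do the real work, and __call__ and pipe stay thin wrappers around them. A minimal sketch of a component written against that contract (the class, its name and its scoring logic are invented for illustration, not part of spaCy):

class DocLengthScorer(BaseThincComponent):
    """Hypothetical pipe that scores each Doc by its length. It inherits
    __call__ and pipe from BaseThincComponent, which delegate to
    predict() and set_annotations() as the docstrings above require."""
    name = 'doc_length_scorer'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    def predict(self, docs):
        # Compute scores without modifying the docs.
        return [len(doc) for doc in docs]

    def set_annotations(self, docs, scores):
        # Write the pre-computed scores back onto the docs.
        for doc, score in zip(docs, scores):
            doc.user_data['doc_length'] = score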

travis.sh

@@ -17,7 +17,6 @@ fi

 if [ "${VIA}" == "compile" ]; then
     pip install -r requirements.txt
-    export PYTHONPATH=`pwd`
     python setup.py build_ext --inplace
     pip install -e .
 fi