Mirror of https://github.com/explosion/spaCy.git
Commit defb68e94f: Update feature/noshare with recent develop changes
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-cython>=0.24
+cython>=0.24,<0.27.0
 pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -20,6 +20,7 @@ from ..gold import GoldParse, merge_sents
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
+from .. import about
 from .. import displacy
 from ..compat import json_dumps
 
@@ -40,10 +41,11 @@ numpy.random.seed(0)
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
+    meta_path=("Optional path to meta.json. All relevant properties will be overwritten.", "option", "m", Path)
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
           use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
-          gold_preproc=False):
+          gold_preproc=False, meta_path=None):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
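For context, the new meta_path option lets the user hand the train command their own meta.json, whose properties seed the model package metadata. A minimal sketch of how it might be used; the file contents and invocation below are invented examples, not taken from this commit:

    import json
    from pathlib import Path

    # Illustrative meta.json: 'name' and 'version' given here will survive,
    # since the train loop only fills them in via setdefault() when missing.
    meta = {"name": "my_model", "version": "1.0.0"}
    Path("meta.json").write_text(json.dumps(meta))

    # Hypothetical invocation (argument order per the train() signature above):
    #   spacy train en /output train.json dev.json -m meta.json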
@@ -52,13 +54,19 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
+    meta_path = util.ensure_path(meta_path)
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
         prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
         prints(dev_path, title="Development data not found", exits=1)
+    if meta_path is not None and not meta_path.exists():
+        prints(meta_path, title="meta.json not found", exits=1)
+    meta = util.read_json(meta_path) if meta_path else {}
+    if not isinstance(meta, dict):
+        prints("Expected dict but got: {}".format(type(meta)),
+               title="Not a valid meta.json format", exits=1)
+
     pipeline = ['tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
@@ -112,6 +120,17 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                 with acc_loc.open('w') as file_:
                     file_.write(json_dumps(scorer.scores))
+                meta_loc = output_path / ('model%d' % i) / 'meta.json'
+                meta['accuracy'] = scorer.scores
+                meta['lang'] = nlp.lang
+                meta['pipeline'] = pipeline
+                meta['spacy_version'] = '>=%s' % about.__version__
+                meta.setdefault('name', 'model%d' % i)
+                meta.setdefault('version', '0.0.0')
+
+                with meta_loc.open('w') as file_:
+                    file_.write(json_dumps(meta))
+>>>>>>> origin/develop
                 util.set_env_log(True)
             print_progress(i, losses, scorer.scores)
     finally:

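The net effect of the block above: each epoch's model directory gets a meta.json that merges the user-supplied fields with computed ones. A hedged sketch of the resulting shape; the score numbers and version string are invented, the key names follow the assignments in the diff:

    # Approximate contents of output_path / 'model0' / 'meta.json'
    example_meta = {
        "accuracy": {"uas": 91.2, "las": 89.4, "tags_acc": 96.8, "ents_f": 84.0},
        "lang": "en",                 # nlp.lang
        "pipeline": ["tags", "dependencies", "entities"],
        "spacy_version": ">=2.0.0",   # '>=%s' % about.__version__
        "name": "model0",             # setdefault() fallback if the user meta had none
        "version": "0.0.0",           # setdefault() fallback
    }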
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -48,7 +48,7 @@ from .parts_of_speech import X
 
 
 class SentenceSegmenter(object):
-    '''A simple spaCy hook, to allow custom sentence boundary detection logic
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse).
 
     To change the sentence boundary detection strategy, pass a generator
@@ -57,7 +57,7 @@ class SentenceSegmenter(object):
 
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-    '''
+    """
     name = 'sbd'
 
     def __init__(self, vocab, strategy=None):
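A minimal sketch of a conforming strategy, per the contract in the docstring above (take a Doc, yield Spans); the newline-based splitting rule is invented for illustration:

    def split_on_newlines(doc):
        # Yield one Span per newline-delimited chunk of the Doc.
        start = 0
        for i, token in enumerate(doc):
            if token.text == '\n':
                yield doc[start:i]
                start = i + 1
        if start < len(doc):
            yield doc[start:len(doc)]

    # Hypothetical wiring:
    #   sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)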
@@ -89,17 +89,30 @@ class BaseThincComponent(object):
 
     @classmethod
     def Model(cls, *shape, **kwargs):
+        """Initialize a model for the pipe."""
         raise NotImplementedError
 
     def __init__(self, vocab, model=True, **cfg):
+        """Create a new pipe instance."""
         raise NotImplementedError
 
     def __call__(self, doc):
+        """Apply the pipe to one document. The document is
+        modified in-place, and returned.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         scores = self.predict([doc])
         self.set_annotations([doc], scores)
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Apply the pipe to a stream of documents.
+
+        Both __call__ and pipe should delegate to the `predict()`
+        and `set_annotations()` methods.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
             scores = self.predict(docs)
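Because __call__ and pipe now delegate as the new docstrings describe, a concrete subclass only has to implement predict() and set_annotations(). A toy sketch; the component name and scoring rule are invented:

    class LengthScorer(BaseThincComponent):
        """Toy component: attaches a token-count score to each Doc."""
        name = 'length_scorer'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def predict(self, docs):
            # No neural model here; "scores" are just token counts.
            return [len(doc) for doc in docs]

        def set_annotations(self, docs, scores):
            for doc, score in zip(docs, scores):
                doc.user_data['length_score'] = score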
@@ -107,28 +120,43 @@ class BaseThincComponent(object):
             yield from docs
 
     def predict(self, docs):
+        """Apply the pipeline's model to a batch of docs, without
+        modifying them.
+        """
         raise NotImplementedError
 
     def set_annotations(self, docs, scores):
+        """Modify a batch of documents, using pre-computed scores."""
         raise NotImplementedError
 
-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model.
+
+        Delegates to predict() and get_loss().
+        """
         raise NotImplementedError
 
     def get_loss(self, docs, golds, scores):
+        """Find the loss and gradient of loss for the batch of
+        documents and their predicted scores."""
         raise NotImplementedError
 
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
-        token_vector_width = pipeline[0].model.nO
+        """Initialize the pipe for training, using data examples if available.
+        If no model has been initialized yet, the model is added."""
         if self.model is True:
-            self.model = self.Model(1, token_vector_width)
+            self.model = self.Model(**self.cfg)
         link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
+        """Modify the pipe's model, to use the given parameter values.
+        """
         with self.model.use_params(params):
             yield
 
     def to_bytes(self, **exclude):
+        """Serialize the pipe to a bytestring."""
         serialize = OrderedDict((
             ('cfg', lambda: json_dumps(self.cfg)),
             ('model', lambda: self.model.to_bytes()),
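The begin_training change is the heart of the feature/noshare update: instead of sizing its model from the preceding pipeline component (pipeline[0].model.nO), each component now builds its model from its own cfg. A hedged sketch of what a subclass's Model might accept under this scheme; the parameter names and dimensions are invented:

    class MyComponent(BaseThincComponent):
        @classmethod
        def Model(cls, nr_class=2, token_vector_width=128, **cfg):
            # Dimensions come from this component's own cfg, not from a
            # neighbouring component's model, so weights aren't implicitly shared.
            ...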
@@ -137,6 +165,7 @@ class BaseThincComponent(object):
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
+        """Load the pipe from a bytestring."""
         def load_model(b):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
@@ -152,6 +181,7 @@ class BaseThincComponent(object):
         return self
 
     def to_disk(self, path, **exclude):
+        """Serialize the pipe to disk."""
         serialize = OrderedDict((
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))),
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -160,6 +190,7 @@ class BaseThincComponent(object):
         util.to_disk(path, serialize, exclude)
 
     def from_disk(self, path, **exclude):
+        """Load the pipe from disk."""
         def load_model(p):
             if self.model is True:
                 self.cfg['pretrained_dims'] = self.vocab.vectors_length
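The four methods above form two symmetric pairs. A hedged round-trip sketch using the toy LengthScorer from earlier, assuming its model has been initialized and the paths are writable:

    # Bytes round-trip: cfg and model travel together.
    data = pipe.to_bytes()
    pipe2 = LengthScorer(pipe.vocab).from_bytes(data)

    # Disk round-trip follows the same shape:
    pipe.to_disk('/tmp/length_scorer')
    pipe3 = LengthScorer(pipe.vocab).from_disk('/tmp/length_scorer')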
@@ -610,7 +641,7 @@ class SimilarityHook(BaseThincComponent):
         return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
 
     def __call__(self, doc):
-        '''Install similarity hook'''
+        """Install similarity hook"""
         doc.user_hooks['similarity'] = self.predict
         return doc
 
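For orientation: once __call__ has run, Doc.similarity() consults doc.user_hooks['similarity'] instead of the default vector comparison. A hedged usage sketch, assuming a SimilarityHook with an initialized model:

    hook = SimilarityHook(nlp.vocab)     # assumes an initialized model
    doc1 = hook(nlp(u'a sentence'))      # installs the hook on doc1
    doc2 = nlp(u'another sentence')
    score = doc1.similarity(doc2)        # now routed through hook.predict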
--- a/travis.sh
+++ b/travis.sh
@@ -17,7 +17,6 @@ fi
 
 if [ "${VIA}" == "compile" ]; then
     pip install -r requirements.txt
-    export PYTHONPATH=`pwd`
     python setup.py build_ext --inplace
    pip install -e .
 fi