spaCy/spacy/language.py

737 lines
29 KiB
Python
Raw Normal View History

# coding: utf8
from __future__ import absolute_import, unicode_literals
2017-05-25 04:10:54 +03:00
import random
2017-05-29 14:42:55 +03:00
import ujson
2017-07-25 19:57:59 +03:00
import itertools
import weakref
import functools
2017-10-27 22:07:59 +03:00
from collections import OrderedDict
from contextlib import contextmanager
from copy import copy
from thinc.neural import Model
from thinc.neural.optimizers import Adam
2017-05-18 12:25:19 +03:00
from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
2017-10-27 22:07:59 +03:00
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
2017-05-09 00:58:31 +03:00
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop
from . import util
from . import about
2016-09-24 21:26:17 +03:00
class BaseDefaults(object):
    """Language data plus the factory classmethods that turn it into a
    lemmatizer, a vocab and a tokenizer. The class attributes below the
    methods hold the data the factories read; they can be overridden in
    subclasses.
    """
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Build a `Lemmatizer` from the class-level lemma tables.

        nlp (Language): Not used by this implementation.
        RETURNS (Lemmatizer): The newly constructed lemmatizer.
        """
        tables = (cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
                  cls.lemma_lookup)
        return Lemmatizer(*tables)

    @classmethod
    def create_vocab(cls, nlp=None):
        """Build a `Vocab` from the class-level lexical attribute getters,
        tag map and morphology rules.

        nlp (Language): Forwarded to `create_lemmatizer`.
        RETURNS (Vocab): The newly constructed vocab.
        """
        lemmatizer = cls.create_lemmatizer(nlp)
        # Work on a copy, so the shared class-level mapping stays untouched.
        getters = dict(cls.lex_attr_getters)
        # This is messy, but it's the minimal working fix to Issue #639.
        getters[IS_STOP] = functools.partial(is_stop, stops=cls.stop_words)
        vocab = Vocab(lex_attr_getters=getters, tag_map=cls.tag_map,
                      lemmatizer=lemmatizer)
        # Register every (tag, orth) morphological exception on the vocab.
        for tag_str, exceptions in cls.morph_rules.items():
            for orth_str, attrs in exceptions.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)
        return vocab

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build a `Tokenizer` from the class-level punctuation rules and
        tokenizer exceptions.

        nlp (Language): If given, its vocab is shared with the tokenizer.
            Otherwise a fresh vocab is created via `create_vocab`.
        RETURNS (Tokenizer): The newly constructed tokenizer.
        """
        # Each matcher stays None when the corresponding rule set is empty.
        prefix_search = None
        suffix_search = None
        infix_finditer = None
        if cls.prefixes:
            prefix_search = util.compile_prefix_regex(cls.prefixes).search
        if cls.suffixes:
            suffix_search = util.compile_suffix_regex(cls.suffixes).search
        if cls.infixes:
            infix_finditer = util.compile_infix_regex(cls.infixes).finditer
        if nlp is None:
            vocab = cls.create_vocab(nlp)
        else:
            vocab = nlp.vocab
        return Tokenizer(vocab, rules=cls.tokenizer_exceptions,
                         prefix_search=prefix_search,
                         suffix_search=suffix_search,
                         infix_finditer=infix_finditer,
                         token_match=cls.token_match)

    # Names of the default pipeline components.
    pipe_names = ['tagger', 'parser', 'ner']
    # Tokenizer data: copied from the shared language-independent defaults.
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    tokenizer_exceptions = {}
    stop_words = set()
    # Lemmatizer tables, passed to `Lemmatizer` by `create_lemmatizer`.
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
    lemma_lookup = {}
    # Morphology special cases, keyed by tag and then by orth string.
    morph_rules = {}
    lex_attr_getters = LEX_ATTRS
    syntax_iterators = {}
2015-09-14 10:48:51 +03:00
class Language(object):
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your application.

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
    lang (unicode): Two-letter language ID, i.e. ISO code.
    factories (dict): Maps a component name to a callable that builds the
        component from the `nlp` object (used by `create_pipe`).
    """
    Defaults = BaseDefaults
    lang = None

    factories = {
        'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
        'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
        'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
        'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
        'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
        'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
        'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
    }

    def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
        """Initialise a Language object. The pipeline starts out empty;
        components are added afterwards with `add_pipe`.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
            `Language.Defaults.create_vocab`.
        make_doc (callable): A function that takes text and returns a `Doc`
            object. Usually a `Tokenizer`. If `True`, a tokenizer is created
            via `Language.Defaults.create_tokenizer`.
        meta (dict): Custom meta data for the Language class. Is written to by
            models to add model meta data. The dict is copied, not stored.
        **kwargs: Accepted for compatibility; not used by this constructor.
        RETURNS (Language): The newly constructed object.
        """
        self._meta = dict(meta)
        self._path = None
        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
        self.vocab = vocab
        if make_doc is True:
            factory = self.Defaults.create_tokenizer
            make_doc = factory(self, **meta.get('tokenizer', {}))
        self.tokenizer = make_doc
        self.pipeline = []
        self._optimizer = None

    def __reduce__(self):
        """Pickle support: the vocab is passed along as an object, everything
        else is serialized via `to_bytes`.
        """
        bytes_data = self.to_bytes(vocab=False)
        return (unpickle_language, (self.vocab, self.meta, bytes_data))

    @property
    def path(self):
        """The path given to the last `from_disk` call, or None."""
        return self._path

    @property
    def meta(self):
        """The meta data dict. Missing standard keys are filled in with
        defaults, and the 'vectors' and 'pipeline' entries are refreshed from
        the current vocab and pipeline on every access.
        """
        self._meta.setdefault('lang', self.vocab.lang)
        self._meta.setdefault('name', 'model')
        self._meta.setdefault('version', '0.0.0')
        self._meta.setdefault('spacy_version', about.__version__)
        self._meta.setdefault('description', '')
        self._meta.setdefault('author', '')
        self._meta.setdefault('email', '')
        self._meta.setdefault('url', '')
        self._meta.setdefault('license', '')
        self._meta['vectors'] = {'width': self.vocab.vectors_length,
                                 'vectors': len(self.vocab.vectors),
                                 'keys': self.vocab.vectors.n_keys}
        self._meta['pipeline'] = self.pipe_names
        return self._meta

    @meta.setter
    def meta(self, value):
        self._meta = value

    # Conveniences to access pipeline components. Each raises KeyError (via
    # `get_pipe`) if the component is not in the pipeline.
    @property
    def tensorizer(self):
        return self.get_pipe('tensorizer')

    @property
    def tagger(self):
        return self.get_pipe('tagger')

    @property
    def parser(self):
        return self.get_pipe('parser')

    @property
    def entity(self):
        return self.get_pipe('ner')

    @property
    def matcher(self):
        return self.get_pipe('matcher')

    @property
    def pipe_names(self):
        """Get names of available pipeline components.

        RETURNS (list): List of component name strings, in order.
        """
        return [pipe_name for pipe_name, _ in self.pipeline]

    def get_pipe(self, name):
        """Get a pipeline component for a given component name.

        name (unicode): Name of pipeline component to get.
        RETURNS (callable): The pipeline component.
        RAISES (KeyError): If no component of that name is in the pipeline.
        """
        for pipe_name, component in self.pipeline:
            if pipe_name == name:
                return component
        msg = "No component '{}' found in pipeline. Available names: {}"
        raise KeyError(msg.format(name, self.pipe_names))

    def create_pipe(self, name, config=dict()):
        """Create a pipeline component from a factory. The component is only
        created, not added to the pipeline.

        name (unicode): Factory name to look up in `Language.factories`.
        config (dict): Configuration parameters to initialise component.
        RETURNS (callable): Pipeline component.
        RAISES (KeyError): If no factory is registered under `name`.
        """
        if name not in self.factories:
            raise KeyError("Can't find factory for '{}'.".format(name))
        factory = self.factories[name]
        return factory(self, **config)

    def add_pipe(self, component, name=None, before=None, after=None,
                 first=None, last=None):
        """Add a component to the processing pipeline. Valid components are
        callables that take a `Doc` object, modify it and return it. Only one
        of before/after/first/last can be set. Default behaviour is "last".

        component (callable): The pipeline component.
        name (unicode): Name of pipeline component. If not set, the name is
            taken from `component.name`, then `component.__name__`, then the
            component's class name, then `repr(component)`.
        before (unicode): Component name to insert component directly before.
        after (unicode): Component name to insert component directly after.
        first (bool): Insert component first in the pipeline.
        last (bool): Insert component last in the pipeline.
        RAISES (ValueError): If the name already exists in the pipeline, if
            more than one position argument is set, or if the `before` /
            `after` target is not in the pipeline.

        EXAMPLE:
            >>> nlp.add_pipe(component, before='ner')
            >>> nlp.add_pipe(component, name='custom_name', last=True)
        """
        if name is None:
            if hasattr(component, 'name'):
                name = component.name
            elif hasattr(component, '__name__'):
                name = component.__name__
            elif (hasattr(component, '__class__') and
                  hasattr(component.__class__, '__name__')):
                name = component.__class__.__name__
            else:
                name = repr(component)
        if name in self.pipe_names:
            raise ValueError("'{}' already exists in pipeline.".format(name))
        if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
            msg = ("Invalid constraints. You can only set one of the "
                   "following: before, after, first, last.")
            raise ValueError(msg)
        pipe = (name, component)
        if last or not any([first, before, after]):
            self.pipeline.append(pipe)
        elif first:
            self.pipeline.insert(0, pipe)
        elif before and before in self.pipe_names:
            self.pipeline.insert(self.pipe_names.index(before), pipe)
        elif after and after in self.pipe_names:
            # list.insert() places the item *before* the given index, so the
            # slot directly after the target is its index plus one.
            self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
        else:
            msg = "Can't find '{}' in pipeline. Available names: {}"
            unfound = before or after
            raise ValueError(msg.format(unfound, self.pipe_names))

    def has_pipe(self, name):
        """Check if a component name is present in the pipeline. Equivalent to
        `name in nlp.pipe_names`.

        name (unicode): Name of the component.
        RETURNS (bool): Whether a component of the name exists in the pipeline.
        """
        return name in self.pipe_names

    def replace_pipe(self, name, component):
        """Replace a component in the pipeline, keeping its name and position.

        name (unicode): Name of the component to replace.
        component (callable): Pipeline component.
        RAISES (ValueError): If `name` is not in the pipeline.
        """
        if name not in self.pipe_names:
            msg = "Can't find '{}' in pipeline. Available names: {}"
            raise ValueError(msg.format(name, self.pipe_names))
        self.pipeline[self.pipe_names.index(name)] = (name, component)

    def rename_pipe(self, old_name, new_name):
        """Rename a pipeline component.

        old_name (unicode): Name of the component to rename.
        new_name (unicode): New name of the component.
        RAISES (ValueError): If `old_name` is not in the pipeline, or
            `new_name` is already taken.
        """
        if old_name not in self.pipe_names:
            msg = "Can't find '{}' in pipeline. Available names: {}"
            raise ValueError(msg.format(old_name, self.pipe_names))
        if new_name in self.pipe_names:
            msg = "'{}' already exists in pipeline. Existing names: {}"
            raise ValueError(msg.format(new_name, self.pipe_names))
        i = self.pipe_names.index(old_name)
        self.pipeline[i] = (new_name, self.pipeline[i][1])

    def remove_pipe(self, name):
        """Remove a component from the pipeline.

        name (unicode): Name of the component to remove.
        RETURNS (tuple): A `(name, component)` tuple of the removed component.
        RAISES (ValueError): If `name` is not in the pipeline.
        """
        if name not in self.pipe_names:
            msg = "Can't find '{}' in pipeline. Available names: {}"
            raise ValueError(msg.format(name, self.pipe_names))
        return self.pipeline.pop(self.pipe_names.index(name))

    def __call__(self, text, disable=[]):
        """Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        text (unicode): The text to be processed.
        disable (list): Names of the pipeline components to disable.
        RETURNS (Doc): A container for accessing the annotations.

        EXAMPLE:
            >>> tokens = nlp('An example sentence. Another example sentence.')
            >>> tokens[0].text, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
        for name, proc in self.pipeline:
            if name in disable:
                continue
            # Each component receives the previous component's return value.
            doc = proc(doc)
        return doc

    def disable_pipes(self, *names):
        """Disable one or more pipeline components. If used as a context
        manager, the pipeline will be restored to the initial state at the end
        of the block. Otherwise, a DisabledPipes object is returned, that has
        a `.restore()` method you can use to undo your changes.

        *names (unicode): Names of the components to disable.
        RETURNS (DisabledPipes): The removed `(name, component)` tuples.

        EXAMPLE:
            >>> nlp.add_pipe(nlp.create_pipe('parser'))
            >>> nlp.add_pipe(nlp.create_pipe('tagger'))
            >>> with nlp.disable_pipes('parser', 'tagger'):
            >>>     assert not nlp.has_pipe('parser')
            >>> assert nlp.has_pipe('parser')
            >>> disabled = nlp.disable_pipes('parser')
            >>> assert len(disabled) == 1
            >>> assert not nlp.has_pipe('parser')
            >>> disabled.restore()
            >>> assert nlp.has_pipe('parser')
        """
        return DisabledPipes(self, *names)

    def make_doc(self, text):
        """Turn a text into a `Doc` by calling the tokenizer.

        text (unicode): The text to tokenize.
        RETURNS (Doc): The value returned by `self.tokenizer`.
        """
        return self.tokenizer(text)

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Update the models in the pipeline. Components are visited in a
        random order; components without an `update` method are skipped.

        docs (iterable): A batch of `Doc` objects. Must support `len()`.
        golds (iterable): A batch of `GoldParse` objects. Must support `len()`.
        drop (float): The dropout rate.
        sgd (callable): An optimizer. If None, the optimizer stored on the
            object is used (and created on first use).
        losses (dict): Passed through to each component's `update` method.
        RETURNS: None.
        RAISES (IndexError): If `docs` and `golds` differ in length.

        EXAMPLE:
            >>> optimizer = nlp.begin_training(get_gold_tuples)
            >>> losses = {}
            >>> nlp.update(docs, golds, sgd=optimizer, losses=losses)
        """
        if len(docs) != len(golds):
            raise IndexError("Update expects same number of docs and golds "
                             "Got: %d, %d" % (len(docs), len(golds)))
        if len(docs) == 0:
            return
        if sgd is None:
            if self._optimizer is None:
                self._optimizer = Adam(Model.ops, 0.001)
            sgd = self._optimizer
        grads = {}

        def get_grads(W, dW, key=None):
            # Collect gradients instead of applying them. `grads` is looked up
            # at call time, so this writes into the dict bound in the loop
            # below for the component currently being updated.
            grads[key] = (W, dW)

        pipes = list(self.pipeline)
        random.shuffle(pipes)
        for name, proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            grads = {}
            proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
            # Apply this component's collected gradients with the optimizer.
            for key, (W, dW) in grads.items():
                sgd(W, dW, key=key)

    def preprocess_gold(self, docs_golds):
        """Can be called before training to pre-process gold data. The data
        is passed through the `preprocess_gold` method of every pipeline
        component that defines one.

        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
        """
        for name, proc in self.pipeline:
            if hasattr(proc, 'preprocess_gold'):
                docs_golds = proc.preprocess_gold(docs_golds)
        for doc, gold in docs_golds:
            yield doc, gold

    def _setup_device(self, cfg):
        """Activate the device requested via `cfg['device']`, if any.

        cfg (dict): Config parameters. A 'device' value >= 0 selects a GPU.
        RETURNS: The value returned by `util.use_gpu`, or None if no device
            was requested.
        """
        if cfg.get('device', -1) >= 0:
            device = util.use_gpu(cfg['device'])
            # Re-wrap the vectors table with the now-active ops, but only if
            # the table actually has columns.
            if self.vocab.vectors.data.shape[1] >= 1:
                self.vocab.vectors.data = Model.ops.asarray(
                    self.vocab.vectors.data)
            return device
        return None

    def _create_optimizer(self, device):
        """Create a fresh Adam optimizer, store it on the object and return
        it. Hyper-parameters are read via `util.env_opt`, with the defaults
        given below.

        device: Stored on the optimizer as its `device` attribute.
        RETURNS: The newly created optimizer.
        """
        learn_rate = util.env_opt('learn_rate', 0.001)
        beta1 = util.env_opt('optimizer_B1', 0.9)
        beta2 = util.env_opt('optimizer_B2', 0.999)
        eps = util.env_opt('optimizer_eps', 1e-08)
        L2 = util.env_opt('L2_penalty', 1e-6)
        max_grad_norm = util.env_opt('grad_norm_clip', 1.)
        self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
                               beta2=beta2, eps=eps)
        self._optimizer.max_grad_norm = max_grad_norm
        self._optimizer.device = device
        return self._optimizer

    def resume_training(self, **cfg):
        """Create a new optimizer without calling `begin_training` on the
        pipeline components.

        **cfg: Config parameters. 'device' (int) selects a GPU if >= 0.
        RETURNS: An optimizer.
        """
        device = self._setup_device(cfg)
        return self._create_optimizer(device)

    def begin_training(self, get_gold_tuples=None, **cfg):
        """Populate the vocab from the training data, call `begin_training`
        on every pipeline component that defines it, and create an optimizer.

        get_gold_tuples (function): Function returning gold data.
        **cfg: Config parameters. 'device' (int) selects a GPU if >= 0.
        RETURNS: An optimizer.
        """
        if get_gold_tuples is None:
            get_gold_tuples = lambda: []
        else:
            # Populate vocab: looking a word up creates its entry.
            for _, annots_brackets in get_gold_tuples():
                for annots, _ in annots_brackets:
                    for word in annots[1]:
                        _ = self.vocab[word]
        device = self._setup_device(cfg)
        link_vectors_to_models(self.vocab)
        for name, proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
                proc.begin_training(get_gold_tuples(),
                                    pipeline=self.pipeline)
        return self._create_optimizer(device)

    def evaluate(self, docs_golds, verbose=False):
        """Run the pipeline components over the docs and score the results
        against the gold annotations.

        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
        verbose (bool): Print each doc and pass `verbose` to the scorer.
        RETURNS (Scorer): The scorer, after scoring every doc.
        """
        def apply_component(proc, stream):
            # A generator function binds `proc` per call. A generator
            # expression written inline in the loop below would look `proc`
            # up lazily and end up applying only the last component.
            for doc in stream:
                yield proc(doc)

        scorer = Scorer()
        docs, golds = zip(*docs_golds)
        docs = list(docs)
        golds = list(golds)
        for name, pipe in self.pipeline:
            if not hasattr(pipe, 'pipe'):
                docs = apply_component(pipe, docs)
            else:
                docs = pipe.pipe(docs, batch_size=256)
        for doc, gold in zip(docs, golds):
            if verbose:
                print(doc)
            scorer.score(doc, gold, verbose=verbose)
        return scorer

    @contextmanager
    def use_params(self, params, **cfg):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary. Can be used as a contextmanager, in which case,
        models go back to their original weights after the block.

        params (dict): A dictionary of parameters keyed by model ID.
        **cfg: Config parameters. Not used by this implementation.

        EXAMPLE:
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk('/tmp/checkpoint')
        """
        contexts = [pipe.use_params(params) for name, pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]
        # TODO: Having trouble with contextlib
        # Workaround: these aren't actually context managers atm.
        # They are advanced once to enter and once more to exit.
        for context in contexts:
            try:
                next(context)
            except StopIteration:
                pass
        try:
            yield
        finally:
            # Run the exit step even if the block raised, so the components
            # are not left with the temporary weights.
            for context in contexts:
                try:
                    next(context)
                except StopIteration:
                    pass

    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
             disable=[]):
        """Process texts as a stream, and yield `Doc` objects in order.
        Supports GIL-free multi-threading.

        texts (iterator): A sequence of texts to process.
        as_tuples (bool):
            If set to True, inputs should be a sequence of
            (text, context) tuples. Output will then be a sequence of
            (doc, context) tuples. Defaults to False.
        n_threads (int): The number of worker threads to use. If -1, OpenMP
            will decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
        disable (list): Names of the pipeline components to disable.
        YIELDS (Doc): Documents in the order of the original text.

        EXAMPLE:
            >>> texts = [u'One document.', u'...', u'Lots of documents']
            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
            >>>     assert doc.is_parsed
        """
        if as_tuples:
            # Split the stream in two so texts and contexts can be consumed
            # separately and zipped back together afterwards.
            text_context1, text_context2 = itertools.tee(texts)
            texts = (tc[0] for tc in text_context1)
            contexts = (tc[1] for tc in text_context2)
            docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
                             disable=disable)
            for doc, context in izip(docs, contexts):
                yield (doc, context)
            return
        docs = (self.make_doc(text) for text in texts)
        for name, proc in self.pipeline:
            if name in disable:
                continue
            if hasattr(proc, 'pipe'):
                docs = proc.pipe(docs, n_threads=n_threads,
                                 batch_size=batch_size)
            else:
                # No streaming interface: apply the component doc by doc.
                docs = _pipe(proc, docs)
        # Track weakrefs of "recent" documents, so that we can see when they
        # expire from memory. When they do, we know we don't need old strings.
        # This way, we avoid maintaining an unbounded growth in string entries
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
        original_strings_data = self.vocab.strings.to_bytes()
        StringStore = self.vocab.strings.__class__
        recent_strings = StringStore().from_bytes(original_strings_data)
        nr_seen = 0
        for doc in docs:
            yield doc
            for word in doc:
                recent_strings.add(word.text)
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # All the docs in the 'old' set have expired, so the only
                # difference between the backup strings and the current
                # string-store should be obsolete. We therefore swap out the
                # old strings data.
                old_refs, recent_refs = recent_refs, old_refs
                self.vocab.strings._reset_and_load(recent_strings)
                recent_strings = StringStore().from_bytes(original_strings_data)
                nr_seen = 0

    def to_disk(self, path, disable=tuple()):
        """Save the current state to a directory. If a model is loaded, this
        will include the model.

        path (unicode or Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be strings or `Path`-like objects.
        disable (list): Names of pipeline components to disable and prevent
            from being saved.

        EXAMPLE:
            >>> nlp.to_disk('/path/to/models')
        """
        path = util.ensure_path(path)

        def write_meta(p):
            # Use a `with` block so the handle is flushed and closed here,
            # rather than whenever the file object gets garbage-collected.
            with p.open('w') as file_:
                file_.write(json_dumps(self.meta))

        serializers = OrderedDict((
            ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
            ('meta.json', write_meta)
        ))
        for name, proc in self.pipeline:
            # NOTE(review): components without a `name` attribute are not
            # saved, although `from_disk` has no matching check -- confirm
            # this asymmetry is intended.
            if not hasattr(proc, 'name'):
                continue
            if name in disable:
                continue
            if not hasattr(proc, 'to_disk'):
                continue
            # Bind `proc` as a default argument: the lambda runs after the
            # loop has finished.
            serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
        serializers['vocab'] = lambda p: self.vocab.to_disk(p)
        util.to_disk(path, serializers, {p: False for p in disable})

    def from_disk(self, path, disable=tuple()):
        """Loads state from a directory. Modifies the object in place and
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

        path (unicode or Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        disable (list): Names of the pipeline components to disable.
        RETURNS (Language): The modified `Language` object.

        EXAMPLE:
            >>> from spacy.language import Language
            >>> nlp = Language().from_disk('/path/to/models')
        """
        path = util.ensure_path(path)

        def read_meta(p):
            # Close the handle as soon as the JSON has been parsed.
            with p.open('r') as file_:
                self.meta.update(ujson.load(file_))

        deserializers = OrderedDict((
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
            ('meta.json', read_meta)
        ))
        for name, proc in self.pipeline:
            if name in disable:
                continue
            # Check for the method that is actually called below.
            if not hasattr(proc, 'from_disk'):
                continue
            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
        exclude = {p: False for p in disable}
        if not (path / 'vocab').exists():
            exclude['vocab'] = True
        util.from_disk(path, deserializers, exclude)
        self._path = path
        return self

    def to_bytes(self, disable=[], **exclude):
        """Serialize the current state to a binary string.

        disable (list): Names of pipeline components to disable and prevent
            from being serialized.
        **exclude: Passed on to `util.to_bytes` as the exclusion mapping.
        RETURNS (bytes): The serialized form of the `Language` object.
        """
        serializers = OrderedDict((
            ('vocab', lambda: self.vocab.to_bytes()),
            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
            ('meta', lambda: json_dumps(self.meta))
        ))
        # Components are keyed by their position in the pipeline, not by
        # name; `from_bytes` uses the same keys.
        for i, (name, proc) in enumerate(self.pipeline):
            if name in disable:
                continue
            if not hasattr(proc, 'to_bytes'):
                continue
            serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
        return util.to_bytes(serializers, exclude)

    def from_bytes(self, bytes_data, disable=[]):
        """Load state from a binary string. Only components already present
        in the pipeline are loaded, matched by pipeline position.

        bytes_data (bytes): The data to load from.
        disable (list): Names of the pipeline components to disable.
        RETURNS (Language): The `Language` object.
        """
        deserializers = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
            ('meta', lambda b: self.meta.update(ujson.loads(b)))
        ))
        for i, (name, proc) in enumerate(self.pipeline):
            if name in disable:
                continue
            if not hasattr(proc, 'from_bytes'):
                continue
            deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
        util.from_bytes(bytes_data, deserializers, {})
        return self
2017-05-22 02:43:31 +03:00
2017-10-25 14:46:41 +03:00
class DisabledPipes(list):
    """Manager for temporary pipeline disabling. The list itself holds the
    `(name, component)` tuples that were removed from the pipeline.
    """
    def __init__(self, nlp, *names):
        """Remove the named components from `nlp.pipeline`.

        nlp (Language): The object whose pipeline is modified.
        *names (unicode): Names of the components to disable.
        RAISES: Whatever `nlp.remove_pipe` raises for an unknown name. In
            that case the pipeline is put back to its initial state first.
        """
        self.nlp = nlp
        self.names = names
        # Important! Not deep copy -- we just want the container (but we also
        # want to support people providing arbitrarily typed nlp.pipeline
        # objects.)
        self.original_pipeline = copy(nlp.pipeline)
        list.__init__(self)
        try:
            for name in names:
                self.append(nlp.remove_pipe(name))
        except Exception:
            # Don't leave the pipeline half-disabled: the caller never gets
            # this object back, so they would have no way to restore it.
            nlp.pipeline = self.original_pipeline
            self[:] = []
            raise

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.restore()

    def restore(self):
        """Restore the pipeline to its state when DisabledPipes was created.

        RAISES (ValueError): If the current pipeline contains components that
            are not part of the original pipeline. The pipeline is left
            unchanged in that case.
        """
        current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
        # With the original pipeline swapped in, any current name that
        # `has_pipe` does not know was added after disabling.
        unexpected = [name for name, pipe in current
                      if not self.nlp.has_pipe(name)]
        if unexpected:
            # Don't change the pipeline if we're raising an error.
            self.nlp.pipeline = current
            msg = (
                "Some current components would be lost when restoring "
                "previous pipeline state. If you added components after "
                "calling nlp.disable_pipes(), you should remove them "
                "explicitly with nlp.remove_pipe() before the pipeline is "
                "restored. Names of the new components: %s"
            )
            raise ValueError(msg % unexpected)
        self[:] = []
def unpickle_language(vocab, meta, bytes_data):
    """Rebuild a `Language` object from the arguments produced by
    `Language.__reduce__`.

    vocab (Vocab): The vocab to construct the object with.
    meta (dict): Not used here; the meta data is part of `bytes_data`.
    bytes_data (bytes): The state serialized via `Language.to_bytes`.
    RETURNS (Language): The restored object.
    """
    restored = Language(vocab=vocab)
    restored.from_bytes(bytes_data)
    return restored
2017-05-22 02:43:31 +03:00
def _pipe(func, docs):
for doc in docs:
func(doc)
yield doc