mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
737 lines
29 KiB
Python
737 lines
29 KiB
Python
# coding: utf8
|
|
from __future__ import absolute_import, unicode_literals
|
|
|
|
import random
|
|
import ujson
|
|
import itertools
|
|
import weakref
|
|
import functools
|
|
from collections import OrderedDict
|
|
from contextlib import contextmanager
|
|
from copy import copy
|
|
from thinc.neural import Model
|
|
from thinc.neural.optimizers import Adam
|
|
|
|
from .tokenizer import Tokenizer
|
|
from .vocab import Vocab
|
|
from .lemmatizer import Lemmatizer
|
|
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
|
from .pipeline import SimilarityHook, TextCategorizer
|
|
from .compat import json_dumps, izip
|
|
from .scorer import Scorer
|
|
from ._ml import link_vectors_to_models
|
|
from .attrs import IS_STOP
|
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
|
from .lang.punctuation import TOKENIZER_INFIXES
|
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
|
from .lang.tag_map import TAG_MAP
|
|
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
|
from . import util
|
|
from . import about
|
|
|
|
|
|
class BaseDefaults(object):
|
|
@classmethod
|
|
def create_lemmatizer(cls, nlp=None):
|
|
return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules,
|
|
cls.lemma_lookup)
|
|
|
|
@classmethod
|
|
def create_vocab(cls, nlp=None):
|
|
lemmatizer = cls.create_lemmatizer(nlp)
|
|
lex_attr_getters = dict(cls.lex_attr_getters)
|
|
# This is messy, but it's the minimal working fix to Issue #639.
|
|
lex_attr_getters[IS_STOP] = functools.partial(is_stop,
|
|
stops=cls.stop_words)
|
|
vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map,
|
|
lemmatizer=lemmatizer)
|
|
for tag_str, exc in cls.morph_rules.items():
|
|
for orth_str, attrs in exc.items():
|
|
vocab.morphology.add_special_case(tag_str, orth_str, attrs)
|
|
return vocab
|
|
|
|
@classmethod
|
|
def create_tokenizer(cls, nlp=None):
|
|
rules = cls.tokenizer_exceptions
|
|
token_match = cls.token_match
|
|
prefix_search = (util.compile_prefix_regex(cls.prefixes).search
|
|
if cls.prefixes else None)
|
|
suffix_search = (util.compile_suffix_regex(cls.suffixes).search
|
|
if cls.suffixes else None)
|
|
infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
|
|
if cls.infixes else None)
|
|
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
|
|
return Tokenizer(vocab, rules=rules,
|
|
prefix_search=prefix_search,
|
|
suffix_search=suffix_search,
|
|
infix_finditer=infix_finditer,
|
|
token_match=token_match)
|
|
|
|
pipe_names = ['tagger', 'parser', 'ner']
|
|
token_match = TOKEN_MATCH
|
|
prefixes = tuple(TOKENIZER_PREFIXES)
|
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
|
infixes = tuple(TOKENIZER_INFIXES)
|
|
tag_map = dict(TAG_MAP)
|
|
tokenizer_exceptions = {}
|
|
stop_words = set()
|
|
lemma_rules = {}
|
|
lemma_exc = {}
|
|
lemma_index = {}
|
|
lemma_lookup = {}
|
|
morph_rules = {}
|
|
lex_attr_getters = LEX_ATTRS
|
|
syntax_iterators = {}
|
|
|
|
|
|
class Language(object):
|
|
"""A text-processing pipeline. Usually you'll load this once per process,
|
|
and pass the instance around your application.
|
|
|
|
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
|
object and processing pipeline.
|
|
lang (unicode): Two-letter language ID, i.e. ISO code.
|
|
"""
|
|
Defaults = BaseDefaults
|
|
lang = None
|
|
|
|
factories = {
|
|
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
|
'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
|
|
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
|
|
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
|
|
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
|
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
|
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
|
|
}
|
|
|
|
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
|
"""Initialise a Language object.
|
|
|
|
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
|
`Language.Defaults.create_vocab`.
|
|
make_doc (callable): A function that takes text and returns a `Doc`
|
|
object. Usually a `Tokenizer`.
|
|
pipeline (list): A list of annotation processes or IDs of annotation,
|
|
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
|
|
up in `Language.Defaults.factories`.
|
|
disable (list): A list of component names to exclude from the pipeline.
|
|
The disable list has priority over the pipeline list -- if the same
|
|
string occurs in both, the component is not loaded.
|
|
meta (dict): Custom meta data for the Language class. Is written to by
|
|
models to add model meta data.
|
|
RETURNS (Language): The newly constructed object.
|
|
"""
|
|
self._meta = dict(meta)
|
|
self._path = None
|
|
if vocab is True:
|
|
factory = self.Defaults.create_vocab
|
|
vocab = factory(self, **meta.get('vocab', {}))
|
|
self.vocab = vocab
|
|
if make_doc is True:
|
|
factory = self.Defaults.create_tokenizer
|
|
make_doc = factory(self, **meta.get('tokenizer', {}))
|
|
self.tokenizer = make_doc
|
|
self.pipeline = []
|
|
self._optimizer = None
|
|
|
|
def __reduce__(self):
|
|
bytes_data = self.to_bytes(vocab=False)
|
|
return (unpickle_language, (self.vocab, self.meta, bytes_data))
|
|
|
|
@property
|
|
def path(self):
|
|
return self._path
|
|
|
|
@property
|
|
def meta(self):
|
|
self._meta.setdefault('lang', self.vocab.lang)
|
|
self._meta.setdefault('name', 'model')
|
|
self._meta.setdefault('version', '0.0.0')
|
|
self._meta.setdefault('spacy_version', about.__version__)
|
|
self._meta.setdefault('description', '')
|
|
self._meta.setdefault('author', '')
|
|
self._meta.setdefault('email', '')
|
|
self._meta.setdefault('url', '')
|
|
self._meta.setdefault('license', '')
|
|
self._meta['vectors'] = {'width': self.vocab.vectors_length,
|
|
'vectors': len(self.vocab.vectors),
|
|
'keys': self.vocab.vectors.n_keys}
|
|
self._meta['pipeline'] = self.pipe_names
|
|
return self._meta
|
|
|
|
@meta.setter
|
|
def meta(self, value):
|
|
self._meta = value
|
|
|
|
# Conveniences to access pipeline components
|
|
@property
|
|
def tensorizer(self):
|
|
return self.get_pipe('tensorizer')
|
|
|
|
@property
|
|
def tagger(self):
|
|
return self.get_pipe('tagger')
|
|
|
|
@property
|
|
def parser(self):
|
|
return self.get_pipe('parser')
|
|
|
|
@property
|
|
def entity(self):
|
|
return self.get_pipe('ner')
|
|
|
|
@property
|
|
def matcher(self):
|
|
return self.get_pipe('matcher')
|
|
|
|
@property
|
|
def pipe_names(self):
|
|
"""Get names of available pipeline components.
|
|
|
|
RETURNS (list): List of component name strings, in order.
|
|
"""
|
|
return [pipe_name for pipe_name, _ in self.pipeline]
|
|
|
|
def get_pipe(self, name):
|
|
"""Get a pipeline component for a given component name.
|
|
|
|
name (unicode): Name of pipeline component to get.
|
|
RETURNS (callable): The pipeline component.
|
|
"""
|
|
for pipe_name, component in self.pipeline:
|
|
if pipe_name == name:
|
|
return component
|
|
msg = "No component '{}' found in pipeline. Available names: {}"
|
|
raise KeyError(msg.format(name, self.pipe_names))
|
|
|
|
def create_pipe(self, name, config=dict()):
|
|
"""Create a pipeline component from a factory.
|
|
|
|
name (unicode): Factory name to look up in `Language.factories`.
|
|
config (dict): Configuration parameters to initialise component.
|
|
RETURNS (callable): Pipeline component.
|
|
"""
|
|
if name not in self.factories:
|
|
raise KeyError("Can't find factory for '{}'.".format(name))
|
|
factory = self.factories[name]
|
|
return factory(self, **config)
|
|
|
|
def add_pipe(self, component, name=None, before=None, after=None,
|
|
first=None, last=None):
|
|
"""Add a component to the processing pipeline. Valid components are
|
|
callables that take a `Doc` object, modify it and return it. Only one
|
|
of before/after/first/last can be set. Default behaviour is "last".
|
|
|
|
component (callable): The pipeline component.
|
|
name (unicode): Name of pipeline component. Overwrites existing
|
|
component.name attribute if available. If no name is set and
|
|
the component exposes no name attribute, component.__name__ is
|
|
used. An error is raised if a name already exists in the pipeline.
|
|
before (unicode): Component name to insert component directly before.
|
|
after (unicode): Component name to insert component directly after.
|
|
first (bool): Insert component first / not first in the pipeline.
|
|
last (bool): Insert component last / not last in the pipeline.
|
|
|
|
EXAMPLE:
|
|
>>> nlp.add_pipe(component, before='ner')
|
|
>>> nlp.add_pipe(component, name='custom_name', last=True)
|
|
"""
|
|
if name is None:
|
|
if hasattr(component, 'name'):
|
|
name = component.name
|
|
elif hasattr(component, '__name__'):
|
|
name = component.__name__
|
|
elif (hasattr(component, '__class__') and
|
|
hasattr(component.__class__, '__name__')):
|
|
name = component.__class__.__name__
|
|
else:
|
|
name = repr(component)
|
|
if name in self.pipe_names:
|
|
raise ValueError("'{}' already exists in pipeline.".format(name))
|
|
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
|
msg = ("Invalid constraints. You can only set one of the "
|
|
"following: before, after, first, last.")
|
|
raise ValueError(msg)
|
|
pipe = (name, component)
|
|
if last or not any([first, before, after]):
|
|
self.pipeline.append(pipe)
|
|
elif first:
|
|
self.pipeline.insert(0, pipe)
|
|
elif before and before in self.pipe_names:
|
|
self.pipeline.insert(self.pipe_names.index(before), pipe)
|
|
elif after and after in self.pipe_names:
|
|
self.pipeline.insert(self.pipe_names.index(after), pipe)
|
|
else:
|
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
|
unfound = before or after
|
|
raise ValueError(msg.format(unfound, self.pipe_names))
|
|
|
|
def has_pipe(self, name):
|
|
"""Check if a component name is present in the pipeline. Equivalent to
|
|
`name in nlp.pipe_names`.
|
|
|
|
name (unicode): Name of the component.
|
|
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
|
"""
|
|
return name in self.pipe_names
|
|
|
|
def replace_pipe(self, name, component):
|
|
"""Replace a component in the pipeline.
|
|
|
|
name (unicode): Name of the component to replace.
|
|
component (callable): Pipeline component.
|
|
"""
|
|
if name not in self.pipe_names:
|
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
|
raise ValueError(msg.format(name, self.pipe_names))
|
|
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
|
|
|
def rename_pipe(self, old_name, new_name):
|
|
"""Rename a pipeline component.
|
|
|
|
old_name (unicode): Name of the component to rename.
|
|
new_name (unicode): New name of the component.
|
|
"""
|
|
if old_name not in self.pipe_names:
|
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
|
raise ValueError(msg.format(old_name, self.pipe_names))
|
|
if new_name in self.pipe_names:
|
|
msg = "'{}' already exists in pipeline. Existing names: {}"
|
|
raise ValueError(msg.format(new_name, self.pipe_names))
|
|
i = self.pipe_names.index(old_name)
|
|
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
|
|
|
def remove_pipe(self, name):
|
|
"""Remove a component from the pipeline.
|
|
|
|
name (unicode): Name of the component to remove.
|
|
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
|
"""
|
|
if name not in self.pipe_names:
|
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
|
raise ValueError(msg.format(name, self.pipe_names))
|
|
return self.pipeline.pop(self.pipe_names.index(name))
|
|
|
|
def __call__(self, text, disable=[]):
|
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
|
and can contain arbtrary whitespace. Alignment into the original string
|
|
is preserved.
|
|
|
|
text (unicode): The text to be processed.
|
|
disable (list): Names of the pipeline components to disable.
|
|
RETURNS (Doc): A container for accessing the annotations.
|
|
|
|
EXAMPLE:
|
|
>>> tokens = nlp('An example sentence. Another example sentence.')
|
|
>>> tokens[0].text, tokens[0].head.tag_
|
|
('An', 'NN')
|
|
"""
|
|
doc = self.make_doc(text)
|
|
for name, proc in self.pipeline:
|
|
if name in disable:
|
|
continue
|
|
doc = proc(doc)
|
|
return doc
|
|
|
|
def disable_pipes(self, *names):
|
|
"""Disable one or more pipeline components. If used as a context
|
|
manager, the pipeline will be restored to the initial state at the end
|
|
of the block. Otherwise, a DisabledPipes object is returned, that has
|
|
a `.restore()` method you can use to undo your changes.
|
|
|
|
EXAMPLE:
|
|
>>> nlp.add_pipe('parser')
|
|
>>> nlp.add_pipe('tagger')
|
|
>>> with nlp.disable_pipes('parser', 'tagger'):
|
|
>>> assert not nlp.has_pipe('parser')
|
|
>>> assert nlp.has_pipe('parser')
|
|
>>> disabled = nlp.disable_pipes('parser')
|
|
>>> assert len(disabled) == 1
|
|
>>> assert not nlp.has_pipe('parser')
|
|
>>> disabled.restore()
|
|
>>> assert nlp.has_pipe('parser')
|
|
"""
|
|
return DisabledPipes(self, *names)
|
|
|
|
def make_doc(self, text):
|
|
return self.tokenizer(text)
|
|
|
|
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
|
"""Update the models in the pipeline.
|
|
|
|
docs (iterable): A batch of `Doc` objects.
|
|
golds (iterable): A batch of `GoldParse` objects.
|
|
drop (float): The droput rate.
|
|
sgd (callable): An optimizer.
|
|
RETURNS (dict): Results from the update.
|
|
|
|
EXAMPLE:
|
|
>>> with nlp.begin_training(gold) as (trainer, optimizer):
|
|
>>> for epoch in trainer.epochs(gold):
|
|
>>> for docs, golds in epoch:
|
|
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
|
"""
|
|
if len(docs) != len(golds):
|
|
raise IndexError("Update expects same number of docs and golds "
|
|
"Got: %d, %d" % (len(docs), len(golds)))
|
|
if len(docs) == 0:
|
|
return
|
|
if sgd is None:
|
|
if self._optimizer is None:
|
|
self._optimizer = Adam(Model.ops, 0.001)
|
|
sgd = self._optimizer
|
|
grads = {}
|
|
|
|
def get_grads(W, dW, key=None):
|
|
grads[key] = (W, dW)
|
|
|
|
pipes = list(self.pipeline)
|
|
random.shuffle(pipes)
|
|
for name, proc in pipes:
|
|
if not hasattr(proc, 'update'):
|
|
continue
|
|
grads = {}
|
|
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
|
for key, (W, dW) in grads.items():
|
|
sgd(W, dW, key=key)
|
|
|
|
def preprocess_gold(self, docs_golds):
|
|
"""Can be called before training to pre-process gold data. By default,
|
|
it handles nonprojectivity and adds missing tags to the tag map.
|
|
|
|
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
|
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
|
"""
|
|
for name, proc in self.pipeline:
|
|
if hasattr(proc, 'preprocess_gold'):
|
|
docs_golds = proc.preprocess_gold(docs_golds)
|
|
for doc, gold in docs_golds:
|
|
yield doc, gold
|
|
|
|
def resume_training(self, **cfg):
|
|
if cfg.get('device', -1) >= 0:
|
|
device = util.use_gpu(cfg['device'])
|
|
if self.vocab.vectors.data.shape[1] >= 1:
|
|
self.vocab.vectors.data = Model.ops.asarray(
|
|
self.vocab.vectors.data)
|
|
else:
|
|
device = None
|
|
learn_rate = util.env_opt('learn_rate', 0.001)
|
|
beta1 = util.env_opt('optimizer_B1', 0.9)
|
|
beta2 = util.env_opt('optimizer_B2', 0.999)
|
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
|
beta2=beta2, eps=eps)
|
|
self._optimizer.max_grad_norm = max_grad_norm
|
|
self._optimizer.device = device
|
|
return self._optimizer
|
|
|
|
def begin_training(self, get_gold_tuples=None, **cfg):
|
|
"""Allocate models, pre-process training data and acquire a trainer and
|
|
optimizer. Used as a contextmanager.
|
|
|
|
get_gold_tuples (function): Function returning gold data
|
|
**cfg: Config parameters.
|
|
RETURNS: An optimizer
|
|
"""
|
|
if get_gold_tuples is None:
|
|
get_gold_tuples = lambda: []
|
|
# Populate vocab
|
|
else:
|
|
for _, annots_brackets in get_gold_tuples():
|
|
for annots, _ in annots_brackets:
|
|
for word in annots[1]:
|
|
_ = self.vocab[word]
|
|
contexts = []
|
|
if cfg.get('device', -1) >= 0:
|
|
device = util.use_gpu(cfg['device'])
|
|
if self.vocab.vectors.data.shape[1] >= 1:
|
|
self.vocab.vectors.data = Model.ops.asarray(
|
|
self.vocab.vectors.data)
|
|
else:
|
|
device = None
|
|
link_vectors_to_models(self.vocab)
|
|
for name, proc in self.pipeline:
|
|
if hasattr(proc, 'begin_training'):
|
|
context = proc.begin_training(get_gold_tuples(),
|
|
pipeline=self.pipeline)
|
|
contexts.append(context)
|
|
learn_rate = util.env_opt('learn_rate', 0.001)
|
|
beta1 = util.env_opt('optimizer_B1', 0.9)
|
|
beta2 = util.env_opt('optimizer_B2', 0.999)
|
|
eps = util.env_opt('optimizer_eps', 1e-08)
|
|
L2 = util.env_opt('L2_penalty', 1e-6)
|
|
max_grad_norm = util.env_opt('grad_norm_clip', 1.)
|
|
self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1,
|
|
beta2=beta2, eps=eps)
|
|
self._optimizer.max_grad_norm = max_grad_norm
|
|
self._optimizer.device = device
|
|
return self._optimizer
|
|
|
|
def evaluate(self, docs_golds, verbose=False):
|
|
scorer = Scorer()
|
|
docs, golds = zip(*docs_golds)
|
|
docs = list(docs)
|
|
golds = list(golds)
|
|
for name, pipe in self.pipeline:
|
|
if not hasattr(pipe, 'pipe'):
|
|
docs = (pipe(doc) for doc in docs)
|
|
else:
|
|
docs = pipe.pipe(docs, batch_size=256)
|
|
for doc, gold in zip(docs, golds):
|
|
if verbose:
|
|
print(doc)
|
|
scorer.score(doc, gold, verbose=verbose)
|
|
return scorer
|
|
|
|
@contextmanager
|
|
def use_params(self, params, **cfg):
|
|
"""Replace weights of models in the pipeline with those provided in the
|
|
params dictionary. Can be used as a contextmanager, in which case,
|
|
models go back to their original weights after the block.
|
|
|
|
params (dict): A dictionary of parameters keyed by model ID.
|
|
**cfg: Config parameters.
|
|
|
|
EXAMPLE:
|
|
>>> with nlp.use_params(optimizer.averages):
|
|
>>> nlp.to_disk('/tmp/checkpoint')
|
|
"""
|
|
contexts = [pipe.use_params(params) for name, pipe
|
|
in self.pipeline if hasattr(pipe, 'use_params')]
|
|
# TODO: Having trouble with contextlib
|
|
# Workaround: these aren't actually context managers atm.
|
|
for context in contexts:
|
|
try:
|
|
next(context)
|
|
except StopIteration:
|
|
pass
|
|
yield
|
|
for context in contexts:
|
|
try:
|
|
next(context)
|
|
except StopIteration:
|
|
pass
|
|
|
|
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
|
|
disable=[]):
|
|
"""Process texts as a stream, and yield `Doc` objects in order.
|
|
Supports GIL-free multi-threading.
|
|
|
|
texts (iterator): A sequence of texts to process.
|
|
as_tuples (bool):
|
|
If set to True, inputs should be a sequence of
|
|
(text, context) tuples. Output will then be a sequence of
|
|
(doc, context) tuples. Defaults to False.
|
|
n_threads (int): The number of worker threads to use. If -1, OpenMP
|
|
will decide how many to use at run time. Default is 2.
|
|
batch_size (int): The number of texts to buffer.
|
|
disable (list): Names of the pipeline components to disable.
|
|
YIELDS (Doc): Documents in the order of the original text.
|
|
|
|
EXAMPLE:
|
|
>>> texts = [u'One document.', u'...', u'Lots of documents']
|
|
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
|
>>> assert doc.is_parsed
|
|
"""
|
|
if as_tuples:
|
|
text_context1, text_context2 = itertools.tee(texts)
|
|
texts = (tc[0] for tc in text_context1)
|
|
contexts = (tc[1] for tc in text_context2)
|
|
docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size,
|
|
disable=disable)
|
|
for doc, context in izip(docs, contexts):
|
|
yield (doc, context)
|
|
return
|
|
docs = (self.make_doc(text) for text in texts)
|
|
for name, proc in self.pipeline:
|
|
if name in disable:
|
|
continue
|
|
if hasattr(proc, 'pipe'):
|
|
docs = proc.pipe(docs, n_threads=n_threads,
|
|
batch_size=batch_size)
|
|
else:
|
|
# Apply the function, but yield the doc
|
|
docs = _pipe(proc, docs)
|
|
# Track weakrefs of "recent" documents, so that we can see when they
|
|
# expire from memory. When they do, we know we don't need old strings.
|
|
# This way, we avoid maintaining an unbounded growth in string entries
|
|
# in the string store.
|
|
recent_refs = weakref.WeakSet()
|
|
old_refs = weakref.WeakSet()
|
|
original_strings_data = self.vocab.strings.to_bytes()
|
|
StringStore = self.vocab.strings.__class__
|
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
|
nr_seen = 0
|
|
for doc in docs:
|
|
yield doc
|
|
for word in doc:
|
|
recent_strings.add(word.text)
|
|
recent_refs.add(doc)
|
|
if nr_seen < 10000:
|
|
old_refs.add(doc)
|
|
nr_seen += 1
|
|
elif len(old_refs) == 0:
|
|
# All the docs in the 'old' set have expired, so the only
|
|
# difference between the backup strings and the current
|
|
# string-store should be obsolete. We therefore swap out the
|
|
# old strings data.
|
|
old_refs, recent_refs = recent_refs, old_refs
|
|
self.vocab.strings._reset_and_load(recent_strings)
|
|
recent_strings = StringStore().from_bytes(original_strings_data)
|
|
nr_seen = 0
|
|
|
|
def to_disk(self, path, disable=tuple()):
|
|
"""Save the current state to a directory. If a model is loaded, this
|
|
will include the model.
|
|
|
|
path (unicode or Path): A path to a directory, which will be created if
|
|
it doesn't exist. Paths may be strings or `Path`-like objects.
|
|
disable (list): Names of pipeline components to disable and prevent
|
|
from being saved.
|
|
|
|
EXAMPLE:
|
|
>>> nlp.to_disk('/path/to/models')
|
|
"""
|
|
path = util.ensure_path(path)
|
|
serializers = OrderedDict((
|
|
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
|
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
|
))
|
|
for name, proc in self.pipeline:
|
|
if not hasattr(proc, 'name'):
|
|
continue
|
|
if name in disable:
|
|
continue
|
|
if not hasattr(proc, 'to_disk'):
|
|
continue
|
|
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
|
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
|
util.to_disk(path, serializers, {p: False for p in disable})
|
|
|
|
def from_disk(self, path, disable=tuple()):
|
|
"""Loads state from a directory. Modifies the object in place and
|
|
returns it. If the saved `Language` object contains a model, the
|
|
model will be loaded.
|
|
|
|
path (unicode or Path): A path to a directory. Paths may be either
|
|
strings or `Path`-like objects.
|
|
disable (list): Names of the pipeline components to disable.
|
|
RETURNS (Language): The modified `Language` object.
|
|
|
|
EXAMPLE:
|
|
>>> from spacy.language import Language
|
|
>>> nlp = Language().from_disk('/path/to/models')
|
|
"""
|
|
path = util.ensure_path(path)
|
|
deserializers = OrderedDict((
|
|
('vocab', lambda p: self.vocab.from_disk(p)),
|
|
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
|
('meta.json', lambda p: self.meta.update(ujson.load(p.open('r'))))
|
|
))
|
|
for name, proc in self.pipeline:
|
|
if name in disable:
|
|
continue
|
|
if not hasattr(proc, 'to_disk'):
|
|
continue
|
|
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
|
exclude = {p: False for p in disable}
|
|
if not (path / 'vocab').exists():
|
|
exclude['vocab'] = True
|
|
util.from_disk(path, deserializers, exclude)
|
|
self._path = path
|
|
return self
|
|
|
|
def to_bytes(self, disable=[], **exclude):
|
|
"""Serialize the current state to a binary string.
|
|
|
|
disable (list): Nameds of pipeline components to disable and prevent
|
|
from being serialized.
|
|
RETURNS (bytes): The serialized form of the `Language` object.
|
|
"""
|
|
serializers = OrderedDict((
|
|
('vocab', lambda: self.vocab.to_bytes()),
|
|
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
|
('meta', lambda: json_dumps(self.meta))
|
|
))
|
|
for i, (name, proc) in enumerate(self.pipeline):
|
|
if name in disable:
|
|
continue
|
|
if not hasattr(proc, 'to_bytes'):
|
|
continue
|
|
serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False)
|
|
return util.to_bytes(serializers, exclude)
|
|
|
|
def from_bytes(self, bytes_data, disable=[]):
|
|
"""Load state from a binary string.
|
|
|
|
bytes_data (bytes): The data to load from.
|
|
disable (list): Names of the pipeline components to disable.
|
|
RETURNS (Language): The `Language` object.
|
|
"""
|
|
deserializers = OrderedDict((
|
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
|
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
|
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
|
))
|
|
for i, (name, proc) in enumerate(self.pipeline):
|
|
if name in disable:
|
|
continue
|
|
if not hasattr(proc, 'from_bytes'):
|
|
continue
|
|
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
|
|
msg = util.from_bytes(bytes_data, deserializers, {})
|
|
return self
|
|
|
|
|
|
class DisabledPipes(list):
|
|
"""Manager for temporary pipeline disabling."""
|
|
def __init__(self, nlp, *names):
|
|
self.nlp = nlp
|
|
self.names = names
|
|
# Important! Not deep copy -- we just want the container (but we also
|
|
# want to support people providing arbitrarily typed nlp.pipeline
|
|
# objects.)
|
|
self.original_pipeline = copy(nlp.pipeline)
|
|
list.__init__(self)
|
|
self.extend(nlp.remove_pipe(name) for name in names)
|
|
|
|
def __enter__(self):
|
|
return self
|
|
|
|
def __exit__(self, *args):
|
|
self.restore()
|
|
|
|
def restore(self):
|
|
'''Restore the pipeline to its state when DisabledPipes was created.'''
|
|
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
|
|
unexpected = [name for name, pipe in current
|
|
if not self.nlp.has_pipe(name)]
|
|
if unexpected:
|
|
# Don't change the pipeline if we're raising an error.
|
|
self.nlp.pipeline = current
|
|
msg = (
|
|
"Some current components would be lost when restoring "
|
|
"previous pipeline state. If you added components after "
|
|
"calling nlp.disable_pipes(), you should remove them "
|
|
"explicitly with nlp.remove_pipe() before the pipeline is "
|
|
"restore. Names of the new components: %s"
|
|
)
|
|
raise ValueError(msg % unexpected)
|
|
self[:] = []
|
|
|
|
|
|
def unpickle_language(vocab, meta, bytes_data):
|
|
lang = Language(vocab=vocab)
|
|
lang.from_bytes(bytes_data)
|
|
return lang
|
|
|
|
|
|
def _pipe(func, docs):
|
|
for doc in docs:
|
|
func(doc)
|
|
yield doc
|