mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-25 11:23:40 +03:00
Merge branch 'feature/pipeline-management' into feature/dot-underscore
This commit is contained in:
commit
de374dc72a
|
@ -1,12 +1,9 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
import dill
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.optimizers import Adam
|
||||||
from thinc.neural.optimizers import Adam, SGD
|
|
||||||
import random
|
import random
|
||||||
import ujson
|
import ujson
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
@ -17,24 +14,20 @@ from .vocab import Vocab
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .syntax.parser import get_templates
|
from .syntax.parser import get_templates
|
||||||
from .syntax import nonproj
|
|
||||||
|
|
||||||
from .pipeline import NeuralDependencyParser, EntityRecognizer
|
from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
|
||||||
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
|
||||||
from .pipeline import NeuralLabeller
|
|
||||||
from .pipeline import SimilarityHook
|
|
||||||
from .pipeline import TextCategorizer
|
|
||||||
from . import about
|
|
||||||
|
|
||||||
from .compat import json_dumps, izip
|
from .compat import json_dumps, izip
|
||||||
|
from .scorer import Scorer
|
||||||
|
from ._ml import link_vectors_to_models
|
||||||
from .attrs import IS_STOP
|
from .attrs import IS_STOP
|
||||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||||
from .lang.tag_map import TAG_MAP
|
from .lang.tag_map import TAG_MAP
|
||||||
from .lang.lex_attrs import LEX_ATTRS
|
from .lang.lex_attrs import LEX_ATTRS
|
||||||
from . import util
|
from . import util
|
||||||
from .scorer import Scorer
|
from . import about
|
||||||
from ._ml import link_vectors_to_models
|
|
||||||
|
|
||||||
|
|
||||||
class BaseDefaults(object):
|
class BaseDefaults(object):
|
||||||
|
@ -70,59 +63,7 @@ class BaseDefaults(object):
|
||||||
prefix_search=prefix_search, suffix_search=suffix_search,
|
prefix_search=prefix_search, suffix_search=suffix_search,
|
||||||
infix_finditer=infix_finditer, token_match=token_match)
|
infix_finditer=infix_finditer, token_match=token_match)
|
||||||
|
|
||||||
@classmethod
|
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
|
||||||
def create_tagger(cls, nlp=None, **cfg):
|
|
||||||
if nlp is None:
|
|
||||||
return NeuralTagger(cls.create_vocab(nlp), **cfg)
|
|
||||||
else:
|
|
||||||
return NeuralTagger(nlp.vocab, **cfg)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def create_parser(cls, nlp=None, **cfg):
|
|
||||||
if nlp is None:
|
|
||||||
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
|
|
||||||
else:
|
|
||||||
return NeuralDependencyParser(nlp.vocab, **cfg)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def create_entity(cls, nlp=None, **cfg):
|
|
||||||
if nlp is None:
|
|
||||||
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
|
|
||||||
else:
|
|
||||||
return NeuralEntityRecognizer(nlp.vocab, **cfg)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def create_pipeline(cls, nlp=None, disable=tuple()):
|
|
||||||
meta = nlp.meta if nlp is not None else {}
|
|
||||||
# Resolve strings, like "cnn", "lstm", etc
|
|
||||||
pipeline = []
|
|
||||||
for entry in meta.get('pipeline', []):
|
|
||||||
if entry in disable or getattr(entry, 'name', entry) in disable:
|
|
||||||
continue
|
|
||||||
factory = cls.Defaults.factories[entry]
|
|
||||||
pipeline.append(factory(nlp, **meta.get(entry, {})))
|
|
||||||
return pipeline
|
|
||||||
|
|
||||||
factories = {
|
|
||||||
'make_doc': create_tokenizer,
|
|
||||||
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
|
||||||
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
|
||||||
'parser': lambda nlp, **cfg: [
|
|
||||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
|
||||||
nonproj.deprojectivize],
|
|
||||||
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
|
||||||
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
|
|
||||||
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
|
|
||||||
# Temporary compatibility -- delete after pivot
|
|
||||||
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
|
|
||||||
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
|
|
||||||
'dependencies': lambda nlp, **cfg: [
|
|
||||||
NeuralDependencyParser(nlp.vocab, **cfg),
|
|
||||||
nonproj.deprojectivize,
|
|
||||||
],
|
|
||||||
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
|
|
||||||
}
|
|
||||||
|
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||||
|
@ -152,8 +93,17 @@ class Language(object):
|
||||||
Defaults = BaseDefaults
|
Defaults = BaseDefaults
|
||||||
lang = None
|
lang = None
|
||||||
|
|
||||||
def __init__(self, vocab=True, make_doc=True, pipeline=None,
|
factories = {
|
||||||
meta={}, disable=tuple(), **kwargs):
|
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||||
|
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
|
||||||
|
'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
|
||||||
|
'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
|
||||||
|
'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
|
||||||
|
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||||
|
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||||
"""Initialise a Language object.
|
"""Initialise a Language object.
|
||||||
|
|
||||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||||
|
@ -179,28 +129,7 @@ class Language(object):
|
||||||
factory = self.Defaults.create_tokenizer
|
factory = self.Defaults.create_tokenizer
|
||||||
make_doc = factory(self, **meta.get('tokenizer', {}))
|
make_doc = factory(self, **meta.get('tokenizer', {}))
|
||||||
self.tokenizer = make_doc
|
self.tokenizer = make_doc
|
||||||
if pipeline is True:
|
|
||||||
self.pipeline = self.Defaults.create_pipeline(self, disable)
|
|
||||||
elif pipeline:
|
|
||||||
# Careful not to do getattr(p, 'name', None) here
|
|
||||||
# If we had disable=[None], we'd disable everything!
|
|
||||||
self.pipeline = [p for p in pipeline
|
|
||||||
if p not in disable
|
|
||||||
and getattr(p, 'name', p) not in disable]
|
|
||||||
# Resolve strings, like "cnn", "lstm", etc
|
|
||||||
for i, entry in enumerate(self.pipeline):
|
|
||||||
if entry in self.Defaults.factories:
|
|
||||||
factory = self.Defaults.factories[entry]
|
|
||||||
self.pipeline[i] = factory(self, **meta.get(entry, {}))
|
|
||||||
else:
|
|
||||||
self.pipeline = []
|
self.pipeline = []
|
||||||
flat_list = []
|
|
||||||
for pipe in self.pipeline:
|
|
||||||
if isinstance(pipe, list):
|
|
||||||
flat_list.extend(pipe)
|
|
||||||
else:
|
|
||||||
flat_list.append(pipe)
|
|
||||||
self.pipeline = flat_list
|
|
||||||
self._optimizer = None
|
self._optimizer = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -214,11 +143,7 @@ class Language(object):
|
||||||
self._meta.setdefault('email', '')
|
self._meta.setdefault('email', '')
|
||||||
self._meta.setdefault('url', '')
|
self._meta.setdefault('url', '')
|
||||||
self._meta.setdefault('license', '')
|
self._meta.setdefault('license', '')
|
||||||
pipeline = []
|
self._meta['pipeline'] = self.pipe_names
|
||||||
for component in self.pipeline:
|
|
||||||
if hasattr(component, 'name'):
|
|
||||||
pipeline.append(component.name)
|
|
||||||
self._meta['pipeline'] = pipeline
|
|
||||||
return self._meta
|
return self._meta
|
||||||
|
|
||||||
@meta.setter
|
@meta.setter
|
||||||
|
@ -228,34 +153,137 @@ class Language(object):
|
||||||
# Conveniences to access pipeline components
|
# Conveniences to access pipeline components
|
||||||
@property
|
@property
|
||||||
def tensorizer(self):
|
def tensorizer(self):
|
||||||
return self.get_component('tensorizer')
|
return self.get_pipe('tensorizer')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tagger(self):
|
def tagger(self):
|
||||||
return self.get_component('tagger')
|
return self.get_pipe('tagger')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def parser(self):
|
def parser(self):
|
||||||
return self.get_component('parser')
|
return self.get_pipe('parser')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def entity(self):
|
def entity(self):
|
||||||
return self.get_component('ner')
|
return self.get_pipe('ner')
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def matcher(self):
|
def matcher(self):
|
||||||
return self.get_component('matcher')
|
return self.get_pipe('matcher')
|
||||||
|
|
||||||
def get_component(self, name):
|
@property
|
||||||
if self.pipeline in (True, None):
|
def pipe_names(self):
|
||||||
return None
|
"""Get names of available pipeline components.
|
||||||
for proc in self.pipeline:
|
|
||||||
if hasattr(proc, 'name') and proc.name.endswith(name):
|
RETURNS (list): List of component name strings, in order.
|
||||||
return proc
|
"""
|
||||||
return None
|
return [pipe_name for pipe_name, _ in self.pipeline]
|
||||||
|
|
||||||
|
def get_pipe(self, name):
|
||||||
|
"""Get a pipeline component for a given component name.
|
||||||
|
|
||||||
|
name (unicode): Name of pipeline component to get.
|
||||||
|
RETURNS (callable): The pipeline component.
|
||||||
|
"""
|
||||||
|
for pipe_name, component in self.pipeline:
|
||||||
|
if pipe_name == name:
|
||||||
|
return component
|
||||||
|
msg = "No component '{}' found in pipeline. Available names: {}"
|
||||||
|
raise KeyError(msg.format(name, self.pipe_names))
|
||||||
|
|
||||||
|
def create_pipe(self, name, config=dict()):
|
||||||
|
"""Create a pipeline component from a factory.
|
||||||
|
|
||||||
|
name (unicode): Factory name to look up in `Language.factories`.
|
||||||
|
config (dict): Configuration parameters to initialise component.
|
||||||
|
RETURNS (callable): Pipeline component.
|
||||||
|
"""
|
||||||
|
if name not in self.factories:
|
||||||
|
raise KeyError("Can't find factory for '{}'.".format(name))
|
||||||
|
factory = self.factories[name]
|
||||||
|
return factory(self, **config)
|
||||||
|
|
||||||
|
def add_pipe(self, component, name=None, before=None, after=None,
|
||||||
|
first=None, last=None):
|
||||||
|
"""Add a component to the processing pipeline. Valid components are
|
||||||
|
callables that take a `Doc` object, modify it and return it. Only one of
|
||||||
|
before, after, first or last can be set. Default behaviour is "last".
|
||||||
|
|
||||||
|
component (callable): The pipeline component.
|
||||||
|
name (unicode): Name of pipeline component. Overwrites existing
|
||||||
|
component.name attribute if available. If no name is set and
|
||||||
|
the component exposes no name attribute, component.__name__ is
|
||||||
|
used. An error is raised if the name already exists in the pipeline.
|
||||||
|
before (unicode): Component name to insert component directly before.
|
||||||
|
after (unicode): Component name to insert component directly after.
|
||||||
|
first (bool): Insert component first / not first in the pipeline.
|
||||||
|
last (bool): Insert component last / not last in the pipeline.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> nlp.add_pipe(component, before='ner')
|
||||||
|
>>> nlp.add_pipe(component, name='custom_name', last=True)
|
||||||
|
"""
|
||||||
|
if name is None:
|
||||||
|
name = getattr(component, 'name', component.__name__)
|
||||||
|
if name in self.pipe_names:
|
||||||
|
raise ValueError("'{}' already exists in pipeline.".format(name))
|
||||||
|
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
||||||
|
msg = ("Invalid constraints. You can only set one of the "
|
||||||
|
"following: before, after, first, last.")
|
||||||
|
raise ValueError(msg)
|
||||||
|
pipe = (name, component)
|
||||||
|
if last or not any([first, before, after]):
|
||||||
|
self.pipeline.append(pipe)
|
||||||
|
elif first:
|
||||||
|
self.pipeline.insert(0, pipe)
|
||||||
|
elif before and before in self.pipe_names:
|
||||||
|
self.pipeline.insert(self.pipe_names.index(before), pipe)
|
||||||
|
elif after and after in self.pipe_names:
|
||||||
|
self.pipeline.insert(self.pipe_names.index(after), pipe)
|
||||||
|
else:
|
||||||
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||||
|
unfound = before or after
|
||||||
|
raise ValueError(msg.format(unfound, self.pipe_names))
|
||||||
|
|
||||||
|
def replace_pipe(self, name, component):
|
||||||
|
"""Replace a component in the pipeline.
|
||||||
|
|
||||||
|
name (unicode): Name of the component to replace.
|
||||||
|
component (callable): Pipeline component.
|
||||||
|
"""
|
||||||
|
if name not in self.pipe_names:
|
||||||
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||||
|
raise ValueError(msg.format(name, self.pipe_names))
|
||||||
|
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
||||||
|
|
||||||
|
def rename_pipe(self, old_name, new_name):
|
||||||
|
"""Rename a pipeline component.
|
||||||
|
|
||||||
|
old_name (unicode): Name of the component to rename.
|
||||||
|
new_name (unicode): New name of the component.
|
||||||
|
"""
|
||||||
|
if old_name not in self.pipe_names:
|
||||||
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||||
|
raise ValueError(msg.format(old_name, self.pipe_names))
|
||||||
|
if new_name in self.pipe_names:
|
||||||
|
msg = "'{}' already exists in pipeline. Existing names: {}"
|
||||||
|
raise ValueError(msg.format(new_name, self.pipe_names))
|
||||||
|
i = self.pipe_names.index(old_name)
|
||||||
|
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
||||||
|
|
||||||
|
def remove_pipe(self, name):
|
||||||
|
"""Remove a component from the pipeline.
|
||||||
|
|
||||||
|
name (unicode): Name of the component to remove.
|
||||||
|
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||||
|
"""
|
||||||
|
if name not in self.pipe_names:
|
||||||
|
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||||
|
raise ValueError(msg.format(name, self.pipe_names))
|
||||||
|
return self.pipeline.pop(self.pipe_names.index(name))
|
||||||
|
|
||||||
def __call__(self, text, disable=[]):
|
def __call__(self, text, disable=[]):
|
||||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
and can contain arbtrary whitespace. Alignment into the original string
|
and can contain arbtrary whitespace. Alignment into the original string
|
||||||
is preserved.
|
is preserved.
|
||||||
|
|
||||||
|
@ -269,8 +297,7 @@ class Language(object):
|
||||||
('An', 'NN')
|
('An', 'NN')
|
||||||
"""
|
"""
|
||||||
doc = self.make_doc(text)
|
doc = self.make_doc(text)
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
name = getattr(proc, 'name', None)
|
|
||||||
if name in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
doc = proc(doc)
|
doc = proc(doc)
|
||||||
|
@ -308,7 +335,7 @@ class Language(object):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
pipes = list(self.pipeline)
|
pipes = list(self.pipeline)
|
||||||
random.shuffle(pipes)
|
random.shuffle(pipes)
|
||||||
for proc in pipes:
|
for name, proc in pipes:
|
||||||
if not hasattr(proc, 'update'):
|
if not hasattr(proc, 'update'):
|
||||||
continue
|
continue
|
||||||
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
|
||||||
|
@ -322,7 +349,7 @@ class Language(object):
|
||||||
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
|
||||||
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
|
||||||
"""
|
"""
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, 'preprocess_gold'):
|
if hasattr(proc, 'preprocess_gold'):
|
||||||
docs_golds = proc.preprocess_gold(docs_golds)
|
docs_golds = proc.preprocess_gold(docs_golds)
|
||||||
for doc, gold in docs_golds:
|
for doc, gold in docs_golds:
|
||||||
|
@ -354,7 +381,7 @@ class Language(object):
|
||||||
|
|
||||||
get_gold_tuples (function): Function returning gold data
|
get_gold_tuples (function): Function returning gold data
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
returns: An optimizer
|
RETURNS: An optimizer
|
||||||
"""
|
"""
|
||||||
# Populate vocab
|
# Populate vocab
|
||||||
if get_gold_tuples is not None:
|
if get_gold_tuples is not None:
|
||||||
|
@ -371,7 +398,7 @@ class Language(object):
|
||||||
else:
|
else:
|
||||||
device = None
|
device = None
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if hasattr(proc, 'begin_training'):
|
if hasattr(proc, 'begin_training'):
|
||||||
context = proc.begin_training(get_gold_tuples(),
|
context = proc.begin_training(get_gold_tuples(),
|
||||||
pipeline=self.pipeline)
|
pipeline=self.pipeline)
|
||||||
|
@ -393,7 +420,7 @@ class Language(object):
|
||||||
docs, golds = zip(*docs_golds)
|
docs, golds = zip(*docs_golds)
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
golds = list(golds)
|
golds = list(golds)
|
||||||
for pipe in self.pipeline:
|
for name, pipe in self.pipeline:
|
||||||
if not hasattr(pipe, 'pipe'):
|
if not hasattr(pipe, 'pipe'):
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
pipe(doc)
|
pipe(doc)
|
||||||
|
@ -419,7 +446,7 @@ class Language(object):
|
||||||
>>> with nlp.use_params(optimizer.averages):
|
>>> with nlp.use_params(optimizer.averages):
|
||||||
>>> nlp.to_disk('/tmp/checkpoint')
|
>>> nlp.to_disk('/tmp/checkpoint')
|
||||||
"""
|
"""
|
||||||
contexts = [pipe.use_params(params) for pipe
|
contexts = [pipe.use_params(params) for name, pipe
|
||||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||||
# TODO: Having trouble with contextlib
|
# TODO: Having trouble with contextlib
|
||||||
# Workaround: these aren't actually context managers atm.
|
# Workaround: these aren't actually context managers atm.
|
||||||
|
@ -466,8 +493,7 @@ class Language(object):
|
||||||
yield (doc, context)
|
yield (doc, context)
|
||||||
return
|
return
|
||||||
docs = (self.make_doc(text) for text in texts)
|
docs = (self.make_doc(text) for text in texts)
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
name = getattr(proc, 'name', None)
|
|
||||||
if name in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
if hasattr(proc, 'pipe'):
|
if hasattr(proc, 'pipe'):
|
||||||
|
@ -495,14 +521,14 @@ class Language(object):
|
||||||
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||||
))
|
))
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, 'name'):
|
if not hasattr(proc, 'name'):
|
||||||
continue
|
continue
|
||||||
if proc.name in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, 'to_disk'):
|
if not hasattr(proc, 'to_disk'):
|
||||||
continue
|
continue
|
||||||
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
|
||||||
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||||
util.to_disk(path, serializers, {p: False for p in disable})
|
util.to_disk(path, serializers, {p: False for p in disable})
|
||||||
|
|
||||||
|
@ -526,14 +552,12 @@ class Language(object):
|
||||||
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
|
||||||
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
|
||||||
))
|
))
|
||||||
for proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
if not hasattr(proc, 'name'):
|
if name in disable:
|
||||||
continue
|
|
||||||
if proc.name in disable:
|
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, 'to_disk'):
|
if not hasattr(proc, 'to_disk'):
|
||||||
continue
|
continue
|
||||||
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
|
||||||
exclude = {p: False for p in disable}
|
exclude = {p: False for p in disable}
|
||||||
if not (path / 'vocab').exists():
|
if not (path / 'vocab').exists():
|
||||||
exclude['vocab'] = True
|
exclude['vocab'] = True
|
||||||
|
@ -552,8 +576,8 @@ class Language(object):
|
||||||
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||||
('meta', lambda: ujson.dumps(self.meta))
|
('meta', lambda: ujson.dumps(self.meta))
|
||||||
))
|
))
|
||||||
for i, proc in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
if getattr(proc, 'name', None) in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, 'to_bytes'):
|
if not hasattr(proc, 'to_bytes'):
|
||||||
continue
|
continue
|
||||||
|
@ -572,8 +596,8 @@ class Language(object):
|
||||||
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
|
||||||
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
('meta', lambda b: self.meta.update(ujson.loads(b)))
|
||||||
))
|
))
|
||||||
for i, proc in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
if getattr(proc, 'name', None) in disable:
|
if name in disable:
|
||||||
continue
|
continue
|
||||||
if not hasattr(proc, 'from_bytes'):
|
if not hasattr(proc, 'from_bytes'):
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
from .syntax.parser cimport Parser as LinearParser
|
from .syntax.parser cimport Parser as LinearParser
|
||||||
from .syntax.nn_parser cimport Parser as NeuralParser
|
from .syntax.nn_parser cimport Parser as NeuralParser
|
||||||
|
from .syntax import nonproj
|
||||||
from .syntax.parser import get_templates as get_feature_templates
|
from .syntax.parser import get_templates as get_feature_templates
|
||||||
from .syntax.beam_parser cimport BeamParser
|
from .syntax.beam_parser cimport BeamParser
|
||||||
from .syntax.ner cimport BiluoPushDown
|
from .syntax.ner cimport BiluoPushDown
|
||||||
|
@ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser):
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def postprocesses(self):
|
||||||
|
return [nonproj.deprojectivize]
|
||||||
|
|
||||||
|
|
||||||
cdef class NeuralDependencyParser(NeuralParser):
|
cdef class NeuralDependencyParser(NeuralParser):
|
||||||
name = 'parser'
|
name = 'parser'
|
||||||
TransitionSystem = ArcEager
|
TransitionSystem = ArcEager
|
||||||
|
|
||||||
|
@property
|
||||||
|
def postprocesses(self):
|
||||||
|
return [nonproj.deprojectivize]
|
||||||
|
|
||||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||||
for target in []:
|
for target in []:
|
||||||
labeller = NeuralLabeller(self.vocab, target=target)
|
labeller = NeuralLabeller(self.vocab, target=target)
|
||||||
|
@ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser):
|
||||||
if isinstance(label, basestring):
|
if isinstance(label, basestring):
|
||||||
label = self.vocab.strings[label]
|
label = self.vocab.strings[label]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def postprocesses(self):
|
||||||
|
return [nonproj.deprojectivize]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
|
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
|
||||||
'BeamEntityRecognizer', 'TokenVectorEnoder']
|
'BeamEntityRecognizer', 'TokenVectorEnoder']
|
||||||
|
|
|
@ -779,6 +779,14 @@ cdef class Parser:
|
||||||
for i in range(doc.length):
|
for i in range(doc.length):
|
||||||
doc.c[i] = state.c._sent[i]
|
doc.c[i] = state.c._sent[i]
|
||||||
self.moves.finalize_doc(doc)
|
self.moves.finalize_doc(doc)
|
||||||
|
for hook in self.postprocesses:
|
||||||
|
for doc in docs:
|
||||||
|
hook(doc)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def postprocesses(self):
|
||||||
|
# Available for subclasses, e.g. to deprojectivize
|
||||||
|
return []
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
for action in self.moves.action_types:
|
for action in self.moves.action_types:
|
||||||
|
|
|
@ -58,8 +58,9 @@ def en_vocab():
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def en_parser():
|
def en_parser(en_vocab):
|
||||||
return util.get_lang_class('en').Defaults.create_parser()
|
nlp = util.get_lang_class('en')(en_vocab)
|
||||||
|
return nlp.create_pipe('parser')
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import spacy
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@pytest.mark.models
|
|
||||||
def test_beam_parse():
|
|
||||||
nlp = spacy.load('en_core_web_sm')
|
|
||||||
doc = nlp(u'Australia is a country', disable=['ner'])
|
|
||||||
ents = nlp.entity(doc, beam_width=2)
|
|
||||||
print(ents)
|
|
||||||
|
|
||||||
|
@pytest.mark.models('en')
|
||||||
|
def test_beam_parse(EN):
|
||||||
|
doc = EN(u'Australia is a country', disable=['ner'])
|
||||||
|
ents = EN.entity(doc, beam_width=2)
|
||||||
|
print(ents)
|
||||||
|
|
0
spacy/tests/pipeline/__init__.py
Normal file
0
spacy/tests/pipeline/__init__.py
Normal file
84
spacy/tests/pipeline/test_pipe_methods.py
Normal file
84
spacy/tests/pipeline/test_pipe_methods.py
Normal file
|
@ -0,0 +1,84 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...language import Language
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def nlp():
|
||||||
|
return Language()
|
||||||
|
|
||||||
|
|
||||||
|
def new_pipe(doc):
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_pipe_no_name(nlp):
|
||||||
|
nlp.add_pipe(new_pipe)
|
||||||
|
assert 'new_pipe' in nlp.pipe_names
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_pipe_duplicate_name(nlp):
|
||||||
|
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.add_pipe(new_pipe, name='duplicate_name')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name', ['parser'])
|
||||||
|
def test_add_pipe_first(nlp, name):
|
||||||
|
nlp.add_pipe(new_pipe, name=name, first=True)
|
||||||
|
assert nlp.pipeline[0][0] == name
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
|
||||||
|
def test_add_pipe_last(nlp, name1, name2):
|
||||||
|
nlp.add_pipe(lambda doc: doc, name=name2)
|
||||||
|
nlp.add_pipe(new_pipe, name=name1, last=True)
|
||||||
|
assert nlp.pipeline[0][0] != name1
|
||||||
|
assert nlp.pipeline[-1][0] == name1
|
||||||
|
|
||||||
|
|
||||||
|
def test_cant_add_pipe_first_and_last(nlp):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.add_pipe(new_pipe, first=True, last=True)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name', ['my_component'])
|
||||||
|
def test_get_pipe(nlp, name):
|
||||||
|
with pytest.raises(KeyError):
|
||||||
|
nlp.get_pipe(name)
|
||||||
|
nlp.add_pipe(new_pipe, name=name)
|
||||||
|
assert nlp.get_pipe(name) == new_pipe
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
|
||||||
|
def test_replace_pipe(nlp, name, replacement):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.replace_pipe(name, new_pipe)
|
||||||
|
nlp.add_pipe(new_pipe, name=name)
|
||||||
|
nlp.replace_pipe(name, replacement)
|
||||||
|
assert nlp.get_pipe(name) != new_pipe
|
||||||
|
assert nlp.get_pipe(name) == replacement
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
|
||||||
|
def test_rename_pipe(nlp, old_name, new_name):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.rename_pipe(old_name, new_name)
|
||||||
|
nlp.add_pipe(new_pipe, name=old_name)
|
||||||
|
nlp.rename_pipe(old_name, new_name)
|
||||||
|
assert nlp.pipeline[0][0] == new_name
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('name', ['my_component'])
|
||||||
|
def test_remove_pipe(nlp, name):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
nlp.remove_pipe(name)
|
||||||
|
nlp.add_pipe(new_pipe, name=name)
|
||||||
|
assert len(nlp.pipeline) == 1
|
||||||
|
removed_name, removed_component = nlp.remove_pipe(name)
|
||||||
|
assert not len(nlp.pipeline)
|
||||||
|
assert removed_name == name
|
||||||
|
assert removed_component == new_pipe
|
|
@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
||||||
if not meta:
|
if not meta:
|
||||||
meta = get_model_meta(model_path)
|
meta = get_model_meta(model_path)
|
||||||
cls = get_lang_class(meta['lang'])
|
cls = get_lang_class(meta['lang'])
|
||||||
nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
|
nlp = cls(meta=meta, **overrides)
|
||||||
|
pipeline = meta.get('pipeline', [])
|
||||||
|
disable = overrides.get('disable', [])
|
||||||
|
if pipeline is True:
|
||||||
|
pipeline = nlp.Defaults.pipe_names
|
||||||
|
elif pipeline in (False, None):
|
||||||
|
pipeline = []
|
||||||
|
for name in pipeline:
|
||||||
|
if name not in disable:
|
||||||
|
config = meta.get('pipeline_args', {}).get(name, {})
|
||||||
|
component = nlp.create_pipe(name, config=config)
|
||||||
|
nlp.add_pipe(component, name=name)
|
||||||
return nlp.from_disk(model_path)
|
return nlp.from_disk(model_path)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap)
|
||||||
|
|
||||||
//- Code blocks to display old/new versions
|
//- Code blocks to display old/new versions
|
||||||
|
|
||||||
|
mixin code-compare()
|
||||||
|
span.u-inline-block.u-padding-top.u-width-full
|
||||||
|
block
|
||||||
|
|
||||||
mixin code-old()
|
mixin code-old()
|
||||||
+code(false, false, false, false, "reject").o-block-small
|
+code(false, false, false, false, "reject").o-block-small
|
||||||
block
|
block
|
||||||
|
|
|
@ -43,6 +43,20 @@ p
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell A #[code Language] object with the loaded model.
|
+cell A #[code Language] object with the loaded model.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Essentially, #[code spacy.load()] is a convenience wrapper that reads
|
||||||
|
| the language ID and pipeline components from a model's #[code meta.json],
|
||||||
|
| initialises the #[code Language] class, loads in the model data and
|
||||||
|
| returns it.
|
||||||
|
|
||||||
|
+code("Abstract example").
|
||||||
|
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
|
||||||
|
nlp = cls() # initialise the language
|
||||||
|
for name in pipeline:
|
||||||
|
component = nlp.create_pipe(name) # create each pipeline component
|
||||||
|
nlp.add_pipe(component) # add component to pipeline
|
||||||
|
nlp.from_disk(model_data_path) # load in model data
|
||||||
|
|
||||||
+infobox("Deprecation note", "⚠️")
|
+infobox("Deprecation note", "⚠️")
|
||||||
.o-block
|
.o-block
|
||||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||||
|
@ -141,37 +155,3 @@ p
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell The explanation, or #[code None] if not found in the glossary.
|
+cell The explanation, or #[code None] if not found in the glossary.
|
||||||
|
|
||||||
+h(3, "spacy.set_factory") spacy.set_factory
|
|
||||||
+tag function
|
|
||||||
+tag-new(2)
|
|
||||||
|
|
||||||
p
|
|
||||||
| Set a factory that returns a custom
|
|
||||||
| #[+a("/usage/processing-pipelines") processing pipeline]
|
|
||||||
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
def my_factory(vocab):
|
|
||||||
def my_component(doc):
|
|
||||||
return doc
|
|
||||||
return my_component
|
|
||||||
|
|
||||||
spacy.set_factory('my_factory', my_factory)
|
|
||||||
nlp = Language(pipeline=['my_factory'])
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code factory_id]
|
|
||||||
+cell unicode
|
|
||||||
+cell
|
|
||||||
| Unique name of factory. If added to a new pipeline, spaCy will
|
|
||||||
| look up the factory for this ID and use it to create the
|
|
||||||
| component.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code factory]
|
|
||||||
+cell callable
|
|
||||||
+cell
|
|
||||||
| Callable that takes a #[code Vocab] object and returns a pipeline
|
|
||||||
| component.
|
|
||||||
|
|
|
@ -4,7 +4,14 @@ include ../_includes/_mixins
|
||||||
|
|
||||||
p
|
p
|
||||||
| Usually you'll load this once per process as #[code nlp] and pass the
|
| Usually you'll load this once per process as #[code nlp] and pass the
|
||||||
| instance around your application.
|
| instance around your application. The #[code Language] class is created
|
||||||
|
| when you call #[+api("spacy#load") #[code spacy.load()]] and contains
|
||||||
|
| the shared vocabulary and #[+a("/usage/adding-languages") language data],
|
||||||
|
| optional model data loaded from a #[+a("/models") model package] or
|
||||||
|
| a path, and a #[+a("/usage/processing-pipelines") processing pipeline]
|
||||||
|
| containing components like the tagger or parser that are called on a
|
||||||
|
| document in order. You can also add your own processing pipeline
|
||||||
|
| components that take a #[code Doc] object, modify it and return it.
|
||||||
|
|
||||||
+h(2, "init") Language.__init__
|
+h(2, "init") Language.__init__
|
||||||
+tag method
|
+tag method
|
||||||
|
@ -12,9 +19,9 @@ p
|
||||||
p Initialise a #[code Language] object.
|
p Initialise a #[code Language] object.
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
|
from spacy.vocab import Vocab
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
nlp = Language(Vocab())
|
||||||
'dependencies'])
|
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
@ -34,14 +41,6 @@ p Initialise a #[code Language] object.
|
||||||
| A function that takes text and returns a #[code Doc] object.
|
| A function that takes text and returns a #[code Doc] object.
|
||||||
| Usually a #[code Tokenizer].
|
| Usually a #[code Tokenizer].
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code pipeline]
|
|
||||||
+cell list
|
|
||||||
+cell
|
|
||||||
| A list of annotation processes or IDs of annotation, processes,
|
|
||||||
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
|
||||||
| up in #[code Language.Defaults.factories].
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code meta]
|
+cell #[code meta]
|
||||||
+cell dict
|
+cell dict
|
||||||
|
@ -235,7 +234,6 @@ p
|
||||||
| Can be called before training to pre-process gold data. By default, it
|
| Can be called before training to pre-process gold data. By default, it
|
||||||
| handles nonprojectivity and adds missing tags to the tag map.
|
| handles nonprojectivity and adds missing tags to the tag map.
|
||||||
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code docs_golds]
|
+cell #[code docs_golds]
|
||||||
|
@ -247,6 +245,177 @@ p
|
||||||
+cell tuple
|
+cell tuple
|
||||||
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
|
||||||
|
|
||||||
|
+h(2, "create_pipe") Language.create_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p Create a pipeline component from a factory.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
parser = nlp.create_pipe('parser')
|
||||||
|
nlp.add_pipe(parser)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell
|
||||||
|
| Factory name to look up in
|
||||||
|
| #[+api("language#class-attributes") #[code Language.factories]].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code config]
|
||||||
|
+cell dict
|
||||||
|
+cell Configuration parameters to initialise component.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell callable
|
||||||
|
+cell The pipeline component.
|
||||||
|
|
||||||
|
+h(2, "add_pipe") Language.add_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Add a component to the processing pipeline. Valid components are
|
||||||
|
| callables that take a #[code Doc] object, modify it and return it. Only
|
||||||
|
| one of #[code before], #[code after], #[code first] or #[code last] can
|
||||||
|
| be set. Default behaviour is #[code last=True].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
def component(doc):
|
||||||
|
# modify Doc and return it
|
||||||
|
return doc
|
||||||
|
|
||||||
|
nlp.add_pipe(component, before='ner')
|
||||||
|
nlp.add_pipe(component, name='custom_name', last=True)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code component]
|
||||||
|
+cell callable
|
||||||
|
+cell The pipeline component.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell
|
||||||
|
| Name of pipeline component. Overwrites existing
|
||||||
|
| #[code component.name] attribute if available. If no #[code name]
|
||||||
|
| is set and the component exposes no name attribute,
|
||||||
|
| #[code component.__name__] is used. An error is raised if the
|
||||||
|
| name already exists in the pipeline.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code before]
|
||||||
|
+cell unicode
|
||||||
|
+cell Component name to insert component directly before.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code after]
|
||||||
|
+cell unicode
|
||||||
|
+cell Component name to insert component directly after:
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code first]
|
||||||
|
+cell bool
|
||||||
|
+cell Insert component first / not first in the pipeline.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code last]
|
||||||
|
+cell bool
|
||||||
|
+cell Insert component last / not last in the pipeline.
|
||||||
|
|
||||||
|
+h(2, "get_pipe") Language.get_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p Get a pipeline component for a given component name.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
parser = nlp.get_pipe('parser')
|
||||||
|
custom_component = nlp.get_pipe('custom_component')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell Name of the pipeline component to get.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell callable
|
||||||
|
+cell The pipeline component.
|
||||||
|
|
||||||
|
+h(2, "replace_pipe") Language.replace_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p Replace a component in the pipeline.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp.replace_pipe('parser', my_custom_parser)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell Name of the component to replace.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code component]
|
||||||
|
+cell callable
|
||||||
|
+cell The pipeline component to inser.
|
||||||
|
|
||||||
|
|
||||||
|
+h(2, "rename_pipe") Language.rename_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Rename a component in the pipeline. Useful to create custom names for
|
||||||
|
| pre-defined and pre-loaded components. To change the default name of
|
||||||
|
| a component added to the pipeline, you can also use the #[code name]
|
||||||
|
| argument on #[+api("language#add_pipe") #[code add_pipe]].
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp.rename_pipe('parser', 'spacy_parser')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code old_name]
|
||||||
|
+cell unicode
|
||||||
|
+cell Name of the component to rename.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code new_name]
|
||||||
|
+cell unicode
|
||||||
|
+cell New name of the component.
|
||||||
|
|
||||||
|
+h(2, "remove_pipe") Language.remove_pipe
|
||||||
|
+tag method
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Remove a component from the pipeline. Returns the removed component name
|
||||||
|
| and component function.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
name, component = nlp.remove_pipe('parser')
|
||||||
|
assert name == 'parser'
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code name]
|
||||||
|
+cell unicode
|
||||||
|
+cell Name of the component to remove.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell tuple
|
||||||
|
+cell A #[code (name, component)] tuple of the removed component.
|
||||||
|
|
||||||
+h(2, "to_disk") Language.to_disk
|
+h(2, "to_disk") Language.to_disk
|
||||||
+tag method
|
+tag method
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|
@ -399,7 +568,15 @@ p Load state from a binary string.
|
||||||
+row
|
+row
|
||||||
+cell #[code pipeline]
|
+cell #[code pipeline]
|
||||||
+cell list
|
+cell list
|
||||||
+cell Sequence of annotation functions.
|
+cell
|
||||||
|
| List of #[code (name, component)] tuples describing the current
|
||||||
|
| processing pipeline, in order.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code pipe_names]
|
||||||
|
+tag-new(2)
|
||||||
|
+cell list
|
||||||
|
+cell List of pipeline component names, in order.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code meta]
|
+cell #[code meta]
|
||||||
|
@ -424,3 +601,12 @@ p Load state from a binary string.
|
||||||
+cell
|
+cell
|
||||||
| Two-letter language ID, i.e.
|
| Two-letter language ID, i.e.
|
||||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code factories]
|
||||||
|
+tag-new(2)
|
||||||
|
+cell dict
|
||||||
|
+cell
|
||||||
|
| Factories that create pre-defined pipeline components, e.g. the
|
||||||
|
| tagger, parser or entity recognizer, keyed by their component
|
||||||
|
| name.
|
||||||
|
|
|
@ -143,6 +143,9 @@
|
||||||
|
|
||||||
//- Layout
|
//- Layout
|
||||||
|
|
||||||
|
.u-width-full
|
||||||
|
width: 100%
|
||||||
|
|
||||||
.u-float-left
|
.u-float-left
|
||||||
float: left
|
float: left
|
||||||
margin-right: 1rem
|
margin-right: 1rem
|
||||||
|
@ -166,6 +169,9 @@
|
||||||
.u-padding-medium
|
.u-padding-medium
|
||||||
padding: 1.8rem
|
padding: 1.8rem
|
||||||
|
|
||||||
|
.u-padding-top
|
||||||
|
padding-top: 2rem
|
||||||
|
|
||||||
.u-inline-block
|
.u-inline-block
|
||||||
display: inline-block
|
display: inline-block
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
display: inline-block
|
display: inline-block
|
||||||
font-size: 0.6em
|
font-size: 0.6em
|
||||||
font-weight: bold
|
font-weight: bold
|
||||||
padding-right: 1.25rem
|
padding-right: 1em
|
||||||
margin-left: -3.75rem
|
margin-left: -3.75rem
|
||||||
text-align: right
|
text-align: right
|
||||||
width: 2.5rem
|
width: 2.5rem
|
||||||
|
|
|
@ -103,11 +103,11 @@
|
||||||
"title": "Language Processing Pipelines",
|
"title": "Language Processing Pipelines",
|
||||||
"next": "vectors-similarity",
|
"next": "vectors-similarity",
|
||||||
"menu": {
|
"menu": {
|
||||||
"How pipelines work": "pipelines",
|
"How Pipelines Work": "pipelines",
|
||||||
"Examples": "examples",
|
"Custom Components": "custom-components",
|
||||||
"Multi-threading": "multithreading",
|
"Multi-threading": "multithreading",
|
||||||
"User Hooks": "user-hooks",
|
"Serialization": "serialization",
|
||||||
"Serialization": "serialization"
|
"Developing Extensions": "extensions"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
151
website/usage/_processing-pipelines/_custom-components.jade
Normal file
151
website/usage/_processing-pipelines/_custom-components.jade
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
|
||||||
|
|
||||||
|
p
|
||||||
|
| A component receives a #[code Doc] object and
|
||||||
|
| #[strong performs the actual processing] – for example, using the current
|
||||||
|
| weights to make a prediction and set some annotation on the document. By
|
||||||
|
| adding a component to the pipeline, you'll get access to the #[code Doc]
|
||||||
|
| at any point #[strong during] processing – instead of only being able to
|
||||||
|
| modify it afterwards.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
def my_component(doc):
|
||||||
|
# do something to the doc here
|
||||||
|
return doc
|
||||||
|
|
||||||
|
+table(["Argument", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code doc]
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell The #[code Doc] object processed by the previous component.
|
||||||
|
|
||||||
|
+row("foot")
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell The #[code Doc] object processed by this pipeline component.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Custom components can be added to the pipeline using the
|
||||||
|
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
|
||||||
|
| can either specify a component to add it before or after, tell spaCy
|
||||||
|
| to add it first or last in the pipeline, or define a custom name.
|
||||||
|
| If no name is set and no #[code name] attribute is present on your
|
||||||
|
| component, the function name, e.g. #[code component.__name__] is used.
|
||||||
|
|
||||||
|
+code("Adding pipeline components").
|
||||||
|
def my_component(doc):
|
||||||
|
print("After tokenization, this doc has %s tokens." % len(doc))
|
||||||
|
if len(doc) < 10:
|
||||||
|
print("This is a pretty short document.")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
nlp = spacy.load('en')
|
||||||
|
nlp.pipeline.add_pipe(my_component, name='print_info', first=True)
|
||||||
|
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
|
||||||
|
doc = nlp(u"This is a sentence.")
|
||||||
|
|
||||||
|
p
|
||||||
|
| Of course, you can also wrap your component as a class to allow
|
||||||
|
| initialising it with custom settings and hold state within the component.
|
||||||
|
| This is useful for #[strong stateful components], especially ones which
|
||||||
|
| #[strong depend on shared data].
|
||||||
|
|
||||||
|
+code.
|
||||||
|
class MyComponent(object):
|
||||||
|
name = 'print_info'
|
||||||
|
|
||||||
|
def __init__(vocab, short_limit=10):
|
||||||
|
self.vocab = nlp.vocab
|
||||||
|
self.short_limit = short_limit
|
||||||
|
|
||||||
|
def __call__(doc):
|
||||||
|
if len(doc) < self.short_limit:
|
||||||
|
print("This is a pretty short document.")
|
||||||
|
return doc
|
||||||
|
|
||||||
|
my_component = MyComponent(nlp.vocab, short_limit=25)
|
||||||
|
nlp.add_pipe(my_component, first=True)
|
||||||
|
|
||||||
|
+h(3, "custom-components-attributes")
|
||||||
|
| Setting attributes on the #[code Doc], #[code Span] and #[code Token]
|
||||||
|
|
||||||
|
+aside("Why ._?")
|
||||||
|
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly
|
||||||
|
| keeps a clearer separation and makes it easier to ensure backwards
|
||||||
|
| compatibility. For example, if you've implemented your own #[code .coref]
|
||||||
|
| property and spaCy claims it one day, it'll break your code. Similarly,
|
||||||
|
| just by looking at the code, you'll immediately know what's built-in and
|
||||||
|
| what's custom – for example, #[code doc.sentiment] is spaCy, while
|
||||||
|
| #[code doc._.sent_score] isn't.
|
||||||
|
|
||||||
|
+under-construction
|
||||||
|
|
||||||
|
+h(3, "custom-components-user-hooks") Other user hooks
|
||||||
|
|
||||||
|
p
|
||||||
|
| While it's generally recommended to use the #[code Doc._], #[code Span._]
|
||||||
|
| and #[code Token._] proxies to add your own custom attributes, spaCy
|
||||||
|
| offers a few exceptions to allow #[strong customising the built-in methods]
|
||||||
|
| like #[+api("doc#similarity") #[code Doc.similarity]] or
|
||||||
|
| #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can
|
||||||
|
| rely on statistical models you train yourself. For instance, you can
|
||||||
|
| provide your own on-the-fly sentence segmentation algorithm or document
|
||||||
|
| similarity method.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Hooks let you customize some of the behaviours of the #[code Doc],
|
||||||
|
| #[code Span] or #[code Token] objects by adding a component to the
|
||||||
|
| pipeline. For instance, to customize the
|
||||||
|
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
|
||||||
|
| component that sets a custom function to
|
||||||
|
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
|
||||||
|
| method will check the #[code user_hooks] dict, and delegate to your
|
||||||
|
| function if you've set one. Similar results can be achieved by setting
|
||||||
|
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
|
||||||
|
|
||||||
|
+aside("Implementation note")
|
||||||
|
| The hooks live on the #[code Doc] object because the #[code Span] and
|
||||||
|
| #[code Token] objects are created lazily, and don't own any data. They
|
||||||
|
| just proxy to their parent #[code Doc]. This turns out to be convenient
|
||||||
|
| here — we only have to worry about installing hooks in one place.
|
||||||
|
|
||||||
|
+table(["Name", "Customises"])
|
||||||
|
+row
|
||||||
|
+cell #[code user_hooks]
|
||||||
|
+cell
|
||||||
|
+api("doc#vector") #[code Doc.vector]
|
||||||
|
+api("doc#has_vector") #[code Doc.has_vector]
|
||||||
|
+api("doc#vector_norm") #[code Doc.vector_norm]
|
||||||
|
+api("doc#sents") #[code Doc.sents]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code user_token_hooks]
|
||||||
|
+cell
|
||||||
|
+api("token#similarity") #[code Token.similarity]
|
||||||
|
+api("token#vector") #[code Token.vector]
|
||||||
|
+api("token#has_vector") #[code Token.has_vector]
|
||||||
|
+api("token#vector_norm") #[code Token.vector_norm]
|
||||||
|
+api("token#conjuncts") #[code Token.conjuncts]
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code user_span_hooks]
|
||||||
|
+cell
|
||||||
|
+api("span#similarity") #[code Span.similarity]
|
||||||
|
+api("span#vector") #[code Span.vector]
|
||||||
|
+api("span#has_vector") #[code Span.has_vector]
|
||||||
|
+api("span#vector_norm") #[code Span.vector_norm]
|
||||||
|
+api("span#root") #[code Span.root]
|
||||||
|
|
||||||
|
+code("Add custom similarity hooks").
|
||||||
|
class SimilarityModel(object):
|
||||||
|
def __init__(self, model):
|
||||||
|
self._model = model
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
doc.user_hooks['similarity'] = self.similarity
|
||||||
|
doc.user_span_hooks['similarity'] = self.similarity
|
||||||
|
doc.user_token_hooks['similarity'] = self.similarity
|
||||||
|
|
||||||
|
def similarity(self, obj1, obj2):
|
||||||
|
y = self._model([obj1.vector, obj2.vector])
|
||||||
|
return float(y[0])
|
3
website/usage/_processing-pipelines/_extensions.jade
Normal file
3
website/usage/_processing-pipelines/_extensions.jade
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
|
||||||
|
|
||||||
|
+under-construction
|
|
@ -11,7 +11,7 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| When you load a model, spaCy first consults the model's
|
| When you load a model, spaCy first consults the model's
|
||||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. The
|
| #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The
|
||||||
| meta typically includes the model details, the ID of a language class,
|
| meta typically includes the model details, the ID of a language class,
|
||||||
| and an optional list of pipeline components. spaCy then does the
|
| and an optional list of pipeline components. spaCy then does the
|
||||||
| following:
|
| following:
|
||||||
|
@@ -21,24 +21,26 @@
        "name": "example_model",
        "lang": "en",
        "description": "Example model for spaCy",
-       "pipeline": ["tensorizer", "tagger"]
+       "pipeline": ["tagger", "parser"]
    }

+list("numbers")
-   +item
-       | Look up #[strong pipeline IDs] in the available
-       | #[strong pipeline factories].
-   +item
-       | Initialise the #[strong pipeline components] by calling their
-       | factories with the #[code Vocab] as an argument. This gives each
-       | factory and component access to the pipeline's shared data, like
-       | strings, morphology and annotation scheme.
    +item
        | Load the #[strong language class and data] for the given ID via
-       | #[+api("util.get_lang_class") #[code get_lang_class]].
+       | #[+api("util.get_lang_class") #[code get_lang_class]] and initialise
+       | it. The #[code Language] class contains the shared vocabulary,
+       | tokenization rules and the language-specific annotation scheme.
    +item
-       | Pass the path to the #[strong model data] to the #[code Language]
-       | class and return it.
+       | Iterate over the #[strong pipeline names] and create each component
+       | using #[+api("language#create_pipe") #[code create_pipe]], which
+       | looks them up in #[code Language.factories].
+   +item
+       | Add each pipeline component to the pipeline in order, using
+       | #[+api("language#add_pipe") #[code add_pipe]].
+   +item
+       | Make the #[strong model data] available to the #[code Language] class
+       | by calling #[+api("language#from_disk") #[code from_disk]] with the
+       | path to the model data directory.

p
    | So when you call this...
@@ -47,12 +49,12 @@ p
    nlp = spacy.load('en')

p
-   | ... the model tells spaCy to use the pipeline
+   | ... the model tells spaCy to use the language #[code "en"] and the pipeline
    | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
-   | then look up each string in its internal factories registry and
-   | initialise the individual components. It'll then load
-   | #[code spacy.lang.en.English], pass it the path to the model's data
-   | directory, and return it for you to use as the #[code nlp] object.
+   | then initialise #[code spacy.lang.en.English], create each pipeline
+   | component and add it to the processing pipeline. It'll then load in the
+   | model's data from its data directory and return the modified
+   | #[code Language] class for you to use as the #[code nlp] object.

p
    | Fundamentally, a #[+a("/models") spaCy model] consists of three
@@ -74,8 +76,11 @@
    data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'

    cls = spacy.util.get_lang_class(lang)   # 1. get Language instance, e.g. English()
-   nlp = cls(pipeline=pipeline)             # 2. initialise it with the pipeline
-   nlp.from_disk(model_data_path)           # 3. load in the binary data
+   nlp = cls()                              # 2. initialise it
+   for name in pipeline:
+       component = nlp.create_pipe(name)    # 3. create the pipeline components
+       nlp.add_pipe(component)              # 4. add the component to the pipeline
+   nlp.from_disk(model_data_path)           # 5. load in the binary data

p
    | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
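The lang and pipeline variables used above come from the model's meta.json (the excerpt shown earlier). The hunk doesn't show how they're read; a minimal sketch, assuming the meta file sits in the model data directory referenced by data_path:

    import json
    from pathlib import Path

    # Read the fields from the meta.json example above; the exact path layout
    # is an assumption for illustration.
    with Path(data_path, 'meta.json').open(encoding='utf8') as file_:
        meta = json.load(file_)

    lang = meta['lang']          # e.g. 'en'
    pipeline = meta['pipeline']  # e.g. ['tagger', 'parser']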
@@ -87,124 +92,23 @@ p
    | document, which is then processed by the component next in the pipeline.

+code("The pipeline under the hood").
-   doc = nlp.make_doc(u'This is a sentence')
-   for proc in nlp.pipeline:
-       doc = proc(doc)
+   doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text
+   for name, proc in nlp.pipeline:             # iterate over components in order
+       doc = proc(doc)                         # apply each component

- +h(3, "creating") Creating pipeline components and factories
p
-   | spaCy lets you customise the pipeline with your own components. Components
-   | are functions that receive a #[code Doc] object, modify and return it.
-   | If your component is stateful, you'll want to create a new one for each
-   | pipeline. You can do that by defining and registering a factory which
-   | receives the shared #[code Vocab] object and returns a component.
+   | The current processing pipeline is available as #[code nlp.pipeline],
+   | which returns a list of #[code (name, component)] tuples, or
+   | #[code nlp.pipe_names], which only returns a list of human-readable
+   | component names.
+h(4, "creating-component") Creating a component
|
|
||||||
|
|
||||||
p
|
|
||||||
| A component receives a #[code Doc] object and
|
|
||||||
| #[strong performs the actual processing] – for example, using the current
|
|
||||||
| weights to make a prediction and set some annotation on the document. By
|
|
||||||
| adding a component to the pipeline, you'll get access to the #[code Doc]
|
|
||||||
| at any point #[strong during] processing – instead of only being able to
|
|
||||||
| modify it afterwards.
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
def my_component(doc):
|
|
||||||
# do something to the doc here
|
|
||||||
return doc
|
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code doc]
|
|
||||||
+cell #[code Doc]
|
|
||||||
+cell The #[code Doc] object processed by the previous component.
|
|
||||||
|
|
||||||
+row("foot")
|
|
||||||
+cell returns
|
|
||||||
+cell #[code Doc]
|
|
||||||
+cell The #[code Doc] object processed by this pipeline component.
|
|
||||||
|
|
||||||
p
|
|
||||||
| When creating a new #[code Language] class, you can pass it a list of
|
|
||||||
| pipeline component functions to execute in that order. You can also
|
|
||||||
| add it to an existing pipeline by modifying #[code nlp.pipeline] – just
|
|
||||||
| be careful not to overwrite a pipeline or its components by accident!
|
|
||||||
|
|
||||||
+code.
-   # Create a new Language object with a pipeline
-   from spacy.language import Language
-   nlp = Language(pipeline=[my_component])
-
-   # Modify an existing pipeline
-   nlp = spacy.load('en')
-   nlp.pipeline.append(my_component)
+   nlp.pipeline
+   # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+   nlp.pipe_names
+   # ['tagger', 'parser', 'ner']

+ +h(3, "disabling") Disabling and modifying pipeline components
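The factory-based instructions removed below are superseded by the add_pipe method referenced throughout this commit; the replacement custom-components docs aren't part of this excerpt. A minimal sketch of adding a custom component in the new style, assuming the usual Doc-in, Doc-out contract (the derived component name shown in the comment is an assumption):

    def my_component(doc):
        # set custom annotations on the doc here
        return doc

    nlp = spacy.load('en')
    nlp.add_pipe(my_component)  # appended to the end of the pipeline
    print(nlp.pipe_names)       # e.g. [..., 'my_component'] – exact name is an assumption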
+h(4, "creating-factory") Creating a factory
|
|
||||||
|
|
||||||
p
|
|
||||||
| A factory is a #[strong function that returns a pipeline component].
|
|
||||||
| It's called with the #[code Vocab] object, to give it access to the
|
|
||||||
| shared data between components – for example, the strings, morphology,
|
|
||||||
| vectors or annotation scheme. Factories are useful for creating
|
|
||||||
| #[strong stateful components], especially ones which
|
|
||||||
| #[strong depend on shared data].
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
def my_factory(vocab):
|
|
||||||
# load some state
|
|
||||||
def my_component(doc):
|
|
||||||
# process the doc
|
|
||||||
return doc
|
|
||||||
return my_component
|
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code vocab]
|
|
||||||
+cell #[code Vocab]
|
|
||||||
+cell
|
|
||||||
| Shared data between components, including strings, morphology,
|
|
||||||
| vectors etc.
|
|
||||||
|
|
||||||
+row("foot")
|
|
||||||
+cell returns
|
|
||||||
+cell callable
|
|
||||||
+cell The pipeline component.
|
|
||||||
|
|
||||||
p
|
|
||||||
| By creating a factory, you're essentially telling spaCy how to get the
|
|
||||||
| pipeline component #[strong once the vocab is available]. Factories need to
|
|
||||||
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
|
|
||||||
| by assigning them a unique ID. This ID can be added to the pipeline as a
|
|
||||||
| string. When creating a pipeline, you're free to mix strings and
|
|
||||||
| callable components:
|
|
||||||
|
|
||||||
+code.
|
|
||||||
spacy.set_factory('my_factory', my_factory)
|
|
||||||
nlp = Language(pipeline=['my_factory', my_other_component])
|
|
||||||
|
|
||||||
p
|
|
||||||
| If spaCy comes across a string in the pipeline, it will try to resolve it
|
|
||||||
| by looking it up in the available factories. The factory will then be
|
|
||||||
| initialised with the #[code Vocab]. Providing factory names instead of
|
|
||||||
| callables also makes it easy to specify them in the model's
|
|
||||||
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
|
|
||||||
| training your own model and want to use one of spaCy's default components,
|
|
||||||
| you won't have to worry about finding and implementing it either – to use
|
|
||||||
| the default tagger, simply add #[code "tagger"] to the pipeline, and
|
|
||||||
| #[strong spaCy will know what to do].
|
|
||||||
|
|
||||||
+infobox("Important note")
|
|
||||||
| Because factories are #[strong resolved on initialisation] of the
|
|
||||||
| #[code Language] class, it's #[strong not possible] to add them to the
|
|
||||||
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
|
|
||||||
| works with individual component functions. To use factories, you need to
|
|
||||||
| create a new #[code Language] object, or generate a
|
|
||||||
| #[+a("/usage/training#models-generating") model package] with
|
|
||||||
| a custom pipeline.
|
|
||||||
|
|
||||||
+h(3, "disabling") Disabling pipeline components
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you don't need a particular component of the pipeline – for
|
| If you don't need a particular component of the pipeline – for
|
||||||
|
@@ -217,16 +121,19 @@
+code.
    nlp = spacy.load('en', disable=['parser', 'tagger'])
    nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
-   doc = nlp(u"I don't want parsed", disable=['parser'])

p
-   | Note that you can't write directly to #[code nlp.pipeline], as this list
-   | holds the #[em actual components], not the IDs. However, if you know the
-   | order of the components, you can still slice the list:
+   | You can also use the #[+api("language#remove_pipe") #[code remove_pipe]]
+   | method to remove pipeline components from an existing pipeline, the
+   | #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them,
+   | or the #[+api("language#replace_pipe") #[code replace_pipe]] method
+   | to replace them with a custom component entirely (more details on this
+   | in the section on #[+a("#custom-components") custom components]).

+code.
-   nlp = spacy.load('en')
-   nlp.pipeline = nlp.pipeline[:2]   # only use the first two components
+   nlp.remove_pipe('parser')
+   nlp.rename_pipe('ner', 'entityrecognizer')
+   nlp.replace_pipe('tagger', my_custom_tagger)

+infobox("Important note: disabling pipeline components")
    .o-block
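my_custom_tagger in the snippet above isn't defined anywhere in this hunk; a minimal hypothetical stand-in, relying only on the component contract described in these docs (a callable that receives a Doc, modifies it and returns it):

    def my_custom_tagger(doc):
        # hypothetical replacement component: set annotations on the doc here,
        # e.g. by calling out to an external tagger
        return doc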
@@ -234,12 +141,14 @@
    | processing pipeline components, the #[code parser], #[code tagger]
    | and #[code entity] keyword arguments have been replaced with
    | #[code disable], which takes a list of pipeline component names.
-   | This lets you disable both default and custom components when loading
+   | This lets you disable pre-defined components when loading
    | a model, or initialising a Language class via
    | #[+api("language#from_disk") #[code from_disk]].

+code-new.
-   nlp = spacy.load('en', disable=['tagger', 'ner'])
-   doc = nlp(u"I don't want parsed", disable=['parser'])
+   nlp = spacy.load('en', disable=['ner'])
+   nlp.remove_pipe('parser')
+   doc = nlp(u"I don't want parsed")

+code-old.
    nlp = spacy.load('en', tagger=False, entity=False)
    doc = nlp(u"I don't want parsed", parse=False)
@@ -21,7 +21,7 @@ p
+code.
    import spacy
-   from spacy.tokens import Span
+   from spacy.tokens.span import Span

    text = u'Netflix is hiring a new VP of global policy'
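Only the imports and the example text survive in this hunk; the rest of the example presumably attaches an entity span for "Netflix" by hand. A minimal sketch under that assumption (variable names are illustrative):

    nlp = spacy.load('en')
    doc = nlp(text)
    ORG = doc.vocab.strings[u'ORG']        # look up the label's integer ID
    netflix = Span(doc, 0, 1, label=ORG)   # token 0 is "Netflix"
    doc.ents = list(doc.ents) + [netflix]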
@@ -1,61 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS

p
    | Hooks let you customize some of the behaviours of the #[code Doc],
    | #[code Span] or #[code Token] objects by adding a component to the
    | pipeline. For instance, to customize the
    | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
    | component that sets a custom function to
    | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
    | method will check the #[code user_hooks] dict, and delegate to your
    | function if you've set one. Similar results can be achieved by setting
    | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].

+code("Polymorphic similarity example").
    span.similarity(doc)
    token.similarity(span)
    doc1.similarity(doc2)

p
    | By default, this just averages the vectors for each document, and
    | computes their cosine. Obviously, spaCy should make it easy for you to
    | install your own similarity model. This introduces a tricky design
    | challenge. The current solution is to add three more dicts to the
    | #[code Doc] object:

+aside("Implementation note")
    | The hooks live on the #[code Doc] object because the #[code Span] and
    | #[code Token] objects are created lazily, and don't own any data. They
    | just proxy to their parent #[code Doc]. This turns out to be convenient
    | here — we only have to worry about installing hooks in one place.

+table(["Name", "Description"])
    +row
        +cell #[code user_hooks]
        +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]

    +row
        +cell #[code user_token_hooks]
        +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]

    +row
        +cell #[code user_span_hooks]
        +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]

p
    | To sum up, here's an example of hooking in custom #[code .similarity()]
    | methods:

+code("Add custom similarity hooks").
    class SimilarityModel(object):
        def __init__(self, model):
            self._model = model

        def __call__(self, doc):
            doc.user_hooks['similarity'] = self.similarity
            doc.user_span_hooks['similarity'] = self.similarity
            doc.user_token_hooks['similarity'] = self.similarity

        def similarity(self, obj1, obj2):
            y = self._model([obj1.vector, obj2.vector])
            return float(y[0])
@@ -8,18 +8,18 @@ include _spacy-101/_pipelines
    +h(2, "pipelines") How pipelines work
    include _processing-pipelines/_pipelines

- +section("examples")
-     +h(2, "examples") Examples
-     include _processing-pipelines/_examples
+ +section("custom-components")
+     +h(2, "custom-components") Creating custom pipeline components
+     include _processing-pipelines/_custom-components

+section("multithreading")
    +h(2, "multithreading") Multi-threading
    include _processing-pipelines/_multithreading

- +section("user-hooks")
-     +h(2, "user-hooks") User hooks
-     include _processing-pipelines/_user-hooks

+section("serialization")
    +h(2, "serialization") Serialization
    include _processing-pipelines/_serialization

+ +section("extensions")
+     +h(2, "extensions") Developing spaCy extensions
+     include _processing-pipelines/_extensions