Mirror of https://github.com/explosion/spaCy.git

Merge branch 'feature/pipeline-management' into feature/dot-underscore

Commit de374dc72a
				|  | @ -1,12 +1,9 @@ | |||
| # coding: utf8 | ||||
| from __future__ import absolute_import, unicode_literals | ||||
| from contextlib import contextmanager | ||||
| import dill | ||||
| 
 | ||||
| import numpy | ||||
| from thinc.neural import Model | ||||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.optimizers import Adam, SGD | ||||
| from thinc.neural.optimizers import Adam | ||||
| import random | ||||
| import ujson | ||||
| from collections import OrderedDict | ||||
|  | @ -17,24 +14,20 @@ from .vocab import Vocab | |||
| from .tagger import Tagger | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .syntax.parser import get_templates | ||||
| from .syntax import nonproj | ||||
| 
 | ||||
| from .pipeline import NeuralDependencyParser, EntityRecognizer | ||||
| from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer | ||||
| from .pipeline import NeuralLabeller | ||||
| from .pipeline import SimilarityHook | ||||
| from .pipeline import TextCategorizer | ||||
| from . import about | ||||
| from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger | ||||
| from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer | ||||
| 
 | ||||
| from .compat import json_dumps, izip | ||||
| from .scorer import Scorer | ||||
| from ._ml import link_vectors_to_models | ||||
| from .attrs import IS_STOP | ||||
| from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||
| from .lang.tag_map import TAG_MAP | ||||
| from .lang.lex_attrs import LEX_ATTRS | ||||
| from . import util | ||||
| from .scorer import Scorer | ||||
| from ._ml import link_vectors_to_models | ||||
| from . import about | ||||
| 
 | ||||
| 
 | ||||
| class BaseDefaults(object): | ||||
|  | @ -70,59 +63,7 @@ class BaseDefaults(object): | |||
|                          prefix_search=prefix_search, suffix_search=suffix_search, | ||||
|                          infix_finditer=infix_finditer, token_match=token_match) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_tagger(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralTagger(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralTagger(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_parser(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralDependencyParser(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_entity(cls, nlp=None, **cfg): | ||||
|         if nlp is None: | ||||
|             return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) | ||||
|         else: | ||||
|             return NeuralEntityRecognizer(nlp.vocab, **cfg) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def create_pipeline(cls, nlp=None, disable=tuple()): | ||||
|         meta = nlp.meta if nlp is not None else {} | ||||
|         # Resolve strings, like "cnn", "lstm", etc | ||||
|         pipeline = [] | ||||
|         for entry in meta.get('pipeline', []): | ||||
|             if entry in disable or getattr(entry, 'name', entry) in disable: | ||||
|                 continue | ||||
|             factory = cls.Defaults.factories[entry] | ||||
|             pipeline.append(factory(nlp, **meta.get(entry, {}))) | ||||
|         return pipeline | ||||
| 
 | ||||
|     factories = { | ||||
|         'make_doc': create_tokenizer, | ||||
|         'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], | ||||
|         'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], | ||||
|         'parser': lambda nlp, **cfg: [ | ||||
|             NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|             nonproj.deprojectivize], | ||||
|         'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], | ||||
|         'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], | ||||
|         'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], | ||||
|         # Temporary compatibility -- delete after pivot | ||||
|         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], | ||||
|         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], | ||||
|         'dependencies': lambda nlp, **cfg: [ | ||||
|             NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|             nonproj.deprojectivize, | ||||
|         ], | ||||
|         'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], | ||||
|     } | ||||
| 
 | ||||
|     pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||
|     token_match = TOKEN_MATCH | ||||
|     prefixes = tuple(TOKENIZER_PREFIXES) | ||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||
|  | @ -152,8 +93,17 @@ class Language(object): | |||
|     Defaults = BaseDefaults | ||||
|     lang = None | ||||
| 
 | ||||
|     def __init__(self, vocab=True, make_doc=True, pipeline=None, | ||||
|                  meta={}, disable=tuple(), **kwargs): | ||||
|     factories = { | ||||
|         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), | ||||
|         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), | ||||
|         'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), | ||||
|         'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), | ||||
|         'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), | ||||
|         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), | ||||
|         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) | ||||
|     } | ||||
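|     # Minimal usage sketch ('my_component' and MyComponent are hypothetical): | ||||
|     # each factory is looked up by name in create_pipe() and called with the | ||||
|     # nlp object plus any config, so custom factories can also be added to | ||||
|     # this dict, e.g. | ||||
|     #     Language.factories['my_component'] = lambda nlp, **cfg: MyComponent(nlp.vocab, **cfg) | ||||
|     #     component = nlp.create_pipe('my_component') | ||||
|     #     nlp.add_pipe(component, last=True) | ||||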
| 
 | ||||
|     def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): | ||||
|         """Initialise a Language object. | ||||
| 
 | ||||
|         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via | ||||
|  | @ -179,28 +129,7 @@ class Language(object): | |||
|             factory = self.Defaults.create_tokenizer | ||||
|             make_doc = factory(self, **meta.get('tokenizer', {})) | ||||
|         self.tokenizer = make_doc | ||||
|         if pipeline is True: | ||||
|             self.pipeline = self.Defaults.create_pipeline(self, disable) | ||||
|         elif pipeline: | ||||
|             # Careful not to do getattr(p, 'name', None) here | ||||
|             # If we had disable=[None], we'd disable everything! | ||||
|             self.pipeline = [p for p in pipeline | ||||
|                              if p not in disable | ||||
|                              and getattr(p, 'name', p) not in disable] | ||||
|             # Resolve strings, like "cnn", "lstm", etc | ||||
|             for i, entry in enumerate(self.pipeline): | ||||
|                 if entry in self.Defaults.factories: | ||||
|                     factory = self.Defaults.factories[entry] | ||||
|                     self.pipeline[i] = factory(self, **meta.get(entry, {})) | ||||
|         else: | ||||
|         self.pipeline = [] | ||||
|         flat_list = [] | ||||
|         for pipe in self.pipeline: | ||||
|             if isinstance(pipe, list): | ||||
|                 flat_list.extend(pipe) | ||||
|             else: | ||||
|                 flat_list.append(pipe) | ||||
|         self.pipeline = flat_list | ||||
|         self._optimizer = None | ||||
| 
 | ||||
|     @property | ||||
|  | @ -214,11 +143,7 @@ class Language(object): | |||
|         self._meta.setdefault('email', '') | ||||
|         self._meta.setdefault('url', '') | ||||
|         self._meta.setdefault('license', '') | ||||
|         pipeline = [] | ||||
|         for component in self.pipeline: | ||||
|             if hasattr(component, 'name'): | ||||
|                 pipeline.append(component.name) | ||||
|         self._meta['pipeline'] = pipeline | ||||
|         self._meta['pipeline'] = self.pipe_names | ||||
|         return self._meta | ||||
| 
 | ||||
|     @meta.setter | ||||
|  | @ -228,34 +153,137 @@ class Language(object): | |||
|     # Conveniences to access pipeline components | ||||
|     @property | ||||
|     def tensorizer(self): | ||||
|         return self.get_component('tensorizer') | ||||
|         return self.get_pipe('tensorizer') | ||||
| 
 | ||||
|     @property | ||||
|     def tagger(self): | ||||
|         return self.get_component('tagger') | ||||
|         return self.get_pipe('tagger') | ||||
| 
 | ||||
|     @property | ||||
|     def parser(self): | ||||
|         return self.get_component('parser') | ||||
|         return self.get_pipe('parser') | ||||
| 
 | ||||
|     @property | ||||
|     def entity(self): | ||||
|         return self.get_component('ner') | ||||
|         return self.get_pipe('ner') | ||||
| 
 | ||||
|     @property | ||||
|     def matcher(self): | ||||
|         return self.get_component('matcher') | ||||
|         return self.get_pipe('matcher') | ||||
| 
 | ||||
|     def get_component(self, name): | ||||
|         if self.pipeline in (True, None): | ||||
|             return None | ||||
|         for proc in self.pipeline: | ||||
|             if hasattr(proc, 'name') and proc.name.endswith(name): | ||||
|                 return proc | ||||
|         return None | ||||
|     @property | ||||
|     def pipe_names(self): | ||||
|         """Get names of available pipeline components. | ||||
| 
 | ||||
|         RETURNS (list): List of component name strings, in order. | ||||
|         """ | ||||
|         return [pipe_name for pipe_name, _ in self.pipeline] | ||||
| 
 | ||||
|     def get_pipe(self, name): | ||||
|         """Get a pipeline component for a given component name. | ||||
| 
 | ||||
|         name (unicode): Name of pipeline component to get. | ||||
|         RETURNS (callable): The pipeline component. | ||||
|         """ | ||||
|         for pipe_name, component in self.pipeline: | ||||
|             if pipe_name == name: | ||||
|                 return component | ||||
|         msg = "No component '{}' found in pipeline. Available names: {}" | ||||
|         raise KeyError(msg.format(name, self.pipe_names)) | ||||
| 
 | ||||
|     def create_pipe(self, name, config=dict()): | ||||
|         """Create a pipeline component from a factory. | ||||
| 
 | ||||
|         name (unicode): Factory name to look up in `Language.factories`. | ||||
|         config (dict): Configuration parameters to initialise component. | ||||
|         RETURNS (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.factories: | ||||
|             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||
|         factory = self.factories[name] | ||||
|         return factory(self, **config) | ||||
| 
 | ||||
|     def add_pipe(self, component, name=None, before=None, after=None, | ||||
|                  first=None, last=None): | ||||
|         """Add a component to the processing pipeline. Valid components are | ||||
|         callables that take a `Doc` object, modify it and return it. Only one of | ||||
|         before, after, first or last can be set. Default behaviour is "last". | ||||
| 
 | ||||
|         component (callable): The pipeline component. | ||||
|         name (unicode): Name of pipeline component. Overwrites existing | ||||
|             component.name attribute if available. If no name is set and | ||||
|             the component exposes no name attribute, component.__name__ is | ||||
|             used. An error is raised if the name already exists in the pipeline. | ||||
|         before (unicode): Component name to insert component directly before. | ||||
|         after (unicode): Component name to insert component directly after. | ||||
|         first (bool): Insert component first / not first in the pipeline. | ||||
|         last (bool): Insert component last / not last in the pipeline. | ||||
| 
 | ||||
|         EXAMPLE: | ||||
|             >>> nlp.add_pipe(component, before='ner') | ||||
|             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||
|         """ | ||||
|         if name is None: | ||||
|             name = getattr(component, 'name', component.__name__) | ||||
|         if name in self.pipe_names: | ||||
|             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||
|         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||
|             msg = ("Invalid constraints. You can only set one of the " | ||||
|                    "following: before, after, first, last.") | ||||
|             raise ValueError(msg) | ||||
|         pipe = (name, component) | ||||
|         if last or not any([first, before, after]): | ||||
|             self.pipeline.append(pipe) | ||||
|         elif first: | ||||
|             self.pipeline.insert(0, pipe) | ||||
|         elif before and before in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(before), pipe) | ||||
|         elif after and after in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) | ||||
|         else: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             unfound = before or after | ||||
|             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||
| 
 | ||||
|     def replace_pipe(self, name, component): | ||||
|         """Replace a component in the pipeline. | ||||
| 
 | ||||
|         name (unicode): Name of the component to replace. | ||||
|         component (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||
| 
 | ||||
|     def rename_pipe(self, old_name, new_name): | ||||
|         """Rename a pipeline component. | ||||
| 
 | ||||
|         old_name (unicode): Name of the component to rename. | ||||
|         new_name (unicode): New name of the component. | ||||
|         """ | ||||
|         if old_name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||
|         if new_name in self.pipe_names: | ||||
|             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||
|             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||
|         i = self.pipe_names.index(old_name) | ||||
|         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||
| 
 | ||||
|     def remove_pipe(self, name): | ||||
|         """Remove a component from the pipeline. | ||||
| 
 | ||||
|         name (unicode): Name of the component to remove. | ||||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|         return self.pipeline.pop(self.pipe_names.index(name)) | ||||
| 
 | ||||
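|     # The management methods above compose roughly as follows (sketch; | ||||
|     # my_component and other_component stand for any callables that take | ||||
|     # and return a Doc): | ||||
|     #     nlp.add_pipe(my_component, name='my_component', after='tagger') | ||||
|     #     nlp.rename_pipe('my_component', 'renamed_component') | ||||
|     #     nlp.replace_pipe('renamed_component', other_component) | ||||
|     #     name, removed = nlp.remove_pipe('renamed_component') | ||||
| 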
|     def __call__(self, text, disable=[]): | ||||
|         """'Apply the pipeline to some text. The text can span multiple sentences, | ||||
|         """Apply the pipeline to some text. The text can span multiple sentences, | ||||
|         and can contain arbitrary whitespace. Alignment into the original string | ||||
|         is preserved. | ||||
| 
 | ||||
|  | @ -269,8 +297,7 @@ class Language(object): | |||
|             ('An', 'NN') | ||||
|         """ | ||||
|         doc = self.make_doc(text) | ||||
|         for proc in self.pipeline: | ||||
|             name = getattr(proc, 'name', None) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             doc = proc(doc) | ||||
|  | @ -308,7 +335,7 @@ class Language(object): | |||
|             grads[key] = (W, dW) | ||||
|         pipes = list(self.pipeline) | ||||
|         random.shuffle(pipes) | ||||
|         for proc in pipes: | ||||
|         for name, proc in pipes: | ||||
|             if not hasattr(proc, 'update'): | ||||
|                 continue | ||||
|             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||
|  | @ -322,7 +349,7 @@ class Language(object): | |||
|         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. | ||||
|         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. | ||||
|         """ | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if hasattr(proc, 'preprocess_gold'): | ||||
|                 docs_golds = proc.preprocess_gold(docs_golds) | ||||
|         for doc, gold in docs_golds: | ||||
|  | @ -354,7 +381,7 @@ class Language(object): | |||
| 
 | ||||
|         get_gold_tuples (function): Function returning gold data | ||||
|         **cfg: Config parameters. | ||||
|         returns: An optimizer | ||||
|         RETURNS: An optimizer | ||||
|         """ | ||||
|         # Populate vocab | ||||
|         if get_gold_tuples is not None: | ||||
|  | @ -371,7 +398,7 @@ class Language(object): | |||
|         else: | ||||
|             device = None | ||||
|         link_vectors_to_models(self.vocab) | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if hasattr(proc, 'begin_training'): | ||||
|                 context = proc.begin_training(get_gold_tuples(), | ||||
|                                               pipeline=self.pipeline) | ||||
|  | @ -393,7 +420,7 @@ class Language(object): | |||
|         docs, golds = zip(*docs_golds) | ||||
|         docs = list(docs) | ||||
|         golds = list(golds) | ||||
|         for pipe in self.pipeline: | ||||
|         for name, pipe in self.pipeline: | ||||
|             if not hasattr(pipe, 'pipe'): | ||||
|                 for doc in docs: | ||||
|                     pipe(doc) | ||||
|  | @ -419,7 +446,7 @@ class Language(object): | |||
|             >>> with nlp.use_params(optimizer.averages): | ||||
|             >>>     nlp.to_disk('/tmp/checkpoint') | ||||
|         """ | ||||
|         contexts = [pipe.use_params(params) for pipe | ||||
|         contexts = [pipe.use_params(params) for name, pipe | ||||
|                     in self.pipeline if hasattr(pipe, 'use_params')] | ||||
|         # TODO: Having trouble with contextlib | ||||
|         # Workaround: these aren't actually context managers atm. | ||||
|  | @ -466,8 +493,7 @@ class Language(object): | |||
|                 yield (doc, context) | ||||
|             return | ||||
|         docs = (self.make_doc(text) for text in texts) | ||||
|         for proc in self.pipeline: | ||||
|             name = getattr(proc, 'name', None) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if hasattr(proc, 'pipe'): | ||||
|  | @ -495,14 +521,14 @@ class Language(object): | |||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||
|         )) | ||||
|         for proc in self.pipeline: | ||||
|         for name, proc in self.pipeline: | ||||
|             if not hasattr(proc, 'name'): | ||||
|                 continue | ||||
|             if proc.name in disable: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_disk'): | ||||
|                 continue | ||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||
|             serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||
|         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||
|         util.to_disk(path, serializers, {p: False for p in disable}) | ||||
| 
 | ||||
|  | @ -526,14 +552,12 @@ class Language(object): | |||
|             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), | ||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||
|         )) | ||||
|         for proc in self.pipeline: | ||||
|             if not hasattr(proc, 'name'): | ||||
|                 continue | ||||
|             if proc.name in disable: | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_disk'): | ||||
|                 continue | ||||
|             deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||
|             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||
|         exclude = {p: False for p in disable} | ||||
|         if not (path / 'vocab').exists(): | ||||
|             exclude['vocab'] = True | ||||
|  | @ -552,8 +576,8 @@ class Language(object): | |||
|             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), | ||||
|             ('meta', lambda: ujson.dumps(self.meta)) | ||||
|         )) | ||||
|         for i, proc in enumerate(self.pipeline): | ||||
|             if getattr(proc, 'name', None) in disable: | ||||
|         for i, (name, proc) in enumerate(self.pipeline): | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'to_bytes'): | ||||
|                 continue | ||||
|  | @ -572,8 +596,8 @@ class Language(object): | |||
|             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), | ||||
|             ('meta', lambda b: self.meta.update(ujson.loads(b))) | ||||
|         )) | ||||
|         for i, proc in enumerate(self.pipeline): | ||||
|             if getattr(proc, 'name', None) in disable: | ||||
|         for i, (name, proc) in enumerate(self.pipeline): | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, 'from_bytes'): | ||||
|                 continue | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity | |||
| from .tokens.doc cimport Doc | ||||
| from .syntax.parser cimport Parser as LinearParser | ||||
| from .syntax.nn_parser cimport Parser as NeuralParser | ||||
| from .syntax import nonproj | ||||
| from .syntax.parser import get_templates as get_feature_templates | ||||
| from .syntax.beam_parser cimport BeamParser | ||||
| from .syntax.ner cimport BiluoPushDown | ||||
|  | @ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser): | |||
|         if isinstance(label, basestring): | ||||
|             label = self.vocab.strings[label] | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
| 
 | ||||
| cdef class NeuralDependencyParser(NeuralParser): | ||||
|     name = 'parser' | ||||
|     TransitionSystem = ArcEager | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||
|         for target in []: | ||||
|             labeller = NeuralLabeller(self.vocab, target=target) | ||||
|  | @ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser): | |||
|         if isinstance(label, basestring): | ||||
|             label = self.vocab.strings[label] | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         return [nonproj.deprojectivize] | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | ||||
|            'BeamEntityRecognizer', 'TokenVectorEncoder'] | ||||
|  |  | |||
|  | @ -779,6 +779,14 @@ cdef class Parser: | |||
|             for i in range(doc.length): | ||||
|                 doc.c[i] = state.c._sent[i] | ||||
|             self.moves.finalize_doc(doc) | ||||
|             for hook in self.postprocesses: | ||||
|                 for doc in docs: | ||||
|                     hook(doc) | ||||
| 
 | ||||
|     @property | ||||
|     def postprocesses(self): | ||||
|         # Available for subclasses, e.g. to deprojectivize | ||||
|         return [] | ||||
| 
 | ||||
|     def add_label(self, label): | ||||
|         for action in self.moves.action_types: | ||||
|  |  | |||
|  | @ -58,8 +58,9 @@ def en_vocab(): | |||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def en_parser(): | ||||
|     return util.get_lang_class('en').Defaults.create_parser() | ||||
| def en_parser(en_vocab): | ||||
|     nlp = util.get_lang_class('en')(en_vocab) | ||||
|     return nlp.create_pipe('parser') | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
|  |  | |||
|  | @ -1,10 +1,11 @@ | |||
| import spacy | ||||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| @pytest.mark.models | ||||
| def test_beam_parse(): | ||||
|     nlp = spacy.load('en_core_web_sm') | ||||
|     doc = nlp(u'Australia is a country', disable=['ner']) | ||||
|     ents = nlp.entity(doc, beam_width=2) | ||||
|     print(ents) | ||||
| 
 | ||||
| @pytest.mark.models('en') | ||||
| def test_beam_parse(EN): | ||||
|     doc = EN(u'Australia is a country', disable=['ner']) | ||||
|     ents = EN.entity(doc, beam_width=2) | ||||
|     print(ents) | ||||
|  |  | |||
							
								
								
									
spacy/tests/pipeline/__init__.py (new file, 0 lines)

spacy/tests/pipeline/test_pipe_methods.py (new file, 84 lines)
							|  | @ -0,0 +1,84 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from ...language import Language | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def nlp(): | ||||
|     return Language() | ||||
| 
 | ||||
| 
 | ||||
| def new_pipe(doc): | ||||
|     return doc | ||||
| 
 | ||||
| 
 | ||||
| def test_add_pipe_no_name(nlp): | ||||
|     nlp.add_pipe(new_pipe) | ||||
|     assert 'new_pipe' in nlp.pipe_names | ||||
| 
 | ||||
| 
 | ||||
| def test_add_pipe_duplicate_name(nlp): | ||||
|     nlp.add_pipe(new_pipe, name='duplicate_name') | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.add_pipe(new_pipe, name='duplicate_name') | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['parser']) | ||||
| def test_add_pipe_first(nlp, name): | ||||
|     nlp.add_pipe(new_pipe, name=name, first=True) | ||||
|     assert nlp.pipeline[0][0] == name | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) | ||||
| def test_add_pipe_last(nlp, name1, name2): | ||||
|     nlp.add_pipe(lambda doc: doc, name=name2) | ||||
|     nlp.add_pipe(new_pipe, name=name1, last=True) | ||||
|     assert nlp.pipeline[0][0] != name1 | ||||
|     assert nlp.pipeline[-1][0] == name1 | ||||
| 
 | ||||
| 
 | ||||
| def test_cant_add_pipe_first_and_last(nlp): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.add_pipe(new_pipe, first=True, last=True) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['my_component']) | ||||
| def test_get_pipe(nlp, name): | ||||
|     with pytest.raises(KeyError): | ||||
|         nlp.get_pipe(name) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     assert nlp.get_pipe(name) == new_pipe | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) | ||||
| def test_replace_pipe(nlp, name, replacement): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.replace_pipe(name, new_pipe) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     nlp.replace_pipe(name, replacement) | ||||
|     assert nlp.get_pipe(name) != new_pipe | ||||
|     assert nlp.get_pipe(name) == replacement | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) | ||||
| def test_rename_pipe(nlp, old_name, new_name): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.rename_pipe(old_name, new_name) | ||||
|     nlp.add_pipe(new_pipe, name=old_name) | ||||
|     nlp.rename_pipe(old_name, new_name) | ||||
|     assert nlp.pipeline[0][0] == new_name | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('name', ['my_component']) | ||||
| def test_remove_pipe(nlp, name): | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp.remove_pipe(name) | ||||
|     nlp.add_pipe(new_pipe, name=name) | ||||
|     assert len(nlp.pipeline) == 1 | ||||
|     removed_name, removed_component = nlp.remove_pipe(name) | ||||
|     assert not len(nlp.pipeline) | ||||
|     assert removed_name == name | ||||
|     assert removed_component == new_pipe | ||||
|  | @ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides): | |||
|     if not meta: | ||||
|         meta = get_model_meta(model_path) | ||||
|     cls = get_lang_class(meta['lang']) | ||||
|     nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) | ||||
|     nlp = cls(meta=meta, **overrides) | ||||
|     pipeline = meta.get('pipeline', []) | ||||
|     disable = overrides.get('disable', []) | ||||
|     if pipeline is True: | ||||
|         pipeline = nlp.Defaults.pipe_names | ||||
|     elif pipeline in (False, None): | ||||
|         pipeline = [] | ||||
|     for name in pipeline: | ||||
|         if name not in disable: | ||||
|             config = meta.get('pipeline_args', {}).get(name, {}) | ||||
|             component = nlp.create_pipe(name, config=config) | ||||
|             nlp.add_pipe(component, name=name) | ||||
|     return nlp.from_disk(model_path) | ||||
| 
 | ||||
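| # Sketch of the meta.json fields the loop above consumes (illustrative values): | ||||
| #     { | ||||
| #         "lang": "en", | ||||
| #         "pipeline": ["tagger", "parser", "ner"], | ||||
| #         "pipeline_args": {"tagger": {}, "parser": {}, "ner": {}} | ||||
| #     } | ||||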
| 
 | ||||
|  |  | |||
|  | @ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) | |||
| 
 | ||||
| //- Code blocks to display old/new versions | ||||
| 
 | ||||
| mixin code-compare() | ||||
|     span.u-inline-block.u-padding-top.u-width-full | ||||
|         block | ||||
| 
 | ||||
| mixin code-old() | ||||
|     +code(false, false, false, false, "reject").o-block-small | ||||
|         block | ||||
|  |  | |||
|  | @ -43,6 +43,20 @@ p | |||
|         +cell #[code Language] | ||||
|         +cell A #[code Language] object with the loaded model. | ||||
| 
 | ||||
| p | ||||
|     |  Essentially, #[code spacy.load()] is a convenience wrapper that reads | ||||
|     |  the language ID and pipeline components from a model's #[code meta.json], | ||||
|     |  initialises the #[code Language] class, loads in the model data and | ||||
|     |  returns it. | ||||
| 
 | ||||
| +code("Abstract example"). | ||||
|     cls = util.get_lang_class(lang)         #  get language for ID, e.g. 'en' | ||||
|     nlp = cls()                             #  initialise the language | ||||
|     for name in pipeline: | ||||
|         component = nlp.create_pipe(name)   #  create each pipeline component | ||||
|         nlp.add_pipe(component)             #  add component to pipeline | ||||
|     nlp.from_disk(model_data_path)          #  load in model data | ||||
| 
 | ||||
| +infobox("Deprecation note", "⚠️") | ||||
|     .o-block | ||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||
|  | @ -141,37 +155,3 @@ p | |||
|         +cell returns | ||||
|         +cell unicode | ||||
|         +cell The explanation, or #[code None] if not found in the glossary. | ||||
| 
 | ||||
| +h(3, "spacy.set_factory") spacy.set_factory | ||||
|     +tag function | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Set a factory that returns a custom | ||||
|     |  #[+a("/usage/processing-pipelines") processing pipeline] | ||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_factory(vocab): | ||||
|         def my_component(doc): | ||||
|             return doc | ||||
|         return my_component | ||||
| 
 | ||||
|     spacy.set_factory('my_factory', my_factory) | ||||
|     nlp = Language(pipeline=['my_factory']) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code factory_id] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Unique name of factory. If added to a new pipeline, spaCy will | ||||
|             |  look up the factory for this ID and use it to create the | ||||
|             |  component. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code factory] | ||||
|         +cell callable | ||||
|         +cell | ||||
|             |  Callable that takes a #[code Vocab] object and returns a pipeline | ||||
|             |  component. | ||||
|  |  | |||
|  | @ -4,7 +4,14 @@ include ../_includes/_mixins | |||
| 
 | ||||
| p | ||||
|     |  Usually you'll load this once per process as #[code nlp] and pass the | ||||
|     |  instance around your application. | ||||
|     |  instance around your application. The #[code Language] class is created | ||||
|     |  when you call #[+api("spacy#load") #[code spacy.load()]] and contains | ||||
|     |  the shared vocabulary and #[+a("/usage/adding-languages") language data], | ||||
|     |  optional model data loaded from a #[+a("/models") model package] or | ||||
|     |  a path, and a #[+a("/usage/processing-pipelines") processing pipeline] | ||||
|     |  containing components like the tagger or parser that are called on a | ||||
|     |  document in order. You can also add your own processing pipeline | ||||
|     |  components that take a #[code Doc] object, modify it and return it. | ||||
| 
 | ||||
| +h(2, "init") Language.__init__ | ||||
|     +tag method | ||||
|  | @ -12,9 +19,9 @@ p | |||
| p Initialise a #[code Language] object. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.vocab import Vocab | ||||
|     from spacy.language import Language | ||||
|     nlp = Language(pipeline=['token_vectors', 'tags', | ||||
|                              'dependencies']) | ||||
|     nlp = Language(Vocab()) | ||||
| 
 | ||||
|     from spacy.lang.en import English | ||||
|     nlp = English() | ||||
|  | @ -34,14 +41,6 @@ p Initialise a #[code Language] object. | |||
|             |  A function that takes text and returns a #[code Doc] object. | ||||
|             |  Usually a #[code Tokenizer]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code pipeline] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  A list of annotation processes or IDs of annotation, processes, | ||||
|             |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked | ||||
|             |  up in #[code Language.Defaults.factories]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code meta] | ||||
|         +cell dict | ||||
|  | @ -235,7 +234,6 @@ p | |||
|     |  Can be called before training to pre-process gold data. By default, it | ||||
|     |  handles nonprojectivity and adds missing tags to the tag map. | ||||
| 
 | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code docs_golds] | ||||
|  | @ -247,6 +245,177 @@ p | |||
|         +cell tuple | ||||
|         +cell Tuples of #[code Doc] and #[code GoldParse] objects. | ||||
| 
 | ||||
| +h(2, "create_pipe") Language.create_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Create a pipeline component from a factory. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     parser = nlp.create_pipe('parser') | ||||
|     nlp.add_pipe(parser) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Factory name to look up in | ||||
|             |  #[+api("language#class-attributes") #[code Language.factories]]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code config] | ||||
|         +cell dict | ||||
|         +cell Configuration parameters to initialise component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| +h(2, "add_pipe") Language.add_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Add a component to the processing pipeline. Valid components are | ||||
|     |  callables that take a #[code Doc] object, modify it and return it. Only | ||||
|     |  one of #[code before], #[code after], #[code first] or #[code last] can | ||||
|     |  be set. Default behaviour is #[code last=True]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def component(doc): | ||||
|         # modify Doc and return it | ||||
|         return doc | ||||
| 
 | ||||
|     nlp.add_pipe(component, before='ner') | ||||
|     nlp.add_pipe(component, name='custom_name', last=True) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code component] | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell | ||||
|             |  Name of pipeline component. Overwrites existing | ||||
|             |  #[code component.name] attribute if available. If no #[code name] | ||||
|             |  is set and the component exposes no name attribute, | ||||
|             |  #[code component.__name__] is used. An error is raised if the | ||||
|             |  name already exists in the pipeline. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code before] | ||||
|         +cell unicode | ||||
|         +cell Component name to insert component directly before. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code after] | ||||
|         +cell unicode | ||||
|         +cell Component name to insert component directly after. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code first] | ||||
|         +cell bool | ||||
|         +cell Insert component first / not first in the pipeline. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code last] | ||||
|         +cell bool | ||||
|         +cell Insert component last / not last in the pipeline. | ||||
| 
 | ||||
| +h(2, "get_pipe") Language.get_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Get a pipeline component for a given component name. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     parser = nlp.get_pipe('parser') | ||||
|     custom_component = nlp.get_pipe('custom_component') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the pipeline component to get. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| +h(2, "replace_pipe") Language.replace_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Replace a component in the pipeline. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     nlp.replace_pipe('parser', my_custom_parser) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to replace. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code component] | ||||
|         +cell callable | ||||
|         +cell The pipeline component to insert. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "rename_pipe") Language.rename_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Rename a component in the pipeline. Useful to create custom names for | ||||
|     |  pre-defined and pre-loaded components. To change the default name of | ||||
|     |  a component added to the pipeline, you can also use the #[code name] | ||||
|     |  argument on #[+api("language#add_pipe") #[code add_pipe]]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     nlp.rename_pipe('parser', 'spacy_parser') | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code old_name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to rename. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code new_name] | ||||
|         +cell unicode | ||||
|         +cell New name of the component. | ||||
| 
 | ||||
| +h(2, "remove_pipe") Language.remove_pipe | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Remove a component from the pipeline. Returns the removed component name | ||||
|     |  and component function. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     name, component = nlp.remove_pipe('parser') | ||||
|     assert name == 'parser' | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code name] | ||||
|         +cell unicode | ||||
|         +cell Name of the component to remove. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell tuple | ||||
|         +cell A #[code (name, component)] tuple of the removed component. | ||||
| 
 | ||||
| +h(2, "to_disk") Language.to_disk | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
|  | @ -399,7 +568,15 @@ p Load state from a binary string. | |||
|     +row | ||||
|         +cell #[code pipeline] | ||||
|         +cell list | ||||
|         +cell Sequence of annotation functions. | ||||
|         +cell | ||||
|             |  List of #[code (name, component)] tuples describing the current | ||||
|             |  processing pipeline, in order. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code pipe_names] | ||||
|             +tag-new(2) | ||||
|         +cell list | ||||
|         +cell List of pipeline component names, in order. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code meta] | ||||
|  | @ -424,3 +601,12 @@ p Load state from a binary string. | |||
|         +cell | ||||
|             |  Two-letter language ID, i.e. | ||||
|             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code factories] | ||||
|             +tag-new(2) | ||||
|         +cell dict | ||||
|         +cell | ||||
|             |  Factories that create pre-defined pipeline components, e.g. the | ||||
|             |  tagger, parser or entity recognizer, keyed by their component | ||||
|             |  name. | ||||
|  |  | |||
|  | @ -143,6 +143,9 @@ | |||
| 
 | ||||
| //- Layout | ||||
| 
 | ||||
| .u-width-full | ||||
|     width: 100% | ||||
| 
 | ||||
| .u-float-left | ||||
|     float: left | ||||
|     margin-right: 1rem | ||||
|  | @ -166,6 +169,9 @@ | |||
| .u-padding-medium | ||||
|     padding: 1.8rem | ||||
| 
 | ||||
| .u-padding-top | ||||
|     padding-top: 2rem | ||||
| 
 | ||||
| .u-inline-block | ||||
|     display: inline-block | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ | |||
|         display: inline-block | ||||
|         font-size: 0.6em | ||||
|         font-weight: bold | ||||
|         padding-right: 1.25rem | ||||
|         padding-right: 1em | ||||
|         margin-left: -3.75rem | ||||
|         text-align: right | ||||
|         width: 2.5rem | ||||
|  |  | |||
|  | @ -103,11 +103,11 @@ | |||
|         "title": "Language Processing Pipelines", | ||||
|         "next": "vectors-similarity", | ||||
|         "menu": { | ||||
|             "How pipelines work": "pipelines", | ||||
|             "Examples": "examples", | ||||
|             "How Pipelines Work": "pipelines", | ||||
|             "Custom Components": "custom-components", | ||||
|             "Multi-threading": "multithreading", | ||||
|             "User Hooks": "user-hooks", | ||||
|             "Serialization": "serialization" | ||||
|             "Serialization": "serialization", | ||||
|             "Developing Extensions": "extensions" | ||||
|         } | ||||
|     }, | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
website/usage/_processing-pipelines/_custom-components.jade (new file, 151 lines)
							|  | @ -0,0 +1,151 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS | ||||
| 
 | ||||
| p | ||||
|     |  A component receives a #[code Doc] object and | ||||
|     |  #[strong performs the actual processing] – for example, using the current | ||||
|     |  weights to make a prediction and set some annotation on the document. By | ||||
|     |  adding a component to the pipeline, you'll get access to the #[code Doc] | ||||
|     |  at any point #[strong during] processing – instead of only being able to | ||||
|     |  modify it afterwards. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_component(doc): | ||||
|         # do something to the doc here | ||||
|         return doc | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code doc] | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by the previous component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by this pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  Custom components can be added to the pipeline using the | ||||
|     |  #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you | ||||
|     |  can either specify a component to add it before or after, tell spaCy | ||||
|     |  to add it first or last in the pipeline, or define a custom name. | ||||
|     |  If no name is set and no #[code name] attribute is present on your | ||||
|     |  component, the function name, e.g. #[code component.__name__], is used. | ||||
| 
 | ||||
| +code("Adding pipeline components"). | ||||
|     def my_component(doc): | ||||
|         print("After tokenization, this doc has %s tokens." % len(doc)) | ||||
|         if len(doc) < 10: | ||||
|             print("This is a pretty short document.") | ||||
|         return doc | ||||
| 
 | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.add_pipe(my_component, name='print_info', first=True) | ||||
|     print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] | ||||
|     doc = nlp(u"This is a sentence.") | ||||
| 
 | ||||
| p | ||||
|     |  Of course, you can also wrap your component as a class to allow | ||||
|     |  initialising it with custom settings and holding state within the component. | ||||
|     |  This is useful for #[strong stateful components], especially ones which | ||||
|     |  #[strong depend on shared data]. | ||||
| 
 | ||||
| +code. | ||||
|     class MyComponent(object): | ||||
|         name = 'print_info' | ||||
| 
 | ||||
|         def __init__(self, vocab, short_limit=10): | ||||
|             self.vocab = vocab | ||||
|             self.short_limit = short_limit | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             if len(doc) < self.short_limit: | ||||
|                 print("This is a pretty short document.") | ||||
|             return doc | ||||
| 
 | ||||
|     my_component = MyComponent(nlp.vocab, short_limit=25) | ||||
|     nlp.add_pipe(my_component, first=True) | ||||
| 
 | ||||
| +h(3, "custom-components-attributes") | ||||
|     |  Setting attributes on the #[code Doc], #[code Span] and #[code Token] | ||||
| 
 | ||||
| +aside("Why ._?") | ||||
|     |  Writing to a #[code ._] attribute instead of to the #[code Doc] directly | ||||
|     |  keeps a clearer separation and makes it easier to ensure backwards | ||||
|     |  compatibility. For example, if you've implemented your own #[code .coref] | ||||
|     |  property and spaCy claims it one day, it'll break your code. Similarly, | ||||
|     |  just by looking at the code, you'll immediately know what's built-in and | ||||
|     |  what's custom – for example, #[code doc.sentiment] is spaCy, while | ||||
|     |  #[code doc._.sent_score] isn't. | ||||
| 
 | ||||
| +under-construction | ||||
| 
 | ||||
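| p | ||||
|     |  A rough sketch of what this could look like (the registration API is | ||||
|     |  still being finalised, so the #[code set_extension] call and the | ||||
|     |  #[code sent_score] attribute below are illustrative only): | ||||
| 
| +code("Custom attributes (sketch)"). | ||||
|     from spacy.tokens import Doc | ||||
| 
|     # hypothetical registration call; the final API may differ | ||||
|     Doc.set_extension('sent_score', default=None) | ||||
| 
|     doc = nlp(u"This is a sentence.") | ||||
|     doc._.sent_score = 0.9            # write the custom attribute | ||||
|     assert doc._.sent_score == 0.9    # read it back | ||||
| 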
| +h(3, "custom-components-user-hooks") Other user hooks | ||||
| 
 | ||||
| p | ||||
|     |  While it's generally recommended to use the #[code Doc._], #[code Span._] | ||||
|     |  and #[code Token._] proxies to add your own custom attributes, spaCy | ||||
|     |  offers a few exceptions to allow #[strong customising the built-in methods] | ||||
|     |  like #[+api("doc#similarity") #[code Doc.similarity]] or | ||||
|     |  #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can | ||||
|     |  rely on statistical models you train yourself. For instance, you can | ||||
|     |  provide your own on-the-fly sentence segmentation algorithm or document | ||||
|     |  similarity method. | ||||
| 
 | ||||
| p | ||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||
|     |  #[code Span] or #[code Token] objects by adding a component to the | ||||
|     |  pipeline. For instance, to customize the | ||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||
|     |  component that sets a custom function to | ||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||
|     |  method will check the #[code user_hooks] dict, and delegate to your | ||||
|     |  function if you've set one. Similar results can be achieved by setting | ||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||
| 
 | ||||
| +aside("Implementation note") | ||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||
|     |  #[code Token] objects are created lazily, and don't own any data. They | ||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||
|     |  here — we only have to worry about installing hooks in one place. | ||||
| 
 | ||||
| +table(["Name", "Customises"]) | ||||
|     +row | ||||
|         +cell #[code user_hooks] | ||||
|         +cell | ||||
|             +api("doc#vector") #[code Doc.vector] | ||||
|             +api("doc#has_vector") #[code Doc.has_vector] | ||||
|             +api("doc#vector_norm") #[code Doc.vector_norm] | ||||
|             +api("doc#sents") #[code Doc.sents] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_token_hooks] | ||||
|         +cell | ||||
|             +api("token#similarity") #[code Token.similarity] | ||||
|             +api("token#vector") #[code Token.vector] | ||||
|             +api("token#has_vector") #[code Token.has_vector] | ||||
|             +api("token#vector_norm") #[code Token.vector_norm] | ||||
|             +api("token#conjuncts") #[code Token.conjuncts] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_span_hooks] | ||||
|         +cell | ||||
|             +api("span#similarity") #[code Span.similarity] | ||||
|             +api("span#vector") #[code Span.vector] | ||||
|             +api("span#has_vector") #[code Span.has_vector] | ||||
|             +api("span#vector_norm") #[code Span.vector_norm] | ||||
|             +api("span#root") #[code Span.root] | ||||
| 
 | ||||
| +code("Add custom similarity hooks"). | ||||
|     class SimilarityModel(object): | ||||
|         def __init__(self, model): | ||||
|             self._model = model | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc.user_hooks['similarity'] = self.similarity | ||||
|             doc.user_span_hooks['similarity'] = self.similarity | ||||
|             doc.user_token_hooks['similarity'] = self.similarity | ||||
| 
 | ||||
|         def similarity(self, obj1, obj2): | ||||
|             y = self._model([obj1.vector, obj2.vector]) | ||||
|             return float(y[0]) | ||||
							
								
								
									
website/usage/_processing-pipelines/_extensions.jade (new file, 3 lines)
							|  | @ -0,0 +1,3 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS | ||||
| 
 | ||||
| +under-construction | ||||
|  | @ -11,7 +11,7 @@ p | |||
| 
 | ||||
| p | ||||
|     |  When you load a model, spaCy first consults the model's | ||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. The | ||||
|     |  #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | ||||
|     |  meta typically includes the model details, the ID of a language class, | ||||
|     |  and an optional list of pipeline components. spaCy then does the | ||||
|     |  following: | ||||
|  | @ -21,24 +21,26 @@ p | |||
|         "name": "example_model", | ||||
|         "lang": "en" | ||||
|         "description": "Example model for spaCy", | ||||
|         "pipeline": ["tensorizer", "tagger"] | ||||
|         "pipeline": ["tagger", "parser"] | ||||
|     } | ||||
| 
 | ||||
| +list("numbers") | ||||
|     +item | ||||
|         |  Look up #[strong pipeline IDs] in the available | ||||
|         |  #[strong pipeline factories]. | ||||
|     +item | ||||
|         |  Initialise the #[strong pipeline components] by calling their | ||||
|         |  factories with the #[code Vocab] as an argument. This gives each | ||||
|         |  factory and component access to the pipeline's shared data, like | ||||
|         |  strings, morphology and annotation scheme. | ||||
|     +item | ||||
|         |  Load the #[strong language class and data] for the given ID via | ||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]]. | ||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]] and initialise | ||||
|         |  it. The #[code Language] class contains the shared vocabulary, | ||||
|         |  tokenization rules and the language-specific annotation scheme. | ||||
|     +item | ||||
|         |  Pass the path to the #[strong model data] to the #[code Language] | ||||
|         |  class and return it. | ||||
|         |  Iterate over the #[strong pipeline names] and create each component | ||||
|         |  using #[+api("language#create_pipe") #[code create_pipe]], which | ||||
|         |  looks them up in #[code Language.factories]. | ||||
|     +item | ||||
|         |  Add each pipeline component to the pipeline in order, using | ||||
|         |  #[+api("language#add_pipe") #[code add_pipe]]. | ||||
|     +item | ||||
|         |  Make the #[strong model data] available to the #[code Language] class | ||||
|         |  by calling #[+api("language#from_disk") #[code from_disk]] with the | ||||
|         |  path to the model data directory. | ||||
| 
 | ||||
| p | ||||
|     |  So when you call this... | ||||
|  | @ -47,12 +49,12 @@ p | |||
|     nlp = spacy.load('en') | ||||
| 
 | ||||
| p | ||||
|     | ... the model tells spaCy to use the pipeline | ||||
|     | ... the model tells spaCy to use the language #[code "en"] and the pipeline | ||||
|     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will | ||||
|     |  then look up each string in its internal factories registry and | ||||
|     |  initialise the individual components. It'll then load | ||||
|     |  #[code spacy.lang.en.English], pass it the path to the model's data | ||||
|     |  directory, and return it for you to use as the #[code nlp] object. | ||||
|     |  then initialise #[code spacy.lang.en.English], and create each pipeline | ||||
|     |  component and add it to the processing pipeline. It'll then load in the | ||||
|     |  model's data from its data directory and return the modified | ||||
|     |  #[code Language] class for you to use as the #[code nlp] object. | ||||
| 
 | ||||
| p | ||||
|     |  Fundamentally, a #[+a("/models") spaCy model] consists of three | ||||
|  | @ -74,8 +76,11 @@ p | |||
|     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' | ||||
| 
 | ||||
|     cls = spacy.util.get_lang_class(lang)   # 1. get the language class, e.g. English | ||||
|     nlp = cls(pipeline=pipeline)           # 2. initialise it with the pipeline | ||||
|     nlp.from_disk(model_data_path)         # 3. load in the binary data | ||||
|     nlp = cls()                             # 2. initialise it | ||||
|     for name in pipeline: | ||||
|         component = nlp.create_pipe(name)   # 3. create the pipeline components | ||||
|         nlp.add_pipe(component)             # 4. add the component to the pipeline | ||||
|     nlp.from_disk(model_data_path)          # 5. load in the binary data | ||||
| 
 | ||||
| p | ||||
|     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and | ||||
|  | @ -87,124 +92,23 @@ p | |||
|     |  document, which is then processed by the component next in the pipeline. | ||||
| 
 | ||||
| +code("The pipeline under the hood"). | ||||
|     doc = nlp.make_doc(u'This is a sentence') | ||||
|     for proc in nlp.pipeline: | ||||
|         doc = proc(doc) | ||||
| 
 | ||||
| +h(3, "creating") Creating pipeline components and factories | ||||
|     doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text | ||||
|     for name, proc in nlp.pipeline:             # iterate over components in order | ||||
|         doc = proc(doc)                         # apply each component | ||||
| 
 | ||||
| p | ||||
|     |  spaCy lets you customise the pipeline with your own components. Components | ||||
|     |  are functions that receive a #[code Doc] object, modify and return it. | ||||
|     |  If your component is stateful, you'll want to create a new one for each | ||||
|     |  pipeline. You can do that by defining and registering a factory which | ||||
|     |  receives the shared #[code Vocab] object and returns a component. | ||||
| 
 | ||||
| +h(4, "creating-component") Creating a  component | ||||
| 
 | ||||
| p | ||||
|     |  A component receives a #[code Doc] object and | ||||
|     |  #[strong performs the actual processing] – for example, using the current | ||||
|     |  weights to make a prediction and set some annotation on the document. By | ||||
|     |  adding a component to the pipeline, you'll get access to the #[code Doc] | ||||
|     |  at any point #[strong during] processing – instead of only being able to | ||||
|     |  modify it afterwards. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_component(doc): | ||||
|         # do something to the doc here | ||||
|         return doc | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code doc] | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by the previous component. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell #[code Doc] | ||||
|         +cell The #[code Doc] object processed by this pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  When creating a new #[code Language] class, you can pass it a list of | ||||
|     |  pipeline component functions to execute in that order. You can also | ||||
|     |  add it to an existing pipeline by modifying #[code nlp.pipeline] – just | ||||
|     |  be careful not to overwrite a pipeline or its components by accident! | ||||
|     |  The current processing pipeline is available as #[code nlp.pipeline], | ||||
|     |  which returns a list of #[code (name, component)] tuples, or | ||||
|     |  #[code nlp.pipe_names], which only returns a list of human-readable | ||||
|     |  component names. | ||||
| 
 | ||||
| +code. | ||||
|     # Create a new Language object with a pipeline | ||||
|     from spacy.language import Language | ||||
|     nlp = Language(pipeline=[my_component]) | ||||
|     nlp.pipeline | ||||
|     # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] | ||||
|     nlp.pipe_names | ||||
|     # ['tagger', 'parser', 'ner'] | ||||
| 
 | ||||
|     # Modify an existing pipeline | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline.append(my_component) | ||||
| 
 | ||||
| +h(4, "creating-factory") Creating a factory | ||||
| 
 | ||||
| p | ||||
|     |  A factory is a #[strong function that returns a pipeline component]. | ||||
|     |  It's called with the #[code Vocab] object, to give it access to the | ||||
|     |  shared data between components – for example, the strings, morphology, | ||||
|     |  vectors or annotation scheme. Factories are useful for creating | ||||
|     |  #[strong stateful components], especially ones which | ||||
|     |  #[strong depend on shared data]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     def my_factory(vocab): | ||||
|         # load some state | ||||
|         def my_component(doc): | ||||
|             # process the doc | ||||
|             return doc | ||||
|         return my_component | ||||
| 
 | ||||
| +table(["Argument", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code vocab] | ||||
|         +cell #[code Vocab] | ||||
|         +cell | ||||
|             |  Shared data between components, including strings, morphology, | ||||
|             |  vectors etc. | ||||
| 
 | ||||
|     +row("foot") | ||||
|         +cell returns | ||||
|         +cell callable | ||||
|         +cell The pipeline component. | ||||
| 
 | ||||
| p | ||||
|     |  By creating a factory, you're essentially telling spaCy how to get the | ||||
|     |  pipeline component #[strong once the vocab is available]. Factories need to | ||||
|     |  be registered via #[+api("spacy#set_factory") #[code set_factory()]] and | ||||
|     |  assigned a unique ID. This ID can then be added to the pipeline as a | ||||
|     |  string. When creating a pipeline, you're free to mix strings and | ||||
|     |  callable components: | ||||
| 
 | ||||
| +code. | ||||
|     spacy.set_factory('my_factory', my_factory) | ||||
|     nlp = Language(pipeline=['my_factory', my_other_component]) | ||||
| 
 | ||||
| p | ||||
|     |  If spaCy comes across a string in the pipeline, it will try to resolve it | ||||
|     |  by looking it up in the available factories. The factory will then be | ||||
|     |  initialised with the #[code Vocab]. Providing factory names instead of | ||||
|     |  callables also makes it easy to specify them in the model's | ||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. If you're | ||||
|     |  training your own model and want to use one of spaCy's default components, | ||||
|     |  you won't have to worry about finding and implementing it either – to use | ||||
|     |  the default tagger, simply add #[code "tagger"] to the pipeline, and | ||||
|     |  #[strong spaCy will know what to do]. | ||||
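| 
 ||||
| p | ||||
|     |  As an illustrative sketch – not a real model's meta – a | ||||
|     |  #[code meta.json] pipeline mixing the default #[code "tagger"] with a | ||||
|     |  custom factory ID simply lists both names, assuming | ||||
|     |  #[code "my_factory"] has been registered as shown above: | ||||
| 
 ||||
| +code("meta.json with a custom factory (sketch)"). | ||||
|     { | ||||
|         "name": "example_model", | ||||
|         "lang": "en", | ||||
|         "pipeline": ["tagger", "my_factory"] | ||||
|     } | ||||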
| 
 | ||||
| +infobox("Important note") | ||||
|     |  Because factories are #[strong resolved on initialisation] of the | ||||
|     |  #[code Language] class, it's #[strong not possible] to add them to the | ||||
|     |  pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only | ||||
|     |  works with individual component functions. To use factories, you need to | ||||
|     |  create a new #[code Language] object, or generate a | ||||
|     |  #[+a("/usage/training#models-generating") model package] with | ||||
|     |  a custom pipeline. | ||||
| 
 | ||||
| +h(3, "disabling") Disabling pipeline components | ||||
| +h(3, "disabling") Disabling and modifying pipeline components | ||||
| 
 | ||||
| p | ||||
|     |  If you don't need a particular component of the pipeline – for | ||||
|  | @ -217,16 +121,19 @@ p | |||
| +code. | ||||
|     nlp = spacy.load('en', disable=['parser', 'tagger']) | ||||
|     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) | ||||
|     doc = nlp(u"I don't want parsed", disable=['parser']) | ||||
| 
 | ||||
| p | ||||
|     |  Note that you can't write directly to #[code nlp.pipeline], as this list | ||||
|     |  holds the #[em actual components], not the IDs. However, if you know the | ||||
|     |  order of the components, you can still slice the list: | ||||
|     |  You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] | ||||
|     |  method to remove pipeline components from an existing pipeline, the | ||||
|     |  #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, | ||||
|     |  or the #[+api("language#replace_pipe") #[code replace_pipe]] method | ||||
|     |  to replace them with a custom component entirely (more details on this | ||||
|     |  in the section on #[+a("#custom-components") custom components]). | ||||
| 
 | ||||
| +code. | ||||
|     nlp = spacy.load('en') | ||||
|     nlp.pipeline = nlp.pipeline[:2] # only use the first two components | ||||
|     nlp.remove_pipe('parser') | ||||
|     nlp.rename_pipe('ner', 'entityrecognizer') | ||||
|     nlp.replace_pipe('tagger', my_custom_tagger) | ||||
| 
 | ||||
| +infobox("Important note: disabling pipeline components") | ||||
|     .o-block | ||||
|  | @ -234,12 +141,14 @@ p | |||
|         |  processing pipeline components, the #[code parser], #[code tagger] | ||||
|         |  and #[code entity] keyword arguments have been replaced with | ||||
|         |  #[code disable], which takes a list of pipeline component names. | ||||
|         |  This lets you disable both default and custom components when loading | ||||
|         |  This lets you disable pre-defined components when loading | ||||
|         |  a model, or initialising a Language class via | ||||
|         |  #[+api("language-from_disk") #[code from_disk]]. | ||||
| 
 | ||||
|     +code-new. | ||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) | ||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) | ||||
|         nlp = spacy.load('en', disable=['ner']) | ||||
|         nlp.remove_pipe('parser') | ||||
|         doc = nlp(u"I don't want parsed") | ||||
|     +code-old. | ||||
|         nlp = spacy.load('en', tagger=False, entity=False) | ||||
|         doc = nlp(u"I don't want parsed", parse=False) | ||||
|  |  | |||
|  | @ -21,7 +21,7 @@ p | |||
| 
 | ||||
| +code. | ||||
|     import spacy | ||||
|     from spacy.tokens import Span | ||||
|     from spacy.tokens.span import Span | ||||
| 
 | ||||
|     text = u'Netflix is hiring a new VP of global policy' | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,61 +0,0 @@ | |||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS | ||||
| 
 | ||||
| p | ||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||
|     |  #[code Span] or #[code Token] objects by adding a component to the | ||||
|     |  pipeline. For instance, to customize the | ||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||
|     |  component that sets a custom function to | ||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||
|     |  method will check the #[code user_hooks] dict, and delegate to your | ||||
|     |  function if you've set one. Similar results can be achieved by setting | ||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||
| 
 | ||||
| +code("Polymorphic similarity example"). | ||||
|     span.similarity(doc) | ||||
|     token.similarity(span) | ||||
|     doc1.similarity(doc2) | ||||
| 
 | ||||
| p | ||||
|     |  By default, this just averages the vectors for each document, and | ||||
|     |  computes their cosine. Obviously, spaCy should make it easy for you to | ||||
|     |  install your own similarity model. This introduces a tricky design | ||||
|     |  challenge. The current solution is to add three more dicts to the | ||||
|     |  #[code Doc] object: | ||||
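| 
 ||||
| p | ||||
|     |  Before looking at those dicts, here's a minimal sketch of roughly what | ||||
|     |  the default behaviour boils down to – it's not the actual | ||||
|     |  implementation, and it assumes both objects have vectors: | ||||
| 
 ||||
| +code("Roughly the default similarity (sketch)"). | ||||
|     import numpy | ||||
| 
 ||||
|     def default_similarity(obj1, obj2): | ||||
|         # .vector is the average of the token vectors | ||||
|         v1, v2 = obj1.vector, obj2.vector | ||||
|         return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2)) | ||||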
| 
 | ||||
| +aside("Implementation note") | ||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||
|     |  #[code Token] objects are created lazily, and don't own any data. They | ||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||
|     |  here — we only have to worry about installing hooks in one place. | ||||
| 
 | ||||
| +table(["Name", "Description"]) | ||||
|     +row | ||||
|         +cell #[code user_hooks] | ||||
|         +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_token_hooks] | ||||
|         +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_span_hooks] | ||||
|         +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] | ||||
| 
 | ||||
| p | ||||
|     |  To sum up, here's an example of hooking in custom #[code .similarity()] | ||||
|     |  methods: | ||||
| 
 | ||||
| +code("Add custom similarity hooks"). | ||||
|     class SimilarityModel(object): | ||||
|         def __init__(self, model): | ||||
|             self._model = model | ||||
| 
 | ||||
|         def __call__(self, doc): | ||||
|             doc.user_hooks['similarity'] = self.similarity | ||||
|             doc.user_span_hooks['similarity'] = self.similarity | ||||
|             doc.user_token_hooks['similarity'] = self.similarity | ||||
| 
 | ||||
|         def similarity(self, obj1, obj2): | ||||
|             y = self._model([obj1.vector, obj2.vector]) | ||||
|             return float(y[0]) | ||||
|  | @ -8,18 +8,18 @@ include _spacy-101/_pipelines | |||
|     +h(2, "pipelines") How pipelines work | ||||
|     include _processing-pipelines/_pipelines | ||||
| 
 | ||||
| +section("examples") | ||||
|     +h(2, "examples") Examples | ||||
|     include _processing-pipelines/_examples | ||||
| +section("custom-components") | ||||
|     +h(2, "custom-components") Creating custom pipeline components | ||||
|     include _processing-pipelines/_custom-components | ||||
| 
 | ||||
| +section("multithreading") | ||||
|     +h(2, "multithreading") Multi-threading | ||||
|     include _processing-pipelines/_multithreading | ||||
| 
 | ||||
| +section("user-hooks") | ||||
|     +h(2, "user-hooks") User hooks | ||||
|     include _processing-pipelines/_user-hooks | ||||
| 
 | ||||
| +section("serialization") | ||||
|     +h(2, "serialization") Serialization | ||||
|     include _processing-pipelines/_serialization | ||||
| 
 | ||||
| +section("extensions") | ||||
|     +h(2, "extensions") Developing spaCy extensions | ||||
|     include _processing-pipelines/_extensions | ||||
|  |  | |||