	Merge branch 'feature/pipeline-management' into feature/dot-underscore
Commit de374dc72a
				|  | @ -1,12 +1,9 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import absolute_import, unicode_literals | from __future__ import absolute_import, unicode_literals | ||||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||||
| import dill |  | ||||||
| 
 | 
 | ||||||
| import numpy |  | ||||||
| from thinc.neural import Model | from thinc.neural import Model | ||||||
| from thinc.neural.ops import NumpyOps, CupyOps | from thinc.neural.optimizers import Adam | ||||||
| from thinc.neural.optimizers import Adam, SGD |  | ||||||
| import random | import random | ||||||
| import ujson | import ujson | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
|  | @ -17,24 +14,20 @@ from .vocab import Vocab | ||||||
| from .tagger import Tagger | from .tagger import Tagger | ||||||
| from .lemmatizer import Lemmatizer | from .lemmatizer import Lemmatizer | ||||||
| from .syntax.parser import get_templates | from .syntax.parser import get_templates | ||||||
| from .syntax import nonproj |  | ||||||
| 
 | 
 | ||||||
| from .pipeline import NeuralDependencyParser, EntityRecognizer | from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger | ||||||
| from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer | from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer | ||||||
| from .pipeline import NeuralLabeller |  | ||||||
| from .pipeline import SimilarityHook |  | ||||||
| from .pipeline import TextCategorizer |  | ||||||
| from . import about |  | ||||||
| 
 | 
 | ||||||
| from .compat import json_dumps, izip | from .compat import json_dumps, izip | ||||||
|  | from .scorer import Scorer | ||||||
|  | from ._ml import link_vectors_to_models | ||||||
| from .attrs import IS_STOP | from .attrs import IS_STOP | ||||||
| from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES | ||||||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||||
| from .lang.tag_map import TAG_MAP | from .lang.tag_map import TAG_MAP | ||||||
| from .lang.lex_attrs import LEX_ATTRS | from .lang.lex_attrs import LEX_ATTRS | ||||||
| from . import util | from . import util | ||||||
| from .scorer import Scorer | from . import about | ||||||
| from ._ml import link_vectors_to_models |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BaseDefaults(object): | class BaseDefaults(object): | ||||||
|  | @ -70,59 +63,7 @@ class BaseDefaults(object): | ||||||
|                          prefix_search=prefix_search, suffix_search=suffix_search, |                          prefix_search=prefix_search, suffix_search=suffix_search, | ||||||
|                          infix_finditer=infix_finditer, token_match=token_match) |                          infix_finditer=infix_finditer, token_match=token_match) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||||
|     def create_tagger(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralTagger(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralTagger(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_parser(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralDependencyParser(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_entity(cls, nlp=None, **cfg): |  | ||||||
|         if nlp is None: |  | ||||||
|             return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) |  | ||||||
|         else: |  | ||||||
|             return NeuralEntityRecognizer(nlp.vocab, **cfg) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def create_pipeline(cls, nlp=None, disable=tuple()): |  | ||||||
|         meta = nlp.meta if nlp is not None else {} |  | ||||||
|         # Resolve strings, like "cnn", "lstm", etc |  | ||||||
|         pipeline = [] |  | ||||||
|         for entry in meta.get('pipeline', []): |  | ||||||
|             if entry in disable or getattr(entry, 'name', entry) in disable: |  | ||||||
|                 continue |  | ||||||
|             factory = cls.Defaults.factories[entry] |  | ||||||
|             pipeline.append(factory(nlp, **meta.get(entry, {}))) |  | ||||||
|         return pipeline |  | ||||||
| 
 |  | ||||||
|     factories = { |  | ||||||
|         'make_doc': create_tokenizer, |  | ||||||
|         'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], |  | ||||||
|         'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], |  | ||||||
|         'parser': lambda nlp, **cfg: [ |  | ||||||
|             NeuralDependencyParser(nlp.vocab, **cfg), |  | ||||||
|             nonproj.deprojectivize], |  | ||||||
|         'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], |  | ||||||
|         'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], |  | ||||||
|         'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], |  | ||||||
|         # Temporary compatibility -- delete after pivot |  | ||||||
|         'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], |  | ||||||
|         'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], |  | ||||||
|         'dependencies': lambda nlp, **cfg: [ |  | ||||||
|             NeuralDependencyParser(nlp.vocab, **cfg), |  | ||||||
|             nonproj.deprojectivize, |  | ||||||
|         ], |  | ||||||
|         'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     token_match = TOKEN_MATCH |     token_match = TOKEN_MATCH | ||||||
|     prefixes = tuple(TOKENIZER_PREFIXES) |     prefixes = tuple(TOKENIZER_PREFIXES) | ||||||
|     suffixes = tuple(TOKENIZER_SUFFIXES) |     suffixes = tuple(TOKENIZER_SUFFIXES) | ||||||
|  | @ -152,8 +93,17 @@ class Language(object): | ||||||
|     Defaults = BaseDefaults |     Defaults = BaseDefaults | ||||||
|     lang = None |     lang = None | ||||||
| 
 | 
 | ||||||
|     def __init__(self, vocab=True, make_doc=True, pipeline=None, |     factories = { | ||||||
|                  meta={}, disable=tuple(), **kwargs): |         'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), | ||||||
|  |         'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), | ||||||
|  |         'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), | ||||||
|  |         'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), | ||||||
|  |         'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), | ||||||
|  |         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), | ||||||
|  |         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) | ||||||
|  |     } | ||||||
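The factories mapping above is what `create_pipe` consults when building a component from a string name. A minimal usage sketch, assuming the default English subclass and no pretrained model data (whether a given component constructs cleanly without model data depends on its implementation):

    from spacy.lang.en import English

    nlp = English()                       # Language subclass; vocab created automatically
    tagger = nlp.create_pipe('tagger')    # looks up Language.factories['tagger']
    nlp.add_pipe(tagger)                  # appended last by default
    assert nlp.pipe_names == ['tagger']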
|  | 
 | ||||||
|  |     def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): | ||||||
|         """Initialise a Language object. |         """Initialise a Language object. | ||||||
| 
 | 
 | ||||||
|         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via |         vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via | ||||||
|  | @ -179,28 +129,7 @@ class Language(object): | ||||||
|             factory = self.Defaults.create_tokenizer |             factory = self.Defaults.create_tokenizer | ||||||
|             make_doc = factory(self, **meta.get('tokenizer', {})) |             make_doc = factory(self, **meta.get('tokenizer', {})) | ||||||
|         self.tokenizer = make_doc |         self.tokenizer = make_doc | ||||||
|         if pipeline is True: |         self.pipeline = [] | ||||||
|             self.pipeline = self.Defaults.create_pipeline(self, disable) |  | ||||||
|         elif pipeline: |  | ||||||
|             # Careful not to do getattr(p, 'name', None) here |  | ||||||
|             # If we had disable=[None], we'd disable everything! |  | ||||||
|             self.pipeline = [p for p in pipeline |  | ||||||
|                              if p not in disable |  | ||||||
|                              and getattr(p, 'name', p) not in disable] |  | ||||||
|             # Resolve strings, like "cnn", "lstm", etc |  | ||||||
|             for i, entry in enumerate(self.pipeline): |  | ||||||
|                 if entry in self.Defaults.factories: |  | ||||||
|                     factory = self.Defaults.factories[entry] |  | ||||||
|                     self.pipeline[i] = factory(self, **meta.get(entry, {})) |  | ||||||
|         else: |  | ||||||
|             self.pipeline = [] |  | ||||||
|         flat_list = [] |  | ||||||
|         for pipe in self.pipeline: |  | ||||||
|             if isinstance(pipe, list): |  | ||||||
|                 flat_list.extend(pipe) |  | ||||||
|             else: |  | ||||||
|                 flat_list.append(pipe) |  | ||||||
|         self.pipeline = flat_list |  | ||||||
|         self._optimizer = None |         self._optimizer = None | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  | @ -214,11 +143,7 @@ class Language(object): | ||||||
|         self._meta.setdefault('email', '') |         self._meta.setdefault('email', '') | ||||||
|         self._meta.setdefault('url', '') |         self._meta.setdefault('url', '') | ||||||
|         self._meta.setdefault('license', '') |         self._meta.setdefault('license', '') | ||||||
|         pipeline = [] |         self._meta['pipeline'] = self.pipe_names | ||||||
|         for component in self.pipeline: |  | ||||||
|             if hasattr(component, 'name'): |  | ||||||
|                 pipeline.append(component.name) |  | ||||||
|         self._meta['pipeline'] = pipeline |  | ||||||
|         return self._meta |         return self._meta | ||||||
| 
 | 
 | ||||||
|     @meta.setter |     @meta.setter | ||||||
|  | @ -228,34 +153,137 @@ class Language(object): | ||||||
|     # Conveniences to access pipeline components |     # Conveniences to access pipeline components | ||||||
|     @property |     @property | ||||||
|     def tensorizer(self): |     def tensorizer(self): | ||||||
|         return self.get_component('tensorizer') |         return self.get_pipe('tensorizer') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def tagger(self): |     def tagger(self): | ||||||
|         return self.get_component('tagger') |         return self.get_pipe('tagger') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def parser(self): |     def parser(self): | ||||||
|         return self.get_component('parser') |         return self.get_pipe('parser') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def entity(self): |     def entity(self): | ||||||
|         return self.get_component('ner') |         return self.get_pipe('ner') | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def matcher(self): |     def matcher(self): | ||||||
|         return self.get_component('matcher') |         return self.get_pipe('matcher') | ||||||
| 
 | 
 | ||||||
|     def get_component(self, name): |     @property | ||||||
|         if self.pipeline in (True, None): |     def pipe_names(self): | ||||||
|             return None |         """Get names of available pipeline components. | ||||||
|         for proc in self.pipeline: | 
 | ||||||
|             if hasattr(proc, 'name') and proc.name.endswith(name): |         RETURNS (list): List of component name strings, in order. | ||||||
|                 return proc |         """ | ||||||
|         return None |         return [pipe_name for pipe_name, _ in self.pipeline] | ||||||
|  | 
 | ||||||
|  |     def get_pipe(self, name): | ||||||
|  |         """Get a pipeline component for a given component name. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of pipeline component to get. | ||||||
|  |         RETURNS (callable): The pipeline component. | ||||||
|  |         """ | ||||||
|  |         for pipe_name, component in self.pipeline: | ||||||
|  |             if pipe_name == name: | ||||||
|  |                 return component | ||||||
|  |         msg = "No component '{}' found in pipeline. Available names: {}" | ||||||
|  |         raise KeyError(msg.format(name, self.pipe_names)) | ||||||
|  | 
 | ||||||
|  |     def create_pipe(self, name, config=dict()): | ||||||
|  |         """Create a pipeline component from a factory. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Factory name to look up in `Language.factories`. | ||||||
|  |         config (dict): Configuration parameters to initialise component. | ||||||
|  |         RETURNS (callable): Pipeline component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.factories: | ||||||
|  |             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||||
|  |         factory = self.factories[name] | ||||||
|  |         return factory(self, **config) | ||||||
|  | 
 | ||||||
|  |     def add_pipe(self, component, name=None, before=None, after=None, | ||||||
|  |                  first=None, last=None): | ||||||
|  |         """Add a component to the processing pipeline. Valid components are | ||||||
|  |         callables that take a `Doc` object, modify it and return it. Only one of | ||||||
|  |         before, after, first or last can be set. Default behaviour is "last". | ||||||
|  | 
 | ||||||
|  |         component (callable): The pipeline component. | ||||||
|  |         name (unicode): Name of pipeline component. Overwrites existing | ||||||
|  |             component.name attribute if available. If no name is set and | ||||||
|  |             the component exposes no name attribute, component.__name__ is | ||||||
|  |             used. An error is raised if the name already exists in the pipeline. | ||||||
|  |         before (unicode): Component name to insert component directly before. | ||||||
|  |         after (unicode): Component name to insert component directly after. | ||||||
|  |         first (bool): Insert component first / not first in the pipeline. | ||||||
|  |         last (bool): Insert component last / not last in the pipeline. | ||||||
|  | 
 | ||||||
|  |         EXAMPLE: | ||||||
|  |             >>> nlp.add_pipe(component, before='ner') | ||||||
|  |             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||||
|  |         """ | ||||||
|  |         if name is None: | ||||||
|  |             name = getattr(component, 'name', component.__name__) | ||||||
|  |         if name in self.pipe_names: | ||||||
|  |             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||||
|  |         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||||
|  |             msg = ("Invalid constraints. You can only set one of the " | ||||||
|  |                    "following: before, after, first, last.") | ||||||
|  |             raise ValueError(msg) | ||||||
|  |         pipe = (name, component) | ||||||
|  |         if last or not any([first, before, after]): | ||||||
|  |             self.pipeline.append(pipe) | ||||||
|  |         elif first: | ||||||
|  |             self.pipeline.insert(0, pipe) | ||||||
|  |         elif before and before in self.pipe_names: | ||||||
|  |             self.pipeline.insert(self.pipe_names.index(before), pipe) | ||||||
|  |         elif after and after in self.pipe_names: | ||||||
|  |             self.pipeline.insert(self.pipe_names.index(after), pipe) | ||||||
|  |         else: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             unfound = before or after | ||||||
|  |             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||||
|  | 
 | ||||||
|  |     def replace_pipe(self, name, component): | ||||||
|  |         """Replace a component in the pipeline. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of the component to replace. | ||||||
|  |         component (callable): Pipeline component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(name, self.pipe_names)) | ||||||
|  |         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||||
|  | 
 | ||||||
|  |     def rename_pipe(self, old_name, new_name): | ||||||
|  |         """Rename a pipeline component. | ||||||
|  | 
 | ||||||
|  |         old_name (unicode): Name of the component to rename. | ||||||
|  |         new_name (unicode): New name of the component. | ||||||
|  |         """ | ||||||
|  |         if old_name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||||
|  |         if new_name in self.pipe_names: | ||||||
|  |             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||||
|  |             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||||
|  |         i = self.pipe_names.index(old_name) | ||||||
|  |         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||||
|  | 
 | ||||||
|  |     def remove_pipe(self, name): | ||||||
|  |         """Remove a component from the pipeline. | ||||||
|  | 
 | ||||||
|  |         name (unicode): Name of the component to remove. | ||||||
|  |         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||||
|  |         """ | ||||||
|  |         if name not in self.pipe_names: | ||||||
|  |             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||||
|  |             raise ValueError(msg.format(name, self.pipe_names)) | ||||||
|  |         return self.pipeline.pop(self.pipe_names.index(name)) | ||||||
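Taken together, these methods treat the pipeline as an ordered list of (name, component) tuples with dict-like access by name. A short sketch with a trivial function component, mirroring the new tests added below (names here are illustrative):

    from spacy.language import Language

    def my_component(doc):
        # no-op component: takes a Doc and returns it unchanged
        return doc

    nlp = Language()
    nlp.add_pipe(my_component, name='noop', first=True)
    nlp.rename_pipe('noop', 'identity')
    nlp.replace_pipe('identity', my_component)
    name, component = nlp.remove_pipe('identity')
    assert (name, component) == ('identity', my_component)
    assert not nlp.pipe_names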
| 
 | 
 | ||||||
|     def __call__(self, text, disable=[]): |     def __call__(self, text, disable=[]): | ||||||
|         """'Apply the pipeline to some text. The text can span multiple sentences, |         """Apply the pipeline to some text. The text can span multiple sentences, | ||||||
|         and can contain arbitrary whitespace. Alignment into the original string |         and can contain arbitrary whitespace. Alignment into the original string | ||||||
|         is preserved. |         is preserved. | ||||||
| 
 | 
 | ||||||
|  | @ -269,8 +297,7 @@ class Language(object): | ||||||
|             ('An', 'NN') |             ('An', 'NN') | ||||||
|         """ |         """ | ||||||
|         doc = self.make_doc(text) |         doc = self.make_doc(text) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             name = getattr(proc, 'name', None) |  | ||||||
|             if name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             doc = proc(doc) |             doc = proc(doc) | ||||||
|  | @ -308,7 +335,7 @@ class Language(object): | ||||||
|             grads[key] = (W, dW) |             grads[key] = (W, dW) | ||||||
|         pipes = list(self.pipeline) |         pipes = list(self.pipeline) | ||||||
|         random.shuffle(pipes) |         random.shuffle(pipes) | ||||||
|         for proc in pipes: |         for name, proc in pipes: | ||||||
|             if not hasattr(proc, 'update'): |             if not hasattr(proc, 'update'): | ||||||
|                 continue |                 continue | ||||||
|             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) |             proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) | ||||||
|  | @ -322,7 +349,7 @@ class Language(object): | ||||||
|         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. |         docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. | ||||||
|         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. |         YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. | ||||||
|         """ |         """ | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if hasattr(proc, 'preprocess_gold'): |             if hasattr(proc, 'preprocess_gold'): | ||||||
|                 docs_golds = proc.preprocess_gold(docs_golds) |                 docs_golds = proc.preprocess_gold(docs_golds) | ||||||
|         for doc, gold in docs_golds: |         for doc, gold in docs_golds: | ||||||
|  | @ -354,7 +381,7 @@ class Language(object): | ||||||
| 
 | 
 | ||||||
|         get_gold_tuples (function): Function returning gold data |         get_gold_tuples (function): Function returning gold data | ||||||
|         **cfg: Config parameters. |         **cfg: Config parameters. | ||||||
|         returns: An optimizer |         RETURNS: An optimizer | ||||||
|         """ |         """ | ||||||
|         # Populate vocab |         # Populate vocab | ||||||
|         if get_gold_tuples is not None: |         if get_gold_tuples is not None: | ||||||
|  | @ -371,7 +398,7 @@ class Language(object): | ||||||
|         else: |         else: | ||||||
|             device = None |             device = None | ||||||
|         link_vectors_to_models(self.vocab) |         link_vectors_to_models(self.vocab) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if hasattr(proc, 'begin_training'): |             if hasattr(proc, 'begin_training'): | ||||||
|                 context = proc.begin_training(get_gold_tuples(), |                 context = proc.begin_training(get_gold_tuples(), | ||||||
|                                               pipeline=self.pipeline) |                                               pipeline=self.pipeline) | ||||||
|  | @ -393,7 +420,7 @@ class Language(object): | ||||||
|         docs, golds = zip(*docs_golds) |         docs, golds = zip(*docs_golds) | ||||||
|         docs = list(docs) |         docs = list(docs) | ||||||
|         golds = list(golds) |         golds = list(golds) | ||||||
|         for pipe in self.pipeline: |         for name, pipe in self.pipeline: | ||||||
|             if not hasattr(pipe, 'pipe'): |             if not hasattr(pipe, 'pipe'): | ||||||
|                 for doc in docs: |                 for doc in docs: | ||||||
|                     pipe(doc) |                     pipe(doc) | ||||||
|  | @ -419,7 +446,7 @@ class Language(object): | ||||||
|             >>> with nlp.use_params(optimizer.averages): |             >>> with nlp.use_params(optimizer.averages): | ||||||
|             >>>     nlp.to_disk('/tmp/checkpoint') |             >>>     nlp.to_disk('/tmp/checkpoint') | ||||||
|         """ |         """ | ||||||
|         contexts = [pipe.use_params(params) for pipe |         contexts = [pipe.use_params(params) for name, pipe | ||||||
|                     in self.pipeline if hasattr(pipe, 'use_params')] |                     in self.pipeline if hasattr(pipe, 'use_params')] | ||||||
|         # TODO: Having trouble with contextlib |         # TODO: Having trouble with contextlib | ||||||
|         # Workaround: these aren't actually context managers atm. |         # Workaround: these aren't actually context managers atm. | ||||||
|  | @ -466,8 +493,7 @@ class Language(object): | ||||||
|                 yield (doc, context) |                 yield (doc, context) | ||||||
|             return |             return | ||||||
|         docs = (self.make_doc(text) for text in texts) |         docs = (self.make_doc(text) for text in texts) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             name = getattr(proc, 'name', None) |  | ||||||
|             if name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if hasattr(proc, 'pipe'): |             if hasattr(proc, 'pipe'): | ||||||
|  | @ -495,14 +521,14 @@ class Language(object): | ||||||
|             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if not hasattr(proc, 'name'): |             if not hasattr(proc, 'name'): | ||||||
|                 continue |                 continue | ||||||
|             if proc.name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) |             serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) | ||||||
|         serializers['vocab'] = lambda p: self.vocab.to_disk(p) |         serializers['vocab'] = lambda p: self.vocab.to_disk(p) | ||||||
|         util.to_disk(path, serializers, {p: False for p in disable}) |         util.to_disk(path, serializers, {p: False for p in disable}) | ||||||
| 
 | 
 | ||||||
|  | @ -526,14 +552,12 @@ class Language(object): | ||||||
|             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), |             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), | ||||||
|             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) |             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) | ||||||
|         )) |         )) | ||||||
|         for proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if not hasattr(proc, 'name'): |             if name in disable: | ||||||
|                 continue |  | ||||||
|             if proc.name in disable: |  | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_disk'): |             if not hasattr(proc, 'to_disk'): | ||||||
|                 continue |                 continue | ||||||
|             deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) |             deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) | ||||||
|         exclude = {p: False for p in disable} |         exclude = {p: False for p in disable} | ||||||
|         if not (path / 'vocab').exists(): |         if not (path / 'vocab').exists(): | ||||||
|             exclude['vocab'] = True |             exclude['vocab'] = True | ||||||
|  | @ -552,8 +576,8 @@ class Language(object): | ||||||
|             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), |             ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), | ||||||
|             ('meta', lambda: ujson.dumps(self.meta)) |             ('meta', lambda: ujson.dumps(self.meta)) | ||||||
|         )) |         )) | ||||||
|         for i, proc in enumerate(self.pipeline): |         for i, (name, proc) in enumerate(self.pipeline): | ||||||
|             if getattr(proc, 'name', None) in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'to_bytes'): |             if not hasattr(proc, 'to_bytes'): | ||||||
|                 continue |                 continue | ||||||
|  | @ -572,8 +596,8 @@ class Language(object): | ||||||
|             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), |             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), | ||||||
|             ('meta', lambda b: self.meta.update(ujson.loads(b))) |             ('meta', lambda b: self.meta.update(ujson.loads(b))) | ||||||
|         )) |         )) | ||||||
|         for i, proc in enumerate(self.pipeline): |         for i, (name, proc) in enumerate(self.pipeline): | ||||||
|             if getattr(proc, 'name', None) in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|             if not hasattr(proc, 'from_bytes'): |             if not hasattr(proc, 'from_bytes'): | ||||||
|                 continue |                 continue | ||||||
|  |  | ||||||
|  | @ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .syntax.parser cimport Parser as LinearParser | from .syntax.parser cimport Parser as LinearParser | ||||||
| from .syntax.nn_parser cimport Parser as NeuralParser | from .syntax.nn_parser cimport Parser as NeuralParser | ||||||
|  | from .syntax import nonproj | ||||||
| from .syntax.parser import get_templates as get_feature_templates | from .syntax.parser import get_templates as get_feature_templates | ||||||
| from .syntax.beam_parser cimport BeamParser | from .syntax.beam_parser cimport BeamParser | ||||||
| from .syntax.ner cimport BiluoPushDown | from .syntax.ner cimport BiluoPushDown | ||||||
|  | @ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser): | ||||||
|         if isinstance(label, basestring): |         if isinstance(label, basestring): | ||||||
|             label = self.vocab.strings[label] |             label = self.vocab.strings[label] | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| cdef class NeuralDependencyParser(NeuralParser): | cdef class NeuralDependencyParser(NeuralParser): | ||||||
|     name = 'parser' |     name = 'parser' | ||||||
|     TransitionSystem = ArcEager |     TransitionSystem = ArcEager | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
|     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): |     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): | ||||||
|         for target in []: |         for target in []: | ||||||
|             labeller = NeuralLabeller(self.vocab, target=target) |             labeller = NeuralLabeller(self.vocab, target=target) | ||||||
|  | @ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser): | ||||||
|         if isinstance(label, basestring): |         if isinstance(label, basestring): | ||||||
|             label = self.vocab.strings[label] |             label = self.vocab.strings[label] | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         return [nonproj.deprojectivize] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', | ||||||
|            'BeamEntityRecognizer', 'TokenVectorEncoder'] |            'BeamEntityRecognizer', 'TokenVectorEncoder'] | ||||||
|  |  | ||||||
|  | @ -779,6 +779,14 @@ cdef class Parser: | ||||||
|             for i in range(doc.length): |             for i in range(doc.length): | ||||||
|                 doc.c[i] = state.c._sent[i] |                 doc.c[i] = state.c._sent[i] | ||||||
|             self.moves.finalize_doc(doc) |             self.moves.finalize_doc(doc) | ||||||
|  |             for hook in self.postprocesses: | ||||||
|  |                 for doc in docs: | ||||||
|  |                     hook(doc) | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def postprocesses(self): | ||||||
|  |         # Available for subclasses, e.g. to deprojectivize | ||||||
|  |         return [] | ||||||
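The base property returns an empty list; subclasses opt in by overriding it, as the dependency parsers in pipeline.pyx above do. A simplified, plain-Python sketch of the pattern (illustrative only; the real classes are cdef extension types):

    from spacy.syntax import nonproj
    from spacy.syntax.nn_parser import Parser

    class MyDeprojectivizingParser(Parser):
        @property
        def postprocesses(self):
            # hooks applied to each Doc after annotations are set
            return [nonproj.deprojectivize]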
| 
 | 
 | ||||||
|     def add_label(self, label): |     def add_label(self, label): | ||||||
|         for action in self.moves.action_types: |         for action in self.moves.action_types: | ||||||
|  |  | ||||||
|  | @ -58,8 +58,9 @@ def en_vocab(): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def en_parser(): | def en_parser(en_vocab): | ||||||
|     return util.get_lang_class('en').Defaults.create_parser() |     nlp = util.get_lang_class('en')(en_vocab) | ||||||
|  |     return nlp.create_pipe('parser') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  |  | ||||||
|  | @ -1,10 +1,11 @@ | ||||||
| import spacy | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
| import pytest | import pytest | ||||||
| 
 | 
 | ||||||
| @pytest.mark.models |  | ||||||
| def test_beam_parse(): |  | ||||||
|     nlp = spacy.load('en_core_web_sm') |  | ||||||
|     doc = nlp(u'Australia is a country', disable=['ner']) |  | ||||||
|     ents = nlp.entity(doc, beam_width=2) |  | ||||||
|     print(ents) |  | ||||||
| 
 | 
 | ||||||
|  | @pytest.mark.models('en') | ||||||
|  | def test_beam_parse(EN): | ||||||
|  |     doc = EN(u'Australia is a country', disable=['ner']) | ||||||
|  |     ents = EN.entity(doc, beam_width=2) | ||||||
|  |     print(ents) | ||||||
|  |  | ||||||
							
								
								
									
0	spacy/tests/pipeline/__init__.py	(new file)
84	spacy/tests/pipeline/test_pipe_methods.py	(new file)
								
							|  | @ -0,0 +1,84 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import pytest | ||||||
|  | 
 | ||||||
|  | from ...language import Language | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.fixture | ||||||
|  | def nlp(): | ||||||
|  |     return Language() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def new_pipe(doc): | ||||||
|  |     return doc | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_add_pipe_no_name(nlp): | ||||||
|  |     nlp.add_pipe(new_pipe) | ||||||
|  |     assert 'new_pipe' in nlp.pipe_names | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_add_pipe_duplicate_name(nlp): | ||||||
|  |     nlp.add_pipe(new_pipe, name='duplicate_name') | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.add_pipe(new_pipe, name='duplicate_name') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['parser']) | ||||||
|  | def test_add_pipe_first(nlp, name): | ||||||
|  |     nlp.add_pipe(new_pipe, name=name, first=True) | ||||||
|  |     assert nlp.pipeline[0][0] == name | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) | ||||||
|  | def test_add_pipe_last(nlp, name1, name2): | ||||||
|  |     nlp.add_pipe(lambda doc: doc, name=name2) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name1, last=True) | ||||||
|  |     assert nlp.pipeline[0][0] != name1 | ||||||
|  |     assert nlp.pipeline[-1][0] == name1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_cant_add_pipe_first_and_last(nlp): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.add_pipe(new_pipe, first=True, last=True) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['my_component']) | ||||||
|  | def test_get_pipe(nlp, name): | ||||||
|  |     with pytest.raises(KeyError): | ||||||
|  |         nlp.get_pipe(name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     assert nlp.get_pipe(name) == new_pipe | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) | ||||||
|  | def test_replace_pipe(nlp, name, replacement): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.replace_pipe(name, new_pipe) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     nlp.replace_pipe(name, replacement) | ||||||
|  |     assert nlp.get_pipe(name) != new_pipe | ||||||
|  |     assert nlp.get_pipe(name) == replacement | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) | ||||||
|  | def test_rename_pipe(nlp, old_name, new_name): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.rename_pipe(old_name, new_name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=old_name) | ||||||
|  |     nlp.rename_pipe(old_name, new_name) | ||||||
|  |     assert nlp.pipeline[0][0] == new_name | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.parametrize('name', ['my_component']) | ||||||
|  | def test_remove_pipe(nlp, name): | ||||||
|  |     with pytest.raises(ValueError): | ||||||
|  |         nlp.remove_pipe(name) | ||||||
|  |     nlp.add_pipe(new_pipe, name=name) | ||||||
|  |     assert len(nlp.pipeline) == 1 | ||||||
|  |     removed_name, removed_component = nlp.remove_pipe(name) | ||||||
|  |     assert not len(nlp.pipeline) | ||||||
|  |     assert removed_name == name | ||||||
|  |     assert removed_component == new_pipe | ||||||
|  | @ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides): | ||||||
|     if not meta: |     if not meta: | ||||||
|         meta = get_model_meta(model_path) |         meta = get_model_meta(model_path) | ||||||
|     cls = get_lang_class(meta['lang']) |     cls = get_lang_class(meta['lang']) | ||||||
|     nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) |     nlp = cls(meta=meta, **overrides) | ||||||
|  |     pipeline = meta.get('pipeline', []) | ||||||
|  |     disable = overrides.get('disable', []) | ||||||
|  |     if pipeline is True: | ||||||
|  |         pipeline = nlp.Defaults.pipe_names | ||||||
|  |     elif pipeline in (False, None): | ||||||
|  |         pipeline = [] | ||||||
|  |     for name in pipeline: | ||||||
|  |         if name not in disable: | ||||||
|  |             config = meta.get('pipeline_args', {}).get(name, {}) | ||||||
|  |             component = nlp.create_pipe(name, config=config) | ||||||
|  |             nlp.add_pipe(component, name=name) | ||||||
|     return nlp.from_disk(model_path) |     return nlp.from_disk(model_path) | ||||||
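For reference, the rewritten loader expects the component names (and optional per-component settings) to come from the model's meta.json. A hedged sketch of the relevant meta fields, written as a Python dict (keys follow the code above; the values are illustrative):

    meta = {
        'lang': 'en',
        'pipeline': ['tensorizer', 'tagger', 'parser', 'ner'],
        # optional: config passed to nlp.create_pipe(name, config=...) per component
        'pipeline_args': {'tagger': {}},
    }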
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) | ||||||
| 
 | 
 | ||||||
| //- Code blocks to display old/new versions | //- Code blocks to display old/new versions | ||||||
| 
 | 
 | ||||||
|  | mixin code-compare() | ||||||
|  |     span.u-inline-block.u-padding-top.u-width-full | ||||||
|  |         block | ||||||
|  | 
 | ||||||
| mixin code-old() | mixin code-old() | ||||||
|     +code(false, false, false, false, "reject").o-block-small |     +code(false, false, false, false, "reject").o-block-small | ||||||
|         block |         block | ||||||
|  |  | ||||||
|  | @ -43,6 +43,20 @@ p | ||||||
|         +cell #[code Language] |         +cell #[code Language] | ||||||
|         +cell A #[code Language] object with the loaded model. |         +cell A #[code Language] object with the loaded model. | ||||||
| 
 | 
 | ||||||
|  | p | ||||||
|  |     |  Essentially, #[code spacy.load()] is a convenience wrapper that reads | ||||||
|  |     |  the language ID and pipeline components from a model's #[code meta.json], | ||||||
|  |     |  initialises the #[code Language] class, loads in the model data and | ||||||
|  |     |  returns it. | ||||||
|  | 
 | ||||||
|  | +code("Abstract example"). | ||||||
|  |     cls = util.get_lang_class(lang)         #  get language for ID, e.g. 'en' | ||||||
|  |     nlp = cls()                             #  initialise the language | ||||||
|  |     for name in pipeline: | ||||||
|  |         component = nlp.create_pipe(name)   #  create each pipeline component | ||||||
|  |         nlp.add_pipe(component)             #  add component to pipeline | ||||||
|  |     nlp.from_disk(model_data_path)          #  load in model data | ||||||
|  | 
 | ||||||
| +infobox("Deprecation note", "⚠️") | +infobox("Deprecation note", "⚠️") | ||||||
|     .o-block |     .o-block | ||||||
|         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy |         |  As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy | ||||||
|  | @ -141,37 +155,3 @@ p | ||||||
|         +cell returns |         +cell returns | ||||||
|         +cell unicode |         +cell unicode | ||||||
|         +cell The explanation, or #[code None] if not found in the glossary. |         +cell The explanation, or #[code None] if not found in the glossary. | ||||||
| 
 |  | ||||||
| +h(3, "spacy.set_factory") spacy.set_factory |  | ||||||
|     +tag function |  | ||||||
|     +tag-new(2) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Set a factory that returns a custom |  | ||||||
|     |  #[+a("/usage/processing-pipelines") processing pipeline] |  | ||||||
|     |  component. Factories are useful for creating stateful components, especially ones which depend on shared data. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_factory(vocab): |  | ||||||
|         def my_component(doc): |  | ||||||
|             return doc |  | ||||||
|         return my_component |  | ||||||
| 
 |  | ||||||
|     spacy.set_factory('my_factory', my_factory) |  | ||||||
|     nlp = Language(pipeline=['my_factory']) |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code factory_id] |  | ||||||
|         +cell unicode |  | ||||||
|         +cell |  | ||||||
|             |  Unique name of factory. If added to a new pipeline, spaCy will |  | ||||||
|             |  look up the factory for this ID and use it to create the |  | ||||||
|             |  component. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code factory] |  | ||||||
|         +cell callable |  | ||||||
|         +cell |  | ||||||
|             |  Callable that takes a #[code Vocab] object and returns a pipeline |  | ||||||
|             |  component. |  | ||||||
|  |  | ||||||
|  | @ -4,7 +4,14 @@ include ../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Usually you'll load this once per process as #[code nlp] and pass the |     |  Usually you'll load this once per process as #[code nlp] and pass the | ||||||
|     |  instance around your application. |     |  instance around your application. The #[code Language] class is created | ||||||
|  |     |  when you call #[+api("spacy#load") #[code spacy.load()]] and contains | ||||||
|  |     |  the shared vocabulary and #[+a("/usage/adding-languages") language data], | ||||||
|  |     |  optional model data loaded from a #[+a("/models") model package] or | ||||||
|  |     |  a path, and a #[+a("/usage/processing-pipelines") processing pipeline] | ||||||
|  |     |  containing components like the tagger or parser that are called on a | ||||||
|  |     |  document in order. You can also add your own processing pipeline | ||||||
|  |     |  components that take a #[code Doc] object, modify it and return it. | ||||||
| 
 | 
 | ||||||
| +h(2, "init") Language.__init__ | +h(2, "init") Language.__init__ | ||||||
|     +tag method |     +tag method | ||||||
|  | @ -12,9 +19,9 @@ p | ||||||
| p Initialise a #[code Language] object. | p Initialise a #[code Language] object. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|  |     from spacy.vocab import Vocab | ||||||
|     from spacy.language import Language |     from spacy.language import Language | ||||||
|     nlp = Language(pipeline=['token_vectors', 'tags', |     nlp = Language(Vocab()) | ||||||
|                              'dependencies']) |  | ||||||
| 
 | 
 | ||||||
|     from spacy.lang.en import English |     from spacy.lang.en import English | ||||||
|     nlp = English() |     nlp = English() | ||||||
|  | @ -34,14 +41,6 @@ p Initialise a #[code Language] object. | ||||||
|             |  A function that takes text and returns a #[code Doc] object. |             |  A function that takes text and returns a #[code Doc] object. | ||||||
|             |  Usually a #[code Tokenizer]. |             |  Usually a #[code Tokenizer]. | ||||||
| 
 | 
 | ||||||
|     +row |  | ||||||
|         +cell #[code pipeline] |  | ||||||
|         +cell list |  | ||||||
|         +cell |  | ||||||
|             |  A list of annotation processes or IDs of annotation, processes, |  | ||||||
|             |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked |  | ||||||
|             |  up in #[code Language.Defaults.factories]. |  | ||||||
| 
 |  | ||||||
|     +row |     +row | ||||||
|         +cell #[code meta] |         +cell #[code meta] | ||||||
|         +cell dict |         +cell dict | ||||||
|  | @ -235,7 +234,6 @@ p | ||||||
|     |  Can be called before training to pre-process gold data. By default, it |     |  Can be called before training to pre-process gold data. By default, it | ||||||
|     |  handles nonprojectivity and adds missing tags to the tag map. |     |  handles nonprojectivity and adds missing tags to the tag map. | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|     +row |     +row | ||||||
|         +cell #[code docs_golds] |         +cell #[code docs_golds] | ||||||
|  | @ -247,6 +245,177 @@ p | ||||||
|         +cell tuple |         +cell tuple | ||||||
|         +cell Tuples of #[code Doc] and #[code GoldParse] objects. |         +cell Tuples of #[code Doc] and #[code GoldParse] objects. | ||||||
| 
 | 
 | ||||||
|  | +h(2, "create_pipe") Language.create_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Create a pipeline component from a factory. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     parser = nlp.create_pipe('parser') | ||||||
|  |     nlp.add_pipe(parser) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Factory name to look up in | ||||||
|  |             |  #[+api("language#class-attributes") #[code Language.factories]]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code config] | ||||||
|  |         +cell dict | ||||||
|  |         +cell Configuration parameters to initialise component. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  | +h(2, "add_pipe") Language.add_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Add a component to the processing pipeline. Valid components are | ||||||
|  |     |  callables that take a #[code Doc] object, modify it and return it. Only | ||||||
|  |     |  one of #[code before], #[code after], #[code first] or #[code last] can | ||||||
|  |     |  be set. Default behaviour is #[code last=True]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     def component(doc): | ||||||
|  |         # modify Doc and return it | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     nlp.add_pipe(component, before='ner') | ||||||
|  |     nlp.add_pipe(component, name='custom_name', last=True) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code component] | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell | ||||||
|  |             |  Name of pipeline component. Overwrites existing | ||||||
|  |             |  #[code component.name] attribute if available. If no #[code name] | ||||||
|  |             |  is set and the component exposes no name attribute, | ||||||
|  |             |  #[code component.__name__] is used. An error is raised if the | ||||||
|  |             |  name already exists in the pipeline. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code before] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Component name to insert component directly before. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code after] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Component name to insert component directly after. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code first] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Insert component first / not first in the pipeline. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code last] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Insert component last / not last in the pipeline. | ||||||
|  | 
 | ||||||
|  | +h(2, "get_pipe") Language.get_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Get a pipeline component for a given component name. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     parser = nlp.get_pipe('parser') | ||||||
|  |     custom_component = nlp.get_pipe('custom_component') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the pipeline component to get. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component. | ||||||
|  | 
 | ||||||
|  | +h(2, "replace_pipe") Language.replace_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p Replace a component in the pipeline. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp.replace_pipe('parser', my_custom_parser) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to replace. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code component] | ||||||
|  |         +cell callable | ||||||
|  |         +cell The pipeline component to insert. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(2, "rename_pipe") Language.rename_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Rename a component in the pipeline. Useful to create custom names for | ||||||
|  |     |  pre-defined and pre-loaded components. To change the default name of | ||||||
|  |     |  a component added to the pipeline, you can also use the #[code name] | ||||||
|  |     |  argument on #[+api("language#add_pipe") #[code add_pipe]]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     nlp.rename_pipe('parser', 'spacy_parser') | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code old_name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to rename. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code new_name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell New name of the component. | ||||||
|  | 
 | ||||||
|  | +h(2, "remove_pipe") Language.remove_pipe | ||||||
|  |     +tag method | ||||||
|  |     +tag-new(2) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Remove a component from the pipeline. Returns the removed component name | ||||||
|  |     |  and component function. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     name, component = nlp.remove_pipe('parser') | ||||||
|  |     assert name == 'parser' | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code name] | ||||||
|  |         +cell unicode | ||||||
|  |         +cell Name of the component to remove. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell tuple | ||||||
|  |         +cell A #[code (name, component)] tuple of the removed component. | ||||||
|  | 
 | ||||||
| +h(2, "to_disk") Language.to_disk | +h(2, "to_disk") Language.to_disk | ||||||
|     +tag method |     +tag method | ||||||
|     +tag-new(2) |     +tag-new(2) | ||||||
|  | @ -399,7 +568,15 @@ p Load state from a binary string. | ||||||
|     +row |     +row | ||||||
|         +cell #[code pipeline] |         +cell #[code pipeline] | ||||||
|         +cell list |         +cell list | ||||||
|         +cell Sequence of annotation functions. |         +cell | ||||||
|  |             |  List of #[code (name, component)] tuples describing the current | ||||||
|  |             |  processing pipeline, in order. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code pipe_names] | ||||||
|  |             +tag-new(2) | ||||||
|  |         +cell list | ||||||
|  |         +cell List of pipeline component names, in order. | ||||||
| 
 | 
 | ||||||
|     +row |     +row | ||||||
|         +cell #[code meta] |         +cell #[code meta] | ||||||
|  | @ -424,3 +601,12 @@ p Load state from a binary string. | ||||||
|         +cell |         +cell | ||||||
|             |  Two-letter language ID, i.e. |             |  Two-letter language ID, i.e. | ||||||
|             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. |             |  #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code factories] | ||||||
|  |             +tag-new(2) | ||||||
|  |         +cell dict | ||||||
|  |         +cell | ||||||
|  |             |  Factories that create pre-defined pipeline components, e.g. the | ||||||
|  |             |  tagger, parser or entity recognizer, keyed by their component | ||||||
|  |             |  name. | ||||||
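A quick sketch of how the pipeline, pipe_names and factories attributes relate (purely illustrative):

    from spacy.language import Language

    nlp = Language()
    nlp.add_pipe(lambda doc: doc, name='noop')
    assert nlp.pipe_names == ['noop']        # names only, in order
    assert nlp.pipeline[0][0] == 'noop'      # list of (name, component) tuples
    assert 'tagger' in Language.factories    # built-in component factories by name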
|  |  | ||||||
|  | @ -143,6 +143,9 @@ | ||||||
| 
 | 
 | ||||||
| //- Layout | //- Layout | ||||||
| 
 | 
 | ||||||
|  | .u-width-full | ||||||
|  |     width: 100% | ||||||
|  | 
 | ||||||
| .u-float-left | .u-float-left | ||||||
|     float: left |     float: left | ||||||
|     margin-right: 1rem |     margin-right: 1rem | ||||||
|  | @ -166,6 +169,9 @@ | ||||||
| .u-padding-medium | .u-padding-medium | ||||||
|     padding: 1.8rem |     padding: 1.8rem | ||||||
| 
 | 
 | ||||||
|  | .u-padding-top | ||||||
|  |     padding-top: 2rem | ||||||
|  | 
 | ||||||
| .u-inline-block | .u-inline-block | ||||||
|     display: inline-block |     display: inline-block | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -25,7 +25,7 @@ | ||||||
|         display: inline-block |         display: inline-block | ||||||
|         font-size: 0.6em |         font-size: 0.6em | ||||||
|         font-weight: bold |         font-weight: bold | ||||||
|         padding-right: 1.25rem |         padding-right: 1em | ||||||
|         margin-left: -3.75rem |         margin-left: -3.75rem | ||||||
|         text-align: right |         text-align: right | ||||||
|         width: 2.5rem |         width: 2.5rem | ||||||
|  |  | ||||||
|  | @ -103,11 +103,11 @@ | ||||||
|         "title": "Language Processing Pipelines", |         "title": "Language Processing Pipelines", | ||||||
|         "next": "vectors-similarity", |         "next": "vectors-similarity", | ||||||
|         "menu": { |         "menu": { | ||||||
|             "How pipelines work": "pipelines", |             "How Pipelines Work": "pipelines", | ||||||
|             "Examples": "examples", |             "Custom Components": "custom-components", | ||||||
|             "Multi-threading": "multithreading", |             "Multi-threading": "multithreading", | ||||||
|             "User Hooks": "user-hooks", |             "Serialization": "serialization", | ||||||
|             "Serialization": "serialization" |             "Developing Extensions": "extensions" | ||||||
|         } |         } | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
website/usage/_processing-pipelines/_custom-components.jade (new file, 151 lines)
|  | @ -0,0 +1,151 @@ | ||||||
|  | //- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  A component receives a #[code Doc] object and | ||||||
|  |     |  #[strong performs the actual processing] – for example, using the current | ||||||
|  |     |  weights to make a prediction and set some annotation on the document. By | ||||||
|  |     |  adding a component to the pipeline, you'll get access to the #[code Doc] | ||||||
|  |     |  at any point #[strong during] processing – instead of only being able to | ||||||
|  |     |  modify it afterwards. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     def my_component(doc): | ||||||
|  |         # do something to the doc here | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  | +table(["Argument", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code doc] | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell The #[code Doc] object processed by the previous component. | ||||||
|  | 
 | ||||||
|  |     +row("foot") | ||||||
|  |         +cell returns | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell The #[code Doc] object processed by this pipeline component. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Custom components can be added to the pipeline using the | ||||||
|  |     |  #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you | ||||||
|  |     |  can specify an existing component to add the new one before or after, | ||||||
|  |     |  tell spaCy to add it first or last in the pipeline, or define a custom name. | ||||||
|  |     |  If no name is set and no #[code name] attribute is present on your | ||||||
|  |     |  component, the function name, e.g. #[code component.__name__], is used. | ||||||
|  | 
 | ||||||
|  | +code("Adding pipeline components"). | ||||||
|  |     def my_component(doc): | ||||||
|  |         print("After tokenization, this doc has %s tokens." % len(doc)) | ||||||
|  |         if len(doc) < 10: | ||||||
|  |             print("This is a pretty short document.") | ||||||
|  |         return doc | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load('en') | ||||||
|  |     nlp.add_pipe(my_component, name='print_info', first=True) | ||||||
|  |     print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] | ||||||
|  |     doc = nlp(u"This is a sentence.") | ||||||
|  | 
 | ||||||
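|  | p | ||||||
|  |     |  To sketch the other placement options mentioned above – the component | ||||||
|  |     |  and argument names here are only examples – you can anchor the new | ||||||
|  |     |  component relative to an existing one, or give it a custom name: | ||||||
|  |  | ||||||
|  | +code. | ||||||
|  |     # sketch: each call shows one placement option for the same function | ||||||
|  |     nlp.add_pipe(my_component, name='before_ner', before='ner')      # before the entity recognizer | ||||||
|  |     nlp.add_pipe(my_component, name='after_tagger', after='tagger')  # after the tagger | ||||||
|  |     nlp.add_pipe(my_component, name='at_the_end', last=True)         # last, which is the default | ||||||
|  |  | ||||||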
|  | p | ||||||
|  |     |  Of course, you can also wrap your component as a class to allow | ||||||
|  |     |  initialising it with custom settings and holding state within the component. | ||||||
|  |     |  This is useful for #[strong stateful components], especially ones which | ||||||
|  |     |  #[strong depend on shared data]. | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     class MyComponent(object): | ||||||
|  |         name = 'print_info' | ||||||
|  | 
 | ||||||
|  |         def __init__(self, vocab, short_limit=10): | ||||||
|  |             self.vocab = vocab | ||||||
|  |             self.short_limit = short_limit | ||||||
|  | 
 | ||||||
|  |         def __call__(self, doc): | ||||||
|  |             if len(doc) < self.short_limit: | ||||||
|  |                 print("This is a pretty short document.") | ||||||
|  |             return doc | ||||||
|  | 
 | ||||||
|  |     my_component = MyComponent(nlp.vocab, short_limit=25) | ||||||
|  |     nlp.add_pipe(my_component, first=True) | ||||||
|  | 
 | ||||||
|  | +h(3, "custom-components-attributes") | ||||||
|  |     |  Setting attributes on the #[code Doc], #[code Span] and #[code Token] | ||||||
|  | 
 | ||||||
|  | +aside("Why ._?") | ||||||
|  |     |  Writing to a #[code ._] attribute instead of to the #[code Doc] directly | ||||||
|  |     |  keeps a clearer separation and makes it easier to ensure backwards | ||||||
|  |     |  compatibility. For example, if you've implemented your own #[code .coref] | ||||||
|  |     |  property and spaCy claims it one day, it'll break your code. Similarly, | ||||||
|  |     |  just by looking at the code, you'll immediately know what's built-in and | ||||||
|  |     |  what's custom – for example, #[code doc.sentiment] is spaCy, while | ||||||
|  |     |  #[code doc._.sent_score] isn't. | ||||||
|  | 
 | ||||||
|  | +under-construction | ||||||
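|  |  | ||||||
|  | p | ||||||
|  |     |  As a rough sketch of where this is heading – the exact API is still | ||||||
|  |     |  under construction, so treat the #[code set_extension] call below as an | ||||||
|  |     |  assumption rather than settled documentation – a custom attribute is | ||||||
|  |     |  registered once on the class and then read and written via the | ||||||
|  |     |  #[code ._] proxy: | ||||||
|  |  | ||||||
|  | +code. | ||||||
|  |     # sketch: assumes a set_extension() classmethod for registering custom | ||||||
|  |     # attributes with a default value – the attribute name is only an example | ||||||
|  |     from spacy.tokens.doc import Doc | ||||||
|  |     Doc.set_extension('sent_score', default=0.0)   # register the attribute once | ||||||
|  |     doc = nlp(u"This is a sentence.") | ||||||
|  |     doc._.sent_score = 0.9                         # write via the ._ proxy | ||||||
|  |     assert doc._.sent_score == 0.9                 # and read it back | ||||||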
|  | 
 | ||||||
|  | +h(3, "custom-components-user-hooks") Other user hooks | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  While it's generally recommended to use the #[code Doc._], #[code Span._] | ||||||
|  |     |  and #[code Token._] proxies to add your own custom attributes, spaCy | ||||||
|  |     |  offers a few exceptions to allow #[strong customising the built-in methods] | ||||||
|  |     |  like #[+api("doc#similarity") #[code Doc.similarity]] or | ||||||
|  |     |  #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can | ||||||
|  |     |  rely on statistical models you train yourself. For instance, you can | ||||||
|  |     |  provide your own on-the-fly sentence segmentation algorithm or document | ||||||
|  |     |  similarity method. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Hooks let you customize some of the behaviours of the #[code Doc], | ||||||
|  |     |  #[code Span] or #[code Token] objects by adding a component to the | ||||||
|  |     |  pipeline. For instance, to customize the | ||||||
|  |     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a | ||||||
|  |     |  component that sets a custom function to | ||||||
|  |     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] | ||||||
|  |     |  method will check the #[code user_hooks] dict, and delegate to your | ||||||
|  |     |  function if you've set one. Similar results can be achieved by setting | ||||||
|  |     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. | ||||||
|  | 
 | ||||||
|  | +aside("Implementation note") | ||||||
|  |     |  The hooks live on the #[code Doc] object because the #[code Span] and | ||||||
|  |     |  #[code Token] objects are created lazily, and don't own any data. They | ||||||
|  |     |  just proxy to their parent #[code Doc]. This turns out to be convenient | ||||||
|  |     |  here — we only have to worry about installing hooks in one place. | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Customises"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("doc#vector") #[code Doc.vector] | ||||||
|  |             +api("doc#has_vector") #[code Doc.has_vector] | ||||||
|  |             +api("doc#vector_norm") #[code Doc.vector_norm] | ||||||
|  |             +api("doc#sents") #[code Doc.sents] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_token_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("token#similarity") #[code Token.similarity] | ||||||
|  |             +api("token#vector") #[code Token.vector] | ||||||
|  |             +api("token#has_vector") #[code Token.has_vector] | ||||||
|  |             +api("token#vector_norm") #[code Token.vector_norm] | ||||||
|  |             +api("token#conjuncts") #[code Token.conjuncts] | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code user_span_hooks] | ||||||
|  |         +cell | ||||||
|  |             +api("span#similarity") #[code Span.similarity] | ||||||
|  |             +api("span#vector") #[code Span.vector] | ||||||
|  |             +api("span#has_vector") #[code Span.has_vector] | ||||||
|  |             +api("span#vector_norm") #[code Span.vector_norm] | ||||||
|  |             +api("span#root") #[code Span.root] | ||||||
|  | 
 | ||||||
|  | +code("Add custom similarity hooks"). | ||||||
|  |     class SimilarityModel(object): | ||||||
|  |         def __init__(self, model): | ||||||
|  |             self._model = model | ||||||
|  | 
 | ||||||
|  |         def __call__(self, doc): | ||||||
|  |             doc.user_hooks['similarity'] = self.similarity | ||||||
|  |             doc.user_span_hooks['similarity'] = self.similarity | ||||||
|  |             doc.user_token_hooks['similarity'] = self.similarity | ||||||
|  | 
 | ||||||
|  |         def similarity(self, obj1, obj2): | ||||||
|  |             y = self._model([obj1.vector, obj2.vector]) | ||||||
|  |             return float(y[0]) | ||||||
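|  |  | ||||||
|  | p | ||||||
|  |     |  As a usage sketch – #[code my_similarity_model] is a placeholder for | ||||||
|  |     |  whatever model you've trained – the hook component only needs to be | ||||||
|  |     |  added to the pipeline once, and the built-in #[code similarity] methods | ||||||
|  |     |  then delegate to it: | ||||||
|  |  | ||||||
|  | +code. | ||||||
|  |     # sketch: my_similarity_model stands in for your own trained model | ||||||
|  |     similarity = SimilarityModel(my_similarity_model) | ||||||
|  |     nlp.add_pipe(similarity, name='similarity_hook') | ||||||
|  |     doc1 = nlp(u"This is a sentence.") | ||||||
|  |     doc2 = nlp(u"This is another sentence.") | ||||||
|  |     print(doc1.similarity(doc2))   # delegates to the custom hook | ||||||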
							
								
								
									
website/usage/_processing-pipelines/_extensions.jade (new file, 3 lines)
|  | @ -0,0 +1,3 @@ | ||||||
|  | //- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS | ||||||
|  | 
 | ||||||
|  | +under-construction | ||||||
|  | @ -11,7 +11,7 @@ p | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  When you load a model, spaCy first consults the model's |     |  When you load a model, spaCy first consults the model's | ||||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. The |     |  #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | ||||||
|     |  meta typically includes the model details, the ID of a language class, |     |  meta typically includes the model details, the ID of a language class, | ||||||
|     |  and an optional list of pipeline components. spaCy then does the |     |  and an optional list of pipeline components. spaCy then does the | ||||||
|     |  following: |     |  following: | ||||||
|  | @ -21,24 +21,26 @@ p | ||||||
|         "name": "example_model", |         "name": "example_model", | ||||||
|         "lang": "en" |         "lang": "en" | ||||||
|         "description": "Example model for spaCy", |         "description": "Example model for spaCy", | ||||||
|         "pipeline": ["tensorizer", "tagger"] |         "pipeline": ["tagger", "parser"] | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| +list("numbers") | +list("numbers") | ||||||
|     +item |  | ||||||
|         |  Look up #[strong pipeline IDs] in the available |  | ||||||
|         |  #[strong pipeline factories]. |  | ||||||
|     +item |  | ||||||
|         |  Initialise the #[strong pipeline components] by calling their |  | ||||||
|         |  factories with the #[code Vocab] as an argument. This gives each |  | ||||||
|         |  factory and component access to the pipeline's shared data, like |  | ||||||
|         |  strings, morphology and annotation scheme. |  | ||||||
|     +item |     +item | ||||||
|         |  Load the #[strong language class and data] for the given ID via |         |  Load the #[strong language class and data] for the given ID via | ||||||
|         |  #[+api("util.get_lang_class") #[code get_lang_class]]. |         |  #[+api("util.get_lang_class") #[code get_lang_class]] and initialise | ||||||
|  |         |  it. The #[code Language] class contains the shared vocabulary, | ||||||
|  |         |  tokenization rules and the language-specific annotation scheme. | ||||||
|     +item |     +item | ||||||
|         |  Pass the path to the #[strong model data] to the #[code Language] |         |  Iterate over the #[strong pipeline names] and create each component | ||||||
|         |  class and return it. |         |  using #[+api("language#create_pipe") #[code create_pipe]], which | ||||||
|  |         |  looks them up in #[code Language.factories]. | ||||||
|  |     +item | ||||||
|  |         |  Add each pipeline component to the pipeline in order, using | ||||||
|  |         |  #[+api("language#add_pipe") #[code add_pipe]]. | ||||||
|  |     +item | ||||||
|  |         |  Make the #[strong model data] available to the #[code Language] class | ||||||
|  |         |  by calling #[+api("language#from_disk") #[code from_disk]] with the | ||||||
|  |         |  path to the model data directory. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  So when you call this... |     |  So when you call this... | ||||||
|  | @ -47,12 +49,12 @@ p | ||||||
|     nlp = spacy.load('en') |     nlp = spacy.load('en') | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     | ... the model tells spaCy to use the pipeline |     | ... the model tells spaCy to use the language #[code "en"] and the pipeline | ||||||
|     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will |     |  #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will | ||||||
|     |  then look up each string in its internal factories registry and |     |  then initialise #[code spacy.lang.en.English], and create each pipeline | ||||||
|     |  initialise the individual components. It'll then load |     |  component and add it to the processing pipeline. It'll then load in the | ||||||
|     |  #[code spacy.lang.en.English], pass it the path to the model's data |     |  model's data from its data directory and return the modified | ||||||
|     |  directory, and return it for you to use as the #[code nlp] object. |     |  #[code Language] class for you to use as the #[code nlp] object. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Fundamentally, a #[+a("/models") spaCy model] consists of three |     |  Fundamentally, a #[+a("/models") spaCy model] consists of three | ||||||
|  | @ -73,9 +75,12 @@ p | ||||||
|     pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] |     pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] | ||||||
|     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' |     data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' | ||||||
| 
 | 
 | ||||||
|     cls = spacy.util.get_lang_class(lang)  # 1. get Language instance, e.g. English() |     cls = spacy.util.get_lang_class(lang)   # 1. get the Language class, e.g. English | ||||||
|     nlp = cls(pipeline=pipeline)           # 2. initialise it with the pipeline |     nlp = cls()                             # 2. initialise it | ||||||
|     nlp.from_disk(model_data_path)         # 3. load in the binary data |     for name in pipeline: | ||||||
|  |         component = nlp.create_pipe(name)   # 3. create the pipeline components | ||||||
|  |         nlp.add_pipe(component)             # 4. add the component to the pipeline | ||||||
|  |     nlp.from_disk(data_path)                # 5. load in the binary data | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and |     |  When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and | ||||||
|  | @ -87,124 +92,23 @@ p | ||||||
|     |  document, which is then processed by the component next in the pipeline. |     |  document, which is then processed by the component next in the pipeline. | ||||||
| 
 | 
 | ||||||
| +code("The pipeline under the hood"). | +code("The pipeline under the hood"). | ||||||
|     doc = nlp.make_doc(u'This is a sentence') |     doc = nlp.make_doc(u'This is a sentence')   # create a Doc from raw text | ||||||
|     for proc in nlp.pipeline: |     for name, proc in nlp.pipeline:             # iterate over components in order | ||||||
|         doc = proc(doc) |         doc = proc(doc)                         # apply each component | ||||||
| 
 |  | ||||||
| +h(3, "creating") Creating pipeline components and factories |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy lets you customise the pipeline with your own components. Components |     |  The current processing pipeline is available as #[code nlp.pipeline], | ||||||
|     |  are functions that receive a #[code Doc] object, modify and return it. |     |  which returns a list of #[code (name, component)] tuples, or | ||||||
|     |  If your component is stateful, you'll want to create a new one for each |     |  #[code nlp.pipe_names], which only returns a list of human-readable | ||||||
|     |  pipeline. You can do that by defining and registering a factory which |     |  component names. | ||||||
|     |  receives the shared #[code Vocab] object and returns a component. |  | ||||||
| 
 |  | ||||||
| +h(4, "creating-component") Creating a  component |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A component receives a #[code Doc] object and |  | ||||||
|     |  #[strong performs the actual processing] – for example, using the current |  | ||||||
|     |  weights to make a prediction and set some annotation on the document. By |  | ||||||
|     |  adding a component to the pipeline, you'll get access to the #[code Doc] |  | ||||||
|     |  at any point #[strong during] processing – instead of only being able to |  | ||||||
|     |  modify it afterwards. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_component(doc): |  | ||||||
|         # do something to the doc here |  | ||||||
|         return doc |  | ||||||
| 
 |  | ||||||
| +table(["Argument", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code doc] |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The #[code Doc] object processed by the previous component. |  | ||||||
| 
 |  | ||||||
|     +row("foot") |  | ||||||
|         +cell returns |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The #[code Doc] object processed by this pipeline component. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  When creating a new #[code Language] class, you can pass it a list of |  | ||||||
|     |  pipeline component functions to execute in that order. You can also |  | ||||||
|     |  add it to an existing pipeline by modifying #[code nlp.pipeline] – just |  | ||||||
|     |  be careful not to overwrite a pipeline or its components by accident! |  | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     # Create a new Language object with a pipeline |     nlp.pipeline | ||||||
|     from spacy.language import Language |     # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] | ||||||
|     nlp = Language(pipeline=[my_component]) |     nlp.pipe_names | ||||||
|  |     # ['tagger', 'parser', 'ner'] | ||||||
| 
 | 
 | ||||||
|     # Modify an existing pipeline | +h(3, "disabling") Disabling and modifying pipeline components | ||||||
|     nlp = spacy.load('en') |  | ||||||
|     nlp.pipeline.append(my_component) |  | ||||||
| 
 |  | ||||||
| +h(4, "creating-factory") Creating a factory |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A factory is a #[strong function that returns a pipeline component]. |  | ||||||
|     |  It's called with the #[code Vocab] object, to give it access to the |  | ||||||
|     |  shared data between components – for example, the strings, morphology, |  | ||||||
|     |  vectors or annotation scheme. Factories are useful for creating |  | ||||||
|     |  #[strong stateful components], especially ones which |  | ||||||
|     |  #[strong depend on shared data]. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     def my_factory(vocab): |  | ||||||
|         # load some state |  | ||||||
|         def my_component(doc): |  | ||||||
|             # process the doc |  | ||||||
|             return doc |  | ||||||
|         return my_component |  | ||||||
| 
 |  | ||||||
| +table(["Argument", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code vocab] |  | ||||||
|         +cell #[code Vocab] |  | ||||||
|         +cell |  | ||||||
|             |  Shared data between components, including strings, morphology, |  | ||||||
|             |  vectors etc. |  | ||||||
| 
 |  | ||||||
|     +row("foot") |  | ||||||
|         +cell returns |  | ||||||
|         +cell callable |  | ||||||
|         +cell The pipeline component. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  By creating a factory, you're essentially telling spaCy how to get the |  | ||||||
|     |  pipeline component #[strong once the vocab is available]. Factories need to |  | ||||||
|     |  be registered via #[+api("spacy#set_factory") #[code set_factory()]] and |  | ||||||
|     |  by assigning them a unique ID. This ID can be added to the pipeline as a |  | ||||||
|     |  string. When creating a pipeline, you're free to mix strings and |  | ||||||
|     |  callable components: |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     spacy.set_factory('my_factory', my_factory) |  | ||||||
|     nlp = Language(pipeline=['my_factory', my_other_component]) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  If spaCy comes across a string in the pipeline, it will try to resolve it |  | ||||||
|     |  by looking it up in the available factories. The factory will then be |  | ||||||
|     |  initialised with the #[code Vocab]. Providing factory names instead of |  | ||||||
|     |  callables also makes it easy to specify them in the model's |  | ||||||
|     |  #[+a("/usage/saving-loading#models-generating") meta.json]. If you're |  | ||||||
|     |  training your own model and want to use one of spaCy's default components, |  | ||||||
|     |  you won't have to worry about finding and implementing it either – to use |  | ||||||
|     |  the default tagger, simply add #[code "tagger"] to the pipeline, and |  | ||||||
|     |  #[strong spaCy will know what to do]. |  | ||||||
| 
 |  | ||||||
| +infobox("Important note") |  | ||||||
|     |  Because factories are #[strong resolved on initialisation] of the |  | ||||||
|     |  #[code Language] class, it's #[strong not possible] to add them to the |  | ||||||
|     |  pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only |  | ||||||
|     |  works with individual component functions. To use factories, you need to |  | ||||||
|     |  create a new #[code Language] object, or generate a |  | ||||||
|     |  #[+a("/usage/training#models-generating") model package] with |  | ||||||
|     |  a custom pipeline. |  | ||||||
| 
 |  | ||||||
| +h(3, "disabling") Disabling pipeline components |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  If you don't need a particular component of the pipeline – for |     |  If you don't need a particular component of the pipeline – for | ||||||
|  | @ -217,16 +121,19 @@ p | ||||||
| +code. | +code. | ||||||
|     nlp = spacy.load('en', disable=['parser', 'tagger']) |     nlp = spacy.load('en', disable=['parser', 'tagger']) | ||||||
|     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) |     nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) | ||||||
|     doc = nlp(u"I don't want parsed", disable=['parser']) |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Note that you can't write directly to #[code nlp.pipeline], as this list |     |  You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] | ||||||
|     |  holds the #[em actual components], not the IDs. However, if you know the |     |  method to remove pipeline components from an existing pipeline, the | ||||||
|     |  order of the components, you can still slice the list: |     |  #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, | ||||||
|  |     |  or the #[+api("language#replace_pipe") #[code replace_pipe]] method | ||||||
|  |     |  to replace them with a custom component entirely (more details on this | ||||||
|  |     |  in the section on #[+a("#custom-components") custom components]. | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     nlp = spacy.load('en') |     nlp.remove_pipe('parser') | ||||||
|     nlp.pipeline = nlp.pipeline[:2] # only use the first two components |     nlp.rename_pipe('ner', 'entityrecognizer') | ||||||
|  |     nlp.replace_pipe('tagger', my_custom_tagger) | ||||||
| 
 | 
 | ||||||
| +infobox("Important note: disabling pipeline components") | +infobox("Important note: disabling pipeline components") | ||||||
|     .o-block |     .o-block | ||||||
|  | @ -234,12 +141,14 @@ p | ||||||
|         |  processing pipeline components, the #[code parser], #[code tagger] |         |  processing pipeline components, the #[code parser], #[code tagger] | ||||||
|         |  and #[code entity] keyword arguments have been replaced with |         |  and #[code entity] keyword arguments have been replaced with | ||||||
|         |  #[code disable], which takes a list of pipeline component names. |         |  #[code disable], which takes a list of pipeline component names. | ||||||
|         |  This lets you disable both default and custom components when loading |         |  This lets you disable pre-defined components when loading | ||||||
|         |  a model, or initialising a Language class via |         |  a model, or initialising a Language class via | ||||||
|         |  #[+api("language-from_disk") #[code from_disk]]. |         |  #[+api("language-from_disk") #[code from_disk]]. | ||||||
|  | 
 | ||||||
|     +code-new. |     +code-new. | ||||||
|         nlp = spacy.load('en', disable=['tagger', 'ner']) |         nlp = spacy.load('en', disable=['ner']) | ||||||
|         doc = nlp(u"I don't want parsed", disable=['parser']) |         nlp.remove_pipe('parser') | ||||||
|  |         doc = nlp(u"I don't want parsed") | ||||||
|     +code-old. |     +code-old. | ||||||
|         nlp = spacy.load('en', tagger=False, entity=False) |         nlp = spacy.load('en', tagger=False, entity=False) | ||||||
|         doc = nlp(u"I don't want parsed", parse=False) |         doc = nlp(u"I don't want parsed", parse=False) | ||||||
|  |  | ||||||
|  | @ -21,7 +21,7 @@ p | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     import spacy |     import spacy | ||||||
|     from spacy.tokens import Span |     from spacy.tokens.span import Span | ||||||
| 
 | 
 | ||||||
|     text = u'Netflix is hiring a new VP of global policy' |     text = u'Netflix is hiring a new VP of global policy' | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,61 +0,0 @@ | ||||||
| //- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Hooks let you customize some of the behaviours of the #[code Doc], |  | ||||||
|     |  #[code Span] or #[code Token] objects by adding a component to the |  | ||||||
|     |  pipeline. For instance, to customize the |  | ||||||
|     |  #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a |  | ||||||
|     |  component that sets a custom function to |  | ||||||
|     |  #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] |  | ||||||
|     |  method will check the #[code user_hooks] dict, and delegate to your |  | ||||||
|     |  function if you've set one. Similar results can be achieved by setting |  | ||||||
|     |  functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. |  | ||||||
| 
 |  | ||||||
| +code("Polymorphic similarity example"). |  | ||||||
|     span.similarity(doc) |  | ||||||
|     token.similarity(span) |  | ||||||
|     doc1.similarity(doc2) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  By default, this just averages the vectors for each document, and |  | ||||||
|     |  computes their cosine. Obviously, spaCy should make it easy for you to |  | ||||||
|     |  install your own similarity model. This introduces a tricky design |  | ||||||
|     |  challenge. The current solution is to add three more dicts to the |  | ||||||
|     |  #[code Doc] object: |  | ||||||
| 
 |  | ||||||
| +aside("Implementation note") |  | ||||||
|     |  The hooks live on the #[code Doc] object because the #[code Span] and |  | ||||||
|     |  #[code Token] objects are created lazily, and don't own any data. They |  | ||||||
|     |  just proxy to their parent #[code Doc]. This turns out to be convenient |  | ||||||
|     |  here — we only have to worry about installing hooks in one place. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_hooks] |  | ||||||
|         +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_token_hooks] |  | ||||||
|         +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code user_span_hooks] |  | ||||||
|         +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  To sum up, here's an example of hooking in custom #[code .similarity()] |  | ||||||
|     |  methods: |  | ||||||
| 
 |  | ||||||
| +code("Add custom similarity hooks"). |  | ||||||
|     class SimilarityModel(object): |  | ||||||
|         def __init__(self, model): |  | ||||||
|             self._model = model |  | ||||||
| 
 |  | ||||||
|         def __call__(self, doc): |  | ||||||
|             doc.user_hooks['similarity'] = self.similarity |  | ||||||
|             doc.user_span_hooks['similarity'] = self.similarity |  | ||||||
|             doc.user_token_hooks['similarity'] = self.similarity |  | ||||||
| 
 |  | ||||||
|         def similarity(self, obj1, obj2): |  | ||||||
|             y = self._model([obj1.vector, obj2.vector]) |  | ||||||
|             return float(y[0]) |  | ||||||
|  | @ -8,18 +8,18 @@ include _spacy-101/_pipelines | ||||||
|     +h(2, "pipelines") How pipelines work |     +h(2, "pipelines") How pipelines work | ||||||
|     include _processing-pipelines/_pipelines |     include _processing-pipelines/_pipelines | ||||||
| 
 | 
 | ||||||
| +section("examples") | +section("custom-components") | ||||||
|     +h(2, "examples") Examples |     +h(2, "custom-components") Creating custom pipeline components | ||||||
|     include _processing-pipelines/_examples |     include _processing-pipelines/_custom-components | ||||||
| 
 | 
 | ||||||
| +section("multithreading") | +section("multithreading") | ||||||
|     +h(2, "multithreading") Multi-threading |     +h(2, "multithreading") Multi-threading | ||||||
|     include _processing-pipelines/_multithreading |     include _processing-pipelines/_multithreading | ||||||
| 
 | 
 | ||||||
| +section("user-hooks") |  | ||||||
|     +h(2, "user-hooks") User hooks |  | ||||||
|     include _processing-pipelines/_user-hooks |  | ||||||
| 
 |  | ||||||
| +section("serialization") | +section("serialization") | ||||||
|     +h(2, "serialization") Serialization |     +h(2, "serialization") Serialization | ||||||
|     include _processing-pipelines/_serialization |     include _processing-pipelines/_serialization | ||||||
|  | 
 | ||||||
|  | +section("extensions") | ||||||
|  |     +h(2, "extensions") Developing spaCy extensions | ||||||
|  |     include _processing-pipelines/_extensions | ||||||
|  |  | ||||||