Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)

	Component decorator and component analysis (#4517)
* Add work in progress
* Update analysis helpers and component decorator
* Fix porting of docstrings for Python 2
* Fix docstring stuff on Python 2
* Support meta factories when loading model
* Put auto pipeline analysis behind flag for now
* Analyse pipes on remove_pipe and replace_pipe
* Move analysis to root for now. Try to find a better place for it, but it needs to go for now to avoid circular imports
* Simplify decorator. Don't return a wrapped class and instead just write to the object
* Update existing components and factories
* Add condition in factory for classes vs. functions
* Add missing from_nlp classmethods
* Add "retokenizes" to printed overview
* Update assigns/requires declarations of builtins
* Only return data if no_print is enabled
* Use multiline table for overview
* Don't support Span
* Rewrite errors/warnings and move them to spacy.errors
This commit is contained in:
parent 1180304449
commit a9c6104047
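For orientation, a minimal sketch of what the new decorator enables, based on the diff below (`my_component` is a hypothetical function component):

    from spacy.language import component

    @component("my_component", assigns=["token.pos"], retokenizes=False)
    def my_component(doc):
        # Hypothetical no-op component: the decorator writes name, factory,
        # assigns, requires and retokenizes onto the function object and
        # registers a factory in Language.factories.
        return doc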
requirements.txt
@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
 thinc>=7.2.0,<7.3.0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
-wasabi>=0.2.0,<1.1.0
+wasabi>=0.3.0,<1.1.0
 srsly>=0.1.0,<1.1.0
 # Third party dependencies
 numpy>=1.15.0
setup.cfg
@@ -49,7 +49,7 @@ install_requires =
     blis>=0.4.0,<0.5.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    wasabi>=0.2.0,<1.1.0
+    wasabi>=0.3.0,<1.1.0
     srsly>=0.1.0,<1.1.0
     pathlib==1.0.1; python_version < "3.4"
     importlib_metadata>=0.20; python_version < "3.8"
spacy/__init__.py
@@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 # These are imported as part of the API
 from thinc.neural.util import prefer_gpu, require_gpu
 
+from . import pipeline
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
 from .errors import Errors, Warnings, deprecation_warning
 from . import util
 from .util import register_architecture, get_architecture
+from .language import component
 
 
 if sys.maxunicode == 65535:
spacy/analysis.py (new file, 176 lines)
@@ -0,0 +1,176 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+from wasabi import Printer
+
+from .tokens import Doc, Token
+from .errors import Errors, Warnings, user_warning
+
+
+def analyze_pipes(pipeline, name, pipe, index, warn=True):
+    """Analyze a pipeline component with respect to its position in the current
+    pipeline and the other components. Will check whether requirements are
+    fulfilled (e.g. if previous components assign the attributes).
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    name (unicode): The name of the pipeline component to analyze.
+    pipe (callable): The pipeline component function to analyze.
+    index (int): The index of the component in the pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (list): The problems found for the given pipeline component.
+    """
+    assert pipeline[index][0] == name
+    prev_pipes = pipeline[:index]
+    pipe_requires = getattr(pipe, "requires", [])
+    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    if requires:
+        for prev_name, prev_pipe in prev_pipes:
+            prev_assigns = getattr(prev_pipe, "assigns", [])
+            for annot in prev_assigns:
+                requires[annot] = True
+    problems = []
+    for annot, fulfilled in requires.items():
+        if not fulfilled:
+            problems.append(annot)
+            if warn:
+                user_warning(Warnings.W025.format(name=name, attr=annot))
+    return problems
+
+
+def analyze_all_pipes(pipeline, warn=True):
+    """Analyze all pipes in the pipeline in order.
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    warn (bool): Show user warning if problem is found.
+    RETURNS (dict): The problems found, keyed by component name.
+    """
+    problems = {}
+    for i, (name, pipe) in enumerate(pipeline):
+        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+    return problems
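A rough usage sketch of the two helpers above (hypothetical components with manually attached declarations; the decorator normally does this for you):

    from spacy.analysis import analyze_all_pipes

    def tagger(doc):  # hypothetical
        return doc
    tagger.assigns = ["token.tag"]

    def lemmatizer(doc):  # hypothetical
        return doc
    lemmatizer.requires = ["token.pos"]

    pipeline = [("tagger", tagger), ("lemmatizer", lemmatizer)]
    problems = analyze_all_pipes(pipeline, warn=False)
    # problems == {"tagger": [], "lemmatizer": ["token.pos"]}
    # "lemmatizer" requires token.pos, but nothing before it assigns it.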
+
+
+def dot_to_dict(values):
+    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
+    become {"token": {"pos": True, "_": {"xyz": True }}}.
+
+    values (iterable): The values to convert.
+    RETURNS (dict): The converted values.
+    """
+    result = {}
+    for value in values:
+        path = result
+        parts = value.lower().split(".")
+        for i, item in enumerate(parts):
+            is_last = i == len(parts) - 1
+            path = path.setdefault(item, True if is_last else {})
+    return result
+
+
+def validate_attrs(values):
+    """Validate component attributes provided to "assigns", "requires" etc.
+    Raises error for invalid attributes and formatting. Doesn't check if
+    custom extension attributes are registered, since this is something the
+    user might want to do themselves later in the component.
+
+    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
+    RETURNS (iterable): The checked attributes.
+    """
+    data = dot_to_dict(values)
+    objs = {"doc": Doc, "token": Token}
+    for obj_key, attrs in data.items():
+        if obj_key not in objs:  # first element is not doc/token
+            if obj_key == "span":
+                span_attrs = [attr for attr in values if attr.startswith("span.")]
+                raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
+            invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
+            raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
+        if not isinstance(attrs, dict):  # attr is something like "doc"
+            raise ValueError(Errors.E182.format(attr=obj_key))
+        for attr, value in attrs.items():
+            if attr == "_":
+                if value is True:  # attr is something like "doc._"
+                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
+                for ext_attr, ext_value in value.items():
+                    # We don't check whether the attribute actually exists
+                    if ext_value is not True:  # attr is something like doc._.x.y
+                        good = "{}._.{}".format(obj_key, ext_attr)
+                        bad = "{}.{}".format(good, ".".join(ext_value))
+                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
+                continue  # we can't validate those further
+            if attr.endswith("_"):  # attr is something like "token.pos_"
+                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
+            if value is not True:  # attr is something like doc.x.y
+                good = "{}.{}".format(obj_key, attr)
+                bad = "{}.{}".format(good, ".".join(value))
+                raise ValueError(Errors.E183.format(attr=bad, solution=good))
+            obj = objs[obj_key]
+            if not hasattr(obj, attr):
+                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
+    return values
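To illustrate the checks, assuming I'm reading them correctly:

    from spacy.analysis import validate_attrs

    validate_attrs(["token.pos", "doc._.my_attr"])  # OK, returned unchanged
    validate_attrs(["span.label"])   # raises E180: Span attributes unsupported
    validate_attrs(["token.pos_"])   # raises E184: use "token.pos" instead
    validate_attrs(["doc"])          # raises E182: no attribute given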
+
+
+def _get_feature_for_attr(pipeline, attr, feature):
+    assert feature in ["assigns", "requires"]
+    result = []
+    for pipe_name, pipe in pipeline:
+        pipe_assigns = getattr(pipe, feature, [])
+        if attr in pipe_assigns:
+            result.append((pipe_name, pipe))
+    return result
+
+
+def get_assigns_for_attr(pipeline, attr):
+    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "assigns")
+
+
+def get_requires_for_attr(pipeline, attr):
+    """Get all pipeline components that require an attr, e.g. "doc.tensor".
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    attr (unicode): The attribute to check.
+    RETURNS (list): (name, pipeline) tuples of components that require the attr.
+    """
+    return _get_feature_for_attr(pipeline, attr, "requires")
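Continuing the hypothetical two-component pipeline from the earlier sketch, these lookup helpers invert the declarations:

    from spacy.analysis import get_assigns_for_attr, get_requires_for_attr

    get_assigns_for_attr(pipeline, "token.tag")   # [("tagger", tagger)]
    get_requires_for_attr(pipeline, "token.pos")  # [("lemmatizer", lemmatizer)]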
+
+
+def print_summary(nlp, pretty=True, no_print=False):
+    """Print a formatted summary for the current nlp object's pipeline. Shows
+    a table with the pipeline components and what they assign and require, as
+    well as any problems if available.
+
+    nlp (Language): The nlp object.
+    pretty (bool): Pretty-print the results (color etc).
+    no_print (bool): Don't print anything, just return the data.
+    RETURNS (dict): A dict with "overview" and "problems".
+    """
+    msg = Printer(pretty=pretty, no_print=no_print)
+    overview = []
+    problems = {}
+    for i, (name, pipe) in enumerate(nlp.pipeline):
+        requires = getattr(pipe, "requires", [])
+        assigns = getattr(pipe, "assigns", [])
+        retok = getattr(pipe, "retokenizes", False)
+        overview.append((i, name, requires, assigns, retok))
+        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+    msg.divider("Pipeline Overview")
+    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
+    msg.table(overview, header=header, divider=True, multiline=True)
+    n_problems = sum(len(p) for p in problems.values())
+    if any(p for p in problems.values()):
+        msg.divider("Problems ({})".format(n_problems))
+        for name, problem in problems.items():
+            if problem:
+                problem = ", ".join(problem)
+                msg.warn("'{}' requirements not met: {}".format(name, problem))
+    else:
+        msg.good("No problems found.")
+    if no_print:
+        return {"overview": overview, "problems": problems}
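A hedged usage sketch for the summary (assumes a pipeline with tagger, parser and ner, e.g. a pretrained model such as en_core_web_sm):

    import spacy
    from spacy.analysis import print_summary

    nlp = spacy.load("en_core_web_sm")
    print_summary(nlp)  # prints the "Pipeline Overview" table plus problems
    data = print_summary(nlp, no_print=True)  # dict with "overview"/"problems"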
spacy/compat.py
@@ -12,6 +12,7 @@ import os
 import sys
 import itertools
 import ast
+import types
 
 from thinc.neural.util import copy_array
 
@@ -67,6 +68,7 @@ if is_python2:
    basestring_ = basestring  # noqa: F821
    input_ = raw_input  # noqa: F821
    path2str = lambda path: str(path).decode("utf8")
+    class_types = (type, types.ClassType)
 
 elif is_python3:
     bytes_ = bytes
@@ -74,6 +76,7 @@ elif is_python3:
     basestring_ = str
     input_ = input
     path2str = lambda path: str(path)
+    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
 
 
 def b_to_str(b_str):
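As I read it, class_types exists so the decorator's factory (in language.py below) can tell decorated classes from decorated functions on both Python 2 and 3:

    from spacy.compat import class_types

    class MyComponent(object):  # hypothetical
        pass

    isinstance(MyComponent, class_types)      # True, also for Py2 old-style classes
    isinstance(lambda doc: doc, class_types)  # False: a plain function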
spacy/errors.py
@@ -99,6 +99,8 @@ class Warnings(object):
             "'n_process' will be set to 1.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
+    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
+            "previous components in the pipeline declare that they assign it.")
 
 
 @add_codes
@@ -511,6 +513,20 @@ class Errors(object):
     E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
             "Doc. If you only want to add one pattern, make sure to wrap it "
             "in a list. For example: matcher.add('{key}', [doc])")
+    E180 = ("Span attributes can't be declared as required or assigned by "
+            "components, since spans are only views of the Doc. Use Doc and "
+            "Token attributes only and remove the following: {attrs}")
+    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
+            "Only Doc and Token attributes are supported.")
+    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
+            "to define the attribute? For example: {attr}.???")
+    E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
+            "attributes are supported, for example: {solution}")
+    E184 = ("Only attributes without underscores are supported in component "
+            "attribute declarations (because underscore and non-underscore "
+            "attributes are connected anyway): {attr} -> {solution}")
+    E185 = ("Received invalid attribute in component attribute declaration: "
+            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
 
 
 @add_codes
spacy/language.py
@@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
-from .pipeline import DependencyParser, Tagger
-from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
-from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
-from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
-from .pipeline import EntityRuler
-from .pipeline import Morphologizer
-from .compat import izip, basestring_, is_python2
+from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .compat import izip, basestring_, is_python2, class_types
 from .gold import GoldParse
 from .scorer import Scorer
 from ._ml import link_vectors_to_models, create_default_optimizer
@@ -40,6 +35,9 @@ from . import util
 from . import about
 
 
+ENABLE_PIPELINE_ANALYSIS = False
+
+
 class BaseDefaults(object):
     @classmethod
     def create_lemmatizer(cls, nlp=None, lookups=None):
@@ -135,19 +133,6 @@ class Language(object):
 
     factories = {
         "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
-        "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
-        "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
-        "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
-        "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
-        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
-        "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
-        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
-        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
-        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
-        "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
-        "merge_entities": lambda nlp, **cfg: merge_entities,
-        "merge_subtokens": lambda nlp, **cfg: merge_subtokens,
-        "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
     }
 
     def __init__(
@@ -218,6 +203,7 @@ class Language(object):
             "name": self.vocab.vectors.name,
         }
         self._meta["pipeline"] = self.pipe_names
+        self._meta["factories"] = self.pipe_factories
         self._meta["labels"] = self.pipe_labels
         return self._meta
 
@@ -259,6 +245,17 @@ class Language(object):
         """
         return [pipe_name for pipe_name, _ in self.pipeline]
 
+    @property
+    def pipe_factories(self):
+        """Get the component factories for the available pipeline components.
+
+        RETURNS (dict): Factory names, keyed by component names.
+        """
+        factories = {}
+        for pipe_name, pipe in self.pipeline:
+            factories[pipe_name] = getattr(pipe, "factory", pipe_name)
+        return factories
+
     @property
     def pipe_labels(self):
         """Get the labels set by the pipeline components, if available (if
@@ -327,33 +324,30 @@ class Language(object):
                 msg += Errors.E004.format(component=component)
             raise ValueError(msg)
         if name is None:
-            if hasattr(component, "name"):
-                name = component.name
-            elif hasattr(component, "__name__"):
-                name = component.__name__
-            elif hasattr(component, "__class__") and hasattr(
-                component.__class__, "__name__"
-            ):
-                name = component.__class__.__name__
-            else:
-                name = repr(component)
+            name = util.get_component_name(component)
         if name in self.pipe_names:
             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
             raise ValueError(Errors.E006)
+        pipe_index = 0
         pipe = (name, component)
         if last or not any([first, before, after]):
+            pipe_index = len(self.pipeline)
             self.pipeline.append(pipe)
         elif first:
            self.pipeline.insert(0, pipe)
         elif before and before in self.pipe_names:
+            pipe_index = self.pipe_names.index(before)
             self.pipeline.insert(self.pipe_names.index(before), pipe)
         elif after and after in self.pipe_names:
+            pipe_index = self.pipe_names.index(after) + 1
             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
         else:
             raise ValueError(
                 Errors.E001.format(name=before or after, opts=self.pipe_names)
             )
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_pipes(self.pipeline, name, component, pipe_index)
 
     def has_pipe(self, name):
         """Check if a component name is present in the pipeline. Equivalent to
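Since the analysis is gated behind a module-level flag, a sketch of opting in (hypothetical component):

    import spacy
    from spacy import language
    from spacy.language import component

    language.ENABLE_PIPELINE_ANALYSIS = True  # off by default in this commit

    @component("needs_pos", requires=["token.pos"])
    def needs_pos(doc):  # hypothetical
        return doc

    nlp = spacy.blank("en")
    nlp.add_pipe(needs_pos)  # warns W025: nothing before it assigns token.pos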
@@ -382,6 +376,8 @@ class Language(object):
                 msg += Errors.E135.format(name=name)
             raise ValueError(msg)
         self.pipeline[self.pipe_names.index(name)] = (name, component)
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)
 
     def rename_pipe(self, old_name, new_name):
         """Rename a pipeline component.
@@ -408,6 +404,8 @@ class Language(object):
         """
         if name not in self.pipe_names:
             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
+        if ENABLE_PIPELINE_ANALYSIS:
+            analyze_all_pipes(self.pipeline)
         return self.pipeline.pop(self.pipe_names.index(name))
 
     def __call__(self, text, disable=[], component_cfg=None):
@@ -1001,6 +999,52 @@ class Language(object):
         return self
 
 
+class component(object):
+    """Decorator for pipeline components. Can decorate both function components
+    and class components and will automatically register components in the
+    Language.factories. If the component is a class and needs access to the
+    nlp object or config parameters, it can expose a from_nlp classmethod
+    that takes the nlp object and **cfg arguments and returns the initialized
+    component.
+    """
+
+    # NB: This decorator needs to live here, because it needs to write to
+    # Language.factories. All other solutions would cause circular import.
+
+    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
+        """Decorate a pipeline component.
+
+        name (unicode): Default component and factory name.
+        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
+        requires (list): Attributes required by component, e.g. `["token.dep"]`.
+        retokenizes (bool): Whether the component changes the tokenization.
+        """
+        self.name = name
+        self.assigns = validate_attrs(assigns)
+        self.requires = validate_attrs(requires)
+        self.retokenizes = retokenizes
+
+    def __call__(self, *args, **kwargs):
+        obj = args[0]
+        args = args[1:]
+        factory_name = self.name or util.get_component_name(obj)
+        obj.name = factory_name
+        obj.factory = factory_name
+        obj.assigns = self.assigns
+        obj.requires = self.requires
+        obj.retokenizes = self.retokenizes
+
+        def factory(nlp, **cfg):
+            if hasattr(obj, "from_nlp"):
+                return obj.from_nlp(nlp, **cfg)
+            elif isinstance(obj, class_types):
+                return obj()
+            return obj
+
+        Language.factories[obj.factory] = factory
+        return obj
+
+
 def _fix_pretrained_vectors_name(nlp):
     # TODO: Replace this once we handle vectors consistently as static
     # data
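To illustrate the class path through the decorator (hypothetical component; from_nlp is the hook described in the docstring above):

    from spacy.language import component

    @component("my_ruler", assigns=["doc.ents"])
    class MyRuler(object):  # hypothetical class component
        @classmethod
        def from_nlp(cls, nlp, **cfg):
            return cls(nlp, **cfg)

        def __init__(self, nlp, **cfg):
            self.cfg = cfg

        def __call__(self, doc):
            return doc

    # The decorator registered a factory under "my_ruler", so the
    # component can be created by name:
    # nlp.add_pipe(nlp.create_pipe("my_ruler"))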
spacy/pipeline/entityruler.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from collections import defaultdict, OrderedDict
 import srsly
 
+from ..language import component
 from ..errors import Errors
 from ..compat import basestring_
 from ..util import ensure_path, to_disk, from_disk
@@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
 DEFAULT_ENT_ID_SEP = "||"
 
 
+@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
 class EntityRuler(object):
     """The EntityRuler lets you add spans to the `Doc.ents` using token-based
     rules or exact phrase matches. It can be combined with the statistical
@@ -24,8 +26,6 @@ class EntityRuler(object):
     USAGE: https://spacy.io/usage/rule-based-matching#entityruler
     """
 
-    name = "entity_ruler"
-
     def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
         """Initialize the entity ruler. If patterns are supplied here, they
         need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -69,6 +69,10 @@ class EntityRuler(object):
         if patterns is not None:
             self.add_patterns(patterns)
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp, **cfg)
+
     def __len__(self):
         """The number of all patterns added to the entity ruler."""
         n_token_patterns = sum(len(p) for p in self.token_patterns.values())
spacy/pipeline/functions.py
@@ -1,9 +1,15 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+from ..language import component
 from ..matcher import Matcher
 
 
+@component(
+    "merge_noun_chunks",
+    requires=["token.dep", "token.tag", "token.pos"],
+    retokenizes=True,
+)
 def merge_noun_chunks(doc):
     """Merge noun chunks into a single token.
 
@@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
     return doc
 
 
+@component(
+    "merge_entities",
+    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    retokenizes=True,
+)
 def merge_entities(doc):
     """Merge entities into a single token.
 
@@ -36,6 +47,7 @@ def merge_entities(doc):
     return doc
 
 
+@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
 def merge_subtokens(doc, label="subtok"):
     """Merge subtokens into a single token.
 
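If I read the decorator right, the built-ins now carry their declarations as plain attributes:

    from spacy.pipeline import merge_entities

    merge_entities.name         # "merge_entities"
    merge_entities.requires     # ["doc.ents", "token.ent_iob", "token.ent_type"]
    merge_entities.retokenizes  # True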
spacy/pipeline/hooks.py
@@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .pipes import Pipe
+from ..language import component
 from .._ml import link_vectors_to_models
 
 
+@component("sentencizer_hook", assigns=["doc.user_hooks"])
 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
     (that doesn't require the dependency parse). To change the sentence
@@ -17,8 +19,6 @@ class SentenceSegmenter(object):
     and yield `Span` objects for each sentence.
     """
 
-    name = "sentencizer"
-
     def __init__(self, vocab, strategy=None):
         self.vocab = vocab
         if strategy is None or strategy == "on_punct":
@@ -44,6 +44,7 @@ class SentenceSegmenter(object):
             yield doc[start : len(doc)]
 
 
+@component("similarity", assigns=["doc.user_hooks"])
 class SimilarityHook(Pipe):
     """
     Experimental: A pipeline component to install a hook for supervised
@@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
     Where W is a vector of dimension weights, initialized to 1.
     """
 
-    name = "similarity"
-
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
spacy/pipeline/morphologizer.pyx
@@ -8,6 +8,7 @@ from thinc.api import chain
 from thinc.neural.util import to_categorical, copy_array, get_array_module
 from .. import util
 from .pipes import Pipe
+from ..language import component
 from .._ml import Tok2Vec, build_morphologizer_model
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import create_default_optimizer
@@ -18,8 +19,8 @@ from ..vocab cimport Vocab
 from ..morphology cimport Morphology
 
 
+@component("morphologizer", assigns=["token.morph", "token.pos"])
 class Morphologizer(Pipe):
-    name = 'morphologizer'
 
     @classmethod
     def Model(cls, **cfg):
| 
						 | 
					@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
 | 
				
			||||||
from thinc.neural.util import to_categorical
 | 
					from thinc.neural.util import to_categorical
 | 
				
			||||||
from thinc.neural.util import get_array_module
 | 
					from thinc.neural.util import get_array_module
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .functions import merge_subtokens
 | 
					 | 
				
			||||||
from ..tokens.doc cimport Doc
 | 
					from ..tokens.doc cimport Doc
 | 
				
			||||||
from ..syntax.nn_parser cimport Parser
 | 
					from ..syntax.nn_parser cimport Parser
 | 
				
			||||||
from ..syntax.ner cimport BiluoPushDown
 | 
					from ..syntax.ner cimport BiluoPushDown
 | 
				
			||||||
| 
						 | 
					@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
 | 
				
			||||||
from ..morphology cimport Morphology
 | 
					from ..morphology cimport Morphology
 | 
				
			||||||
from ..vocab cimport Vocab
 | 
					from ..vocab cimport Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from .functions import merge_subtokens
 | 
				
			||||||
 | 
					from ..language import Language, component
 | 
				
			||||||
from ..syntax import nonproj
 | 
					from ..syntax import nonproj
 | 
				
			||||||
from ..attrs import POS, ID
 | 
					from ..attrs import POS, ID
 | 
				
			||||||
from ..parts_of_speech import X
 | 
					from ..parts_of_speech import X
 | 
				
			||||||
| 
						 | 
					@ -54,6 +55,10 @@ class Pipe(object):
 | 
				
			||||||
        """Initialize a model for the pipe."""
 | 
					        """Initialize a model for the pipe."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def from_nlp(cls, nlp, **cfg):
 | 
				
			||||||
 | 
					        return cls(nlp.vocab, **cfg)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, vocab, model=True, **cfg):
 | 
					    def __init__(self, vocab, model=True, **cfg):
 | 
				
			||||||
        """Create a new pipe instance."""
 | 
					        """Create a new pipe instance."""
 | 
				
			||||||
        raise NotImplementedError
 | 
					        raise NotImplementedError
 | 
				
			||||||
| 
						 | 
					@ -223,11 +228,10 @@ class Pipe(object):
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("tensorizer", assigns=["doc.tensor"])
 | 
				
			||||||
class Tensorizer(Pipe):
 | 
					class Tensorizer(Pipe):
 | 
				
			||||||
    """Pre-train position-sensitive vectors for tokens."""
 | 
					    """Pre-train position-sensitive vectors for tokens."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "tensorizer"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def Model(cls, output_size=300, **cfg):
 | 
					    def Model(cls, output_size=300, **cfg):
 | 
				
			||||||
        """Create a new statistical model for the class.
 | 
					        """Create a new statistical model for the class.
 | 
				
			||||||
| 
						 | 
					@ -362,14 +366,13 @@ class Tensorizer(Pipe):
 | 
				
			||||||
        return sgd
 | 
					        return sgd
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("tagger", assigns=["token.tag", "token.pos"])
 | 
				
			||||||
class Tagger(Pipe):
 | 
					class Tagger(Pipe):
 | 
				
			||||||
    """Pipeline component for part-of-speech tagging.
 | 
					    """Pipeline component for part-of-speech tagging.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/tagger
 | 
					    DOCS: https://spacy.io/api/tagger
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "tagger"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, vocab, model=True, **cfg):
 | 
					    def __init__(self, vocab, model=True, **cfg):
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
        self.model = model
 | 
					        self.model = model
 | 
				
			||||||
| 
						 | 
					@ -657,13 +660,12 @@ class Tagger(Pipe):
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("nn_labeller")
 | 
				
			||||||
class MultitaskObjective(Tagger):
 | 
					class MultitaskObjective(Tagger):
 | 
				
			||||||
    """Experimental: Assist training of a parser or tagger, by training a
 | 
					    """Experimental: Assist training of a parser or tagger, by training a
 | 
				
			||||||
    side-objective.
 | 
					    side-objective.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name = "nn_labeller"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
 | 
					    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
        self.model = model
 | 
					        self.model = model
 | 
				
			||||||
| 
						 | 
					@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
 | 
				
			||||||
            losses[self.name] += loss
 | 
					            losses[self.name] += loss
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component("textcat", assigns=["doc.cats"])
 | 
				
			||||||
class TextCategorizer(Pipe):
 | 
					class TextCategorizer(Pipe):
 | 
				
			||||||
    """Pipeline component for text classification.
 | 
					    """Pipeline component for text classification.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/textcategorizer
 | 
					    DOCS: https://spacy.io/api/textcategorizer
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    name = 'textcat'
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def Model(cls, nr_class=1, **cfg):
 | 
					    def Model(cls, nr_class=1, **cfg):
 | 
				
			||||||
| 
						 | 
					@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/dependencyparser
 | 
					    DOCS: https://spacy.io/api/dependencyparser
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    # cdef classes can't have decorators, so we're defining this here
 | 
				
			||||||
    name = "parser"
 | 
					    name = "parser"
 | 
				
			||||||
 | 
					    factory = "parser"
 | 
				
			||||||
 | 
					    assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
 | 
				
			||||||
 | 
					    requires = []
 | 
				
			||||||
    TransitionSystem = ArcEager
 | 
					    TransitionSystem = ArcEager
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
| 
						 | 
					@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/entityrecognizer
 | 
					    DOCS: https://spacy.io/api/entityrecognizer
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					 | 
				
			||||||
    name = "ner"
 | 
					    name = "ner"
 | 
				
			||||||
 | 
					    factory = "ner"
 | 
				
			||||||
 | 
					    assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
 | 
				
			||||||
 | 
					    requires = []
 | 
				
			||||||
    TransitionSystem = BiluoPushDown
 | 
					    TransitionSystem = BiluoPushDown
 | 
				
			||||||
    nr_feature = 6
 | 
					    nr_feature = 6
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
 | 
				
			||||||
        return tuple(sorted(labels))
 | 
					        return tuple(sorted(labels))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@component(
 | 
				
			||||||
 | 
					    "entity_linker",
 | 
				
			||||||
 | 
					    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
 | 
				
			||||||
 | 
					    assigns=["token.ent_kb_id"]
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
class EntityLinker(Pipe):
 | 
					class EntityLinker(Pipe):
 | 
				
			||||||
    """Pipeline component for named entity linking.
 | 
					    """Pipeline component for named entity linking.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    DOCS: https://spacy.io/api/entitylinker
 | 
					    DOCS: https://spacy.io/api/entitylinker
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    name = 'entity_linker'
 | 
					 | 
				
			||||||
    NIL = "NIL"  # string used to refer to a non-existing link
 | 
					    NIL = "NIL"  # string used to refer to a non-existing link
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
| 
						 | 
@@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
         raise NotImplementedError
 
 
+@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
 
     DOCS: https://spacy.io/api/sentencizer
     """
 
-    name = "sentencizer"
     default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
             '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
             '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
@@ -1437,6 +1448,10 @@ class Sentencizer(object):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(**cfg)
+
     def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
 
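from_nlp is the classmethod the factories call to build a component with access to the nlp object and any config; the stateless Sentencizer just forwards the config to its constructor. Roughly, assuming nlp is an existing Language instance and that punct_chars is the constructor's keyword argument:

    # what a factory does under the hood:
    #     component = cls.from_nlp(nlp, **cfg)
    sentencizer = Sentencizer.from_nlp(nlp, punct_chars=["!", ".", "?"])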
@@ -1503,4 +1518,9 @@ class Sentencizer(object):
         return self
 
 
+# Cython classes can't be decorated, so we need to add the factories here
+Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
+Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
+
+
 __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
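With these entries registered, the Cython-implemented pipes resolve through the same factory lookup as decorated components:

    from spacy.language import Language

    nlp = Language()
    parser = nlp.create_pipe("parser")  # -> DependencyParser.from_nlp(nlp)
    nlp.add_pipe(parser)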
@@ -128,6 +128,10 @@ cdef class Parser:
         self._multitasks = []
         self._rehearsal_model = None
 
+    @classmethod
+    def from_nlp(cls, nlp, **cfg):
+        return cls(nlp.vocab, **cfg)
+
     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)
 
spacy/tests/pipeline/test_analysis.py (new file)
@@ -0,0 +1,146 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy.language
+from spacy.language import Language, component
+from spacy.analysis import print_summary, validate_attrs
+from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.compat import is_python2
+from mock import Mock, ANY
+import pytest
+
+
+def test_component_decorator_function():
+    @component(name="test")
+    def test_component(doc):
+        """docstring"""
+        return doc
+
+    assert test_component.name == "test"
+    if not is_python2:
+        assert test_component.__doc__ == "docstring"
+    assert test_component("foo") == "foo"
+
+
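This test passes because the decorator writes its metadata straight onto the object instead of returning a wrapper, so the docstring and call behaviour are untouched. A simplified sketch of that mechanism (not the actual implementation):

    def component(name=None, assigns=tuple(), requires=tuple()):
        def decorator(obj):
            # write metadata onto the function or class itself -- no wrapper
            obj.name = name or obj.__name__
            obj.factory = obj.name
            obj.assigns = list(assigns)
            obj.requires = list(requires)
            return obj
        return decorator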
+def test_component_decorator_class():
+    @component(name="test")
+    class TestComponent(object):
+        """docstring1"""
+
+        foo = "bar"
+
+        def __call__(self, doc):
+            """docstring2"""
+            return doc
+
+        def custom(self, x):
+            """docstring3"""
+            return x
+
+    assert TestComponent.name == "test"
+    assert TestComponent.foo == "bar"
+    assert hasattr(TestComponent, "custom")
+    test_component = TestComponent()
+    assert test_component.foo == "bar"
+    assert test_component("foo") == "foo"
+    assert hasattr(test_component, "custom")
+    assert test_component.custom("bar") == "bar"
+    if not is_python2:
+        assert TestComponent.__doc__ == "docstring1"
+        assert TestComponent.__call__.__doc__ == "docstring2"
+        assert TestComponent.custom.__doc__ == "docstring3"
+        assert test_component.__doc__ == "docstring1"
+        assert test_component.__call__.__doc__ == "docstring2"
+        assert test_component.custom.__doc__ == "docstring3"
+
+
+def test_component_decorator_assigns():
+    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
+
+    @component("c1", assigns=["token.tag", "doc.tensor"])
+    def test_component1(doc):
+        return doc
+
+    @component(
+        "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
+    )
+    def test_component2(doc):
+        return doc
+
+    @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
+    def test_component3(doc):
+        return doc
+
+    assert "c1" in Language.factories
+    assert "c2" in Language.factories
+    assert "c3" in Language.factories
+
+    nlp = Language()
+    nlp.add_pipe(test_component1)
+    with pytest.warns(UserWarning):
+        nlp.add_pipe(test_component2)
+    nlp.add_pipe(test_component3)
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
+    test_component4 = nlp.create_pipe("c1")
+    assert test_component4.name == "c1"
+    assert test_component4.factory == "c1"
+    nlp.add_pipe(test_component4, name="c4")
+    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
+    assert "c4" not in Language.factories
+    assert nlp.pipe_factories["c1"] == "c1"
+    assert nlp.pipe_factories["c4"] == "c1"
+    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+    assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
+    requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
+    assert [name for name, _ in requires_pos] == ["c2"]
+    assert print_summary(nlp, no_print=True)
+    assert nlp("hello world")
+
+
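As the assertions show, get_assigns_for_attr and get_requires_for_attr return (name, component) pairs in pipeline order:

    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    for name, pipe in assigns_tensor:
        print(name)  # "c1", "c2", "c4" -- every pipe that sets doc.tensor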
+def test_component_factories_from_nlp():
+    """Test that class components can implement a from_nlp classmethod that
+    gives them access to the nlp object and config via the factory."""
+
+    class TestComponent5(object):
+        def __call__(self, doc):
+            return doc
+
+    mock = Mock()
+    mock.return_value = TestComponent5()
+    TestComponent5.from_nlp = classmethod(mock)
+    TestComponent5 = component("c5")(TestComponent5)
+
+    assert "c5" in Language.factories
+    nlp = Language()
+    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
+    nlp.add_pipe(pipe)
+    assert nlp("hello world")
+    # The first argument here is the class itself, so we're accepting any here
+    mock.assert_called_once_with(ANY, nlp, foo="bar")
+
+
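To spell out the final assertion: because from_nlp is installed via classmethod(mock), the class itself arrives as the mock's first positional argument, which ANY absorbs:

    # the factory effectively runs:
    #     TestComponent5.from_nlp(nlp, foo="bar")
    # which the classmethod wrapper turns into:
    #     mock(TestComponent5, nlp, foo="bar")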
+def test_analysis_validate_attrs_valid():
+    attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
+    assert validate_attrs(attrs)
+    for attr in attrs:
+        assert validate_attrs([attr])
+    with pytest.raises(ValueError):
+        validate_attrs(["doc.sents", "doc.xyz"])
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "doc",
+        "doc_ents",
+        "doc.xyz",
+        "token.xyz",
+        "token.tag_",
+        "token.tag.xyz",
+        "token._.xyz.abc",
+    ],
+)
+def test_analysis_validate_attrs_invalid(attr):
+    with pytest.raises(ValueError):
+        validate_attrs([attr])
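Taken together, the two tests pin down the accepted attribute format: a known doc.* or token.* attribute, or a custom extension under ._., with no underscore-suffixed variants or nested paths. For instance:

    validate_attrs(["doc.ents", "token._.my_ext"])  # ok -- returns truthy
    validate_attrs(["token.tag_"])                  # raises ValueError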
@@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
     cls = get_lang_class(lang)
     nlp = cls(meta=meta, **overrides)
     pipeline = meta.get("pipeline", [])
+    factories = meta.get("factories", {})
     disable = overrides.get("disable", [])
     if pipeline is True:
         pipeline = nlp.Defaults.pipe_names
@@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
     for name in pipeline:
         if name not in disable:
             config = meta.get("pipeline_args", {}).get(name, {})
-            component = nlp.create_pipe(name, config=config)
+            factory = factories.get(name, name)
+            component = nlp.create_pipe(factory, config=config)
             nlp.add_pipe(component, name=name)
     return nlp.from_disk(model_path)
 
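This is the loading half of the meta factories: a model's meta.json can map a pipe's name to the factory used to create it, so renamed pipes still resolve. A hypothetical meta, shown as a Python dict:

    meta = {
        "lang": "en",
        "pipeline": ["my_ner"],           # name of the pipe in the model
        "factories": {"my_ner": "ner"},   # pipe name -> factory to build it
        "pipeline_args": {"my_ner": {}},  # per-pipe config
    }
    # per the hunk above, loading then does:
    #     factory = factories.get(name, name)   # falls back to the name
    #     component = nlp.create_pipe(factory, config=config)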
@@ -368,6 +370,16 @@ def is_in_jupyter():
     return False
 
 
+def get_component_name(component):
+    if hasattr(component, "name"):
+        return component.name
+    if hasattr(component, "__name__"):
+        return component.__name__
+    if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"):
+        return component.__class__.__name__
+    return repr(component)
+
+
 def get_cuda_stream(require=False):
     if CudaStream is None:
         return None
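get_component_name gives the analysis output a printable label for any pipe, trying the "name" attribute set by @component first, then __name__, then the class name, before falling back to repr:

    from spacy.util import get_component_name

    get_component_name(lambda doc: doc)  # "<lambda>" via __name__
    get_component_name(object())         # "object" via the class name
    get_component_name(42)               # "int" via the class name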