Component decorator and component analysis (#4517)
* Add work in progress
* Update analysis helpers and component decorator
* Fix porting of docstrings for Python 2
* Fix docstring stuff on Python 2
* Support meta factories when loading model
* Put auto pipeline analysis behind flag for now
* Analyse pipes on remove_pipe and replace_pipe
* Move analysis to root for now

  Try to find a better place for it, but it needs to go for now to avoid circular imports
* Simplify decorator

  Don't return a wrapped class and instead just write to the object
* Update existing components and factories
* Add condition in factory for classes vs. functions
* Add missing from_nlp classmethods
* Add "retokenizes" to printed overview
* Update assigns/requires declarations of builtins
* Only return data if no_print is enabled
* Use multiline table for overview
* Don't support Span
* Rewrite errors/warnings and move them to spacy.errors
This commit is contained in: parent 1180304449, commit a9c6104047
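To make the new API concrete, here is a minimal usage sketch of the `component` decorator and the flag-gated pipeline analysis, adapted from the tests added in this commit (spacy/tests/pipeline/test_analysis.py); the component names and attributes below are illustrative only, not part of the commit:

import spacy.language
from spacy.language import Language, component

# Pipeline analysis is opt-in for now (see ENABLE_PIPELINE_ANALYSIS in spacy/language.py)
spacy.language.ENABLE_PIPELINE_ANALYSIS = True

@component("set_tags", assigns=["token.tag"])
def set_tags(doc):
    # illustrative only: a real component would actually set token.tag
    return doc

@component("needs_tags", requires=["token.tag"])
def needs_tags(doc):
    return doc

nlp = Language()
nlp.add_pipe(set_tags)
nlp.add_pipe(needs_tags)  # no warning: "token.tag" is assigned by an earlier pipe
nlp.add_pipe(needs_tags, name="needs_tags2", first=True)  # warns: requirement not met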
@@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
thinc>=7.2.0,<7.3.0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0
wasabi>=0.3.0,<1.1.0
srsly>=0.1.0,<1.1.0
# Third party dependencies
numpy>=1.15.0
@@ -49,7 +49,7 @@ install_requires =
    blis>=0.4.0,<0.5.0
    plac>=0.9.6,<1.2.0
    requests>=2.13.0,<3.0.0
    wasabi>=0.2.0,<1.1.0
    wasabi>=0.3.0,<1.1.0
    srsly>=0.1.0,<1.1.0
    pathlib==1.0.1; python_version < "3.4"
    importlib_metadata>=0.20; python_version < "3.8"
@@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# These are imported as part of the API
from thinc.neural.util import prefer_gpu, require_gpu

from . import pipeline
from .cli.info import info as cli_info
from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings, deprecation_warning
from . import util
from .util import register_architecture, get_architecture
from .language import component


if sys.maxunicode == 65535:
spacy/analysis.py (new file, 176 lines)
@@ -0,0 +1,176 @@
# coding: utf8
from __future__ import unicode_literals

from collections import OrderedDict
from wasabi import Printer

from .tokens import Doc, Token
from .errors import Errors, Warnings, user_warning


def analyze_pipes(pipeline, name, pipe, index, warn=True):
    """Analyze a pipeline component with respect to its position in the current
    pipeline and the other components. Will check whether requirements are
    fulfilled (e.g. if previous components assign the attributes).

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    name (unicode): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (list): The problems found for the given pipeline component.
    """
    assert pipeline[index][0] == name
    prev_pipes = pipeline[:index]
    pipe_requires = getattr(pipe, "requires", [])
    requires = OrderedDict([(annot, False) for annot in pipe_requires])
    if requires:
        for prev_name, prev_pipe in prev_pipes:
            prev_assigns = getattr(prev_pipe, "assigns", [])
            for annot in prev_assigns:
                requires[annot] = True
    problems = []
    for annot, fulfilled in requires.items():
        if not fulfilled:
            problems.append(annot)
            if warn:
                user_warning(Warnings.W025.format(name=name, attr=annot))
    return problems


def analyze_all_pipes(pipeline, warn=True):
    """Analyze all pipes in the pipeline in order.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (dict): The problems found, keyed by component name.
    """
    problems = {}
    for i, (name, pipe) in enumerate(pipeline):
        problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
    return problems


def dot_to_dict(values):
    """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
    become {"token": {"pos": True, "_": {"xyz": True}}}.

    values (iterable): The values to convert.
    RETURNS (dict): The converted values.
    """
    result = {}
    for value in values:
        path = result
        parts = value.lower().split(".")
        for i, item in enumerate(parts):
            is_last = i == len(parts) - 1
            path = path.setdefault(item, True if is_last else {})
    return result
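As a quick illustration of what dot_to_dict produces (the input values here are made up, following the docstring above):

from spacy.analysis import dot_to_dict

dot_to_dict(["token.pos", "token._.xyz", "doc.sents"])
# -> {"token": {"pos": True, "_": {"xyz": True}}, "doc": {"sents": True}}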
def validate_attrs(values):
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
    RETURNS (iterable): The checked attributes.
    """
    data = dot_to_dict(values)
    objs = {"doc": Doc, "token": Token}
    for obj_key, attrs in data.items():
        if obj_key not in objs:  # first element is not doc/token
            if obj_key == "span":
                span_attrs = [attr for attr in values if attr.startswith("span.")]
                raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
            invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
            raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
        if not isinstance(attrs, dict):  # attr is something like "doc"
            raise ValueError(Errors.E182.format(attr=obj_key))
        for attr, value in attrs.items():
            if attr == "_":
                if value is True:  # attr is something like "doc._"
                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
                for ext_attr, ext_value in value.items():
                    # We don't check whether the attribute actually exists
                    if ext_value is not True:  # attr is something like doc._.x.y
                        good = "{}._.{}".format(obj_key, ext_attr)
                        bad = "{}.{}".format(good, ".".join(ext_value))
                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
                continue  # we can't validate those further
            if attr.endswith("_"):  # attr is something like "token.pos_"
                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
            if value is not True:  # attr is something like doc.x.y
                good = "{}.{}".format(obj_key, attr)
                bad = "{}.{}".format(good, ".".join(value))
                raise ValueError(Errors.E183.format(attr=bad, solution=good))
            obj = objs[obj_key]
            if not hasattr(obj, attr):
                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
    return values
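And a sketch of the error paths validate_attrs takes, matching the E180-E185 messages added to spacy/errors.py in this commit (the example attributes are assumed):

from spacy.analysis import validate_attrs

validate_attrs(["token.pos", "doc.ents", "token._.my_attr"])  # ok, returns the values
validate_attrs(["span.label"])   # ValueError (E180): Span attributes aren't supported
validate_attrs(["token.tag_"])   # ValueError (E184): use the non-underscore variant
validate_attrs(["doc.xyz"])      # ValueError (E185): "xyz" doesn't exist on Doc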
def _get_feature_for_attr(pipeline, attr, feature):
    assert feature in ["assigns", "requires"]
    result = []
    for pipe_name, pipe in pipeline:
        pipe_assigns = getattr(pipe, feature, [])
        if attr in pipe_assigns:
            result.append((pipe_name, pipe))
    return result


def get_assigns_for_attr(pipeline, attr):
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that assign the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "assigns")


def get_requires_for_attr(pipeline, attr):
    """Get all pipeline components that require an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that require the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "requires")


def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
    well as any problems if available.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems".
    """
    msg = Printer(pretty=pretty, no_print=no_print)
    overview = []
    problems = {}
    for i, (name, pipe) in enumerate(nlp.pipeline):
        requires = getattr(pipe, "requires", [])
        assigns = getattr(pipe, "assigns", [])
        retok = getattr(pipe, "retokenizes", False)
        overview.append((i, name, requires, assigns, retok))
        problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
    msg.divider("Pipeline Overview")
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    msg.table(overview, header=header, divider=True, multiline=True)
    n_problems = sum(len(p) for p in problems.values())
    if any(p for p in problems.values()):
        msg.divider("Problems ({})".format(n_problems))
        for name, problem in problems.items():
            if problem:
                problem = ", ".join(problem)
                msg.warn("'{}' requirements not met: {}".format(name, problem))
    else:
        msg.good("No problems found.")
    if no_print:
        return {"overview": overview, "problems": problems}
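Putting the helpers together, a rough sketch of how they might be called on an existing nlp object (the return shapes follow the docstrings above):

from spacy.analysis import analyze_all_pipes, print_summary

problems = analyze_all_pipes(nlp.pipeline)  # {"component_name": [unfulfilled attrs], ...}
data = print_summary(nlp, no_print=True)    # {"overview": [...], "problems": {...}}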
@@ -12,6 +12,7 @@ import os
import sys
import itertools
import ast
import types

from thinc.neural.util import copy_array

@@ -67,6 +68,7 @@ if is_python2:
    basestring_ = basestring  # noqa: F821
    input_ = raw_input  # noqa: F821
    path2str = lambda path: str(path).decode("utf8")
    class_types = (type, types.ClassType)

elif is_python3:
    bytes_ = bytes
@@ -74,6 +76,7 @@ elif is_python3:
    basestring_ = str
    input_ = input
    path2str = lambda path: str(path)
    class_types = (type, types.ClassType) if is_python_pre_3_5 else type


def b_to_str(b_str):
@@ -99,6 +99,8 @@ class Warnings(object):
            "'n_process' will be set to 1.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
            "the Knowledge Base.")
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
            "previous components in the pipeline declare that they assign it.")


@add_codes
@@ -511,6 +513,20 @@ class Errors(object):
    E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
            "Doc. If you only want to add one pattern, make sure to wrap it "
            "in a list. For example: matcher.add('{key}', [doc])")
    E180 = ("Span attributes can't be declared as required or assigned by "
            "components, since spans are only views of the Doc. Use Doc and "
            "Token attributes only and remove the following: {attrs}")
    E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
            "Only Doc and Token attributes are supported.")
    E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
            "to define the attribute? For example: {attr}.???")
    E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
            "attributes are supported, for example: {solution}")
    E184 = ("Only attributes without underscores are supported in component "
            "attribute declarations (because underscore and non-underscore "
            "attributes are connected anyway): {attr} -> {solution}")
    E185 = ("Received invalid attribute in component attribute declaration: "
            "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")


@add_codes
@@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
from .pipeline import DependencyParser, Tagger
from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler
from .pipeline import Morphologizer
from .compat import izip, basestring_, is_python2
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@@ -40,6 +35,9 @@ from . import util
from . import about


ENABLE_PIPELINE_ANALYSIS = False


class BaseDefaults(object):
    @classmethod
    def create_lemmatizer(cls, nlp=None, lookups=None):
@@ -135,19 +133,6 @@ class Language(object):

    factories = {
        "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
        "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
        "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
        "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
        "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
        "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
        "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
        "merge_entities": lambda nlp, **cfg: merge_entities,
        "merge_subtokens": lambda nlp, **cfg: merge_subtokens,
        "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
    }

    def __init__(
@@ -218,6 +203,7 @@ class Language(object):
                "name": self.vocab.vectors.name,
            }
        self._meta["pipeline"] = self.pipe_names
        self._meta["factories"] = self.pipe_factories
        self._meta["labels"] = self.pipe_labels
        return self._meta
@@ -259,6 +245,17 @@ class Language(object):
        """
        return [pipe_name for pipe_name, _ in self.pipeline]

    @property
    def pipe_factories(self):
        """Get the component factories for the available pipeline components.

        RETURNS (dict): Factory names, keyed by component names.
        """
        factories = {}
        for pipe_name, pipe in self.pipeline:
            factories[pipe_name] = getattr(pipe, "factory", pipe_name)
        return factories

    @property
    def pipe_labels(self):
        """Get the labels set by the pipeline components, if available (if
@@ -327,33 +324,30 @@ class Language(object):
            msg += Errors.E004.format(component=component)
            raise ValueError(msg)
        if name is None:
            if hasattr(component, "name"):
                name = component.name
            elif hasattr(component, "__name__"):
                name = component.__name__
            elif hasattr(component, "__class__") and hasattr(
                component.__class__, "__name__"
            ):
                name = component.__class__.__name__
            else:
                name = repr(component)
            name = util.get_component_name(component)
        if name in self.pipe_names:
            raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
        if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
            raise ValueError(Errors.E006)
        pipe_index = 0
        pipe = (name, component)
        if last or not any([first, before, after]):
            pipe_index = len(self.pipeline)
            self.pipeline.append(pipe)
        elif first:
            self.pipeline.insert(0, pipe)
        elif before and before in self.pipe_names:
            pipe_index = self.pipe_names.index(before)
            self.pipeline.insert(self.pipe_names.index(before), pipe)
        elif after and after in self.pipe_names:
            pipe_index = self.pipe_names.index(after) + 1
            self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
        else:
            raise ValueError(
                Errors.E001.format(name=before or after, opts=self.pipe_names)
            )
        if ENABLE_PIPELINE_ANALYSIS:
            analyze_pipes(self.pipeline, name, component, pipe_index)

    def has_pipe(self, name):
        """Check if a component name is present in the pipeline. Equivalent to
@@ -382,6 +376,8 @@ class Language(object):
            msg += Errors.E135.format(name=name)
            raise ValueError(msg)
        self.pipeline[self.pipe_names.index(name)] = (name, component)
        if ENABLE_PIPELINE_ANALYSIS:
            analyze_all_pipes(self.pipeline)

    def rename_pipe(self, old_name, new_name):
        """Rename a pipeline component.
@@ -408,6 +404,8 @@ class Language(object):
        """
        if name not in self.pipe_names:
            raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
        if ENABLE_PIPELINE_ANALYSIS:
            analyze_all_pipes(self.pipeline)
        return self.pipeline.pop(self.pipe_names.index(name))

    def __call__(self, text, disable=[], component_cfg=None):
@@ -1001,6 +999,52 @@ class Language(object):
        return self


class component(object):
    """Decorator for pipeline components. Can decorate both function components
    and class components and will automatically register components in the
    Language.factories. If the component is a class and needs access to the
    nlp object or config parameters, it can expose a from_nlp classmethod
    that takes the nlp object and **cfg arguments and returns the initialized
    component.
    """

    # NB: This decorator needs to live here, because it needs to write to
    # Language.factories. All other solutions would cause circular import.

    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
        """Decorate a pipeline component.

        name (unicode): Default component and factory name.
        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
        requires (list): Attributes required by component, e.g. `["token.dep"]`.
        retokenizes (bool): Whether the component changes the tokenization.
        """
        self.name = name
        self.assigns = validate_attrs(assigns)
        self.requires = validate_attrs(requires)
        self.retokenizes = retokenizes

    def __call__(self, *args, **kwargs):
        obj = args[0]
        args = args[1:]
        factory_name = self.name or util.get_component_name(obj)
        obj.name = factory_name
        obj.factory = factory_name
        obj.assigns = self.assigns
        obj.requires = self.requires
        obj.retokenizes = self.retokenizes

        def factory(nlp, **cfg):
            if hasattr(obj, "from_nlp"):
                return obj.from_nlp(nlp, **cfg)
            elif isinstance(obj, class_types):
                return obj()
            return obj

        Language.factories[obj.factory] = factory
        return obj


def _fix_pretrained_vectors_name(nlp):
    # TODO: Replace this once we handle vectors consistently as static
    # data
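To show the factory path above in use, a hedged sketch of a class component that exposes from_nlp, modeled on test_component_factories_from_nlp further down in this diff; the class name, extension attribute and config key here are invented:

from spacy.language import Language, component

@component("my_component", assigns=["doc._.processed"])
class MyComponent(object):
    def __init__(self, nlp, some_setting=False):
        self.some_setting = some_setting

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        # the auto-registered factory calls this with the nlp object and config
        return cls(nlp, **cfg)

    def __call__(self, doc):
        return doc

nlp = Language()
pipe = nlp.create_pipe("my_component", config={"some_setting": True})
nlp.add_pipe(pipe)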
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from collections import defaultdict, OrderedDict
import srsly

from ..language import component
from ..errors import Errors
from ..compat import basestring_
from ..util import ensure_path, to_disk, from_disk
@@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
DEFAULT_ENT_ID_SEP = "||"


@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
class EntityRuler(object):
    """The EntityRuler lets you add spans to the `Doc.ents` using token-based
    rules or exact phrase matches. It can be combined with the statistical
@@ -24,8 +26,6 @@ class EntityRuler(object):
    USAGE: https://spacy.io/usage/rule-based-matching#entityruler
    """

    name = "entity_ruler"

    def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
        """Initialize the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -69,6 +69,10 @@ class EntityRuler(object):
        if patterns is not None:
            self.add_patterns(patterns)

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        return cls(nlp, **cfg)

    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
@@ -1,9 +1,15 @@
# coding: utf8
from __future__ import unicode_literals

from ..language import component
from ..matcher import Matcher


@component(
    "merge_noun_chunks",
    requires=["token.dep", "token.tag", "token.pos"],
    retokenizes=True,
)
def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

@@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
    return doc


@component(
    "merge_entities",
    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
    retokenizes=True,
)
def merge_entities(doc):
    """Merge entities into a single token.

@@ -36,6 +47,7 @@ def merge_entities(doc):
    return doc


@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
def merge_subtokens(doc, label="subtok"):
    """Merge subtokens into a single token.
@@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity

from .pipes import Pipe
from ..language import component
from .._ml import link_vectors_to_models


@component("sentencizer_hook", assigns=["doc.user_hooks"])
class SentenceSegmenter(object):
    """A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse). To change the sentence
@@ -17,8 +19,6 @@ class SentenceSegmenter(object):
    and yield `Span` objects for each sentence.
    """

    name = "sentencizer"

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == "on_punct":
@@ -44,6 +44,7 @@ class SentenceSegmenter(object):
            yield doc[start : len(doc)]


@component("similarity", assigns=["doc.user_hooks"])
class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
@@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
    Where W is a vector of dimension weights, initialized to 1.
    """

    name = "similarity"

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
@@ -8,6 +8,7 @@ from thinc.api import chain
from thinc.neural.util import to_categorical, copy_array, get_array_module
from .. import util
from .pipes import Pipe
from ..language import component
from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
@@ -18,9 +19,9 @@ from ..vocab cimport Vocab
from ..morphology cimport Morphology


@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
    name = 'morphologizer'

    @classmethod
    def Model(cls, **cfg):
        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
@@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module

from .functions import merge_subtokens
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
@@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab

from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
@@ -54,6 +55,10 @@ class Pipe(object):
        """Initialize a model for the pipe."""
        raise NotImplementedError

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        return cls(nlp.vocab, **cfg)

    def __init__(self, vocab, model=True, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError
@@ -223,11 +228,10 @@ class Pipe(object):
        return self


@component("tensorizer", assigns=["doc.tensor"])
class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""

    name = "tensorizer"

    @classmethod
    def Model(cls, output_size=300, **cfg):
        """Create a new statistical model for the class.
@@ -362,14 +366,13 @@ class Tensorizer(Pipe):
        return sgd


@component("tagger", assigns=["token.tag", "token.pos"])
class Tagger(Pipe):
    """Pipeline component for part-of-speech tagging.

    DOCS: https://spacy.io/api/tagger
    """

    name = "tagger"

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
@@ -657,13 +660,12 @@ class Tagger(Pipe):
        return self


@component("nn_labeller")
class MultitaskObjective(Tagger):
    """Experimental: Assist training of a parser or tagger, by training a
    side-objective.
    """

    name = "nn_labeller"

    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
        self.vocab = vocab
        self.model = model
@@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
        losses[self.name] += loss


@component("textcat", assigns=["doc.cats"])
class TextCategorizer(Pipe):
    """Pipeline component for text classification.

    DOCS: https://spacy.io/api/textcategorizer
    """
    name = 'textcat'

    @classmethod
    def Model(cls, nr_class=1, **cfg):
@@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):

    DOCS: https://spacy.io/api/dependencyparser
    """

    # cdef classes can't have decorators, so we're defining this here
    name = "parser"
    factory = "parser"
    assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
    requires = []
    TransitionSystem = ArcEager

    @property
@@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):

    DOCS: https://spacy.io/api/entityrecognizer
    """

    name = "ner"
    factory = "ner"
    assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
    requires = []
    TransitionSystem = BiluoPushDown
    nr_feature = 6

@@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
        return tuple(sorted(labels))


@component(
    "entity_linker",
    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
    assigns=["token.ent_kb_id"]
)
class EntityLinker(Pipe):
    """Pipeline component for named entity linking.

    DOCS: https://spacy.io/api/entitylinker
    """
    name = 'entity_linker'
    NIL = "NIL"  # string used to refer to a non-existing link

    @classmethod
@@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
        raise NotImplementedError


@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
class Sentencizer(object):
    """Segment the Doc into sentences using a rule-based strategy.

    DOCS: https://spacy.io/api/sentencizer
    """

    name = "sentencizer"
    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
@@ -1437,6 +1448,10 @@ class Sentencizer(object):
        else:
            self.punct_chars = set(self.default_punct_chars)

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        return cls(**cfg)

    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

@@ -1503,4 +1518,9 @@ class Sentencizer(object):
        return self


# Cython classes can't be decorated, so we need to add the factories here
Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)


__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
@@ -128,6 +128,10 @@ cdef class Parser:
        self._multitasks = []
        self._rehearsal_model = None

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        return cls(nlp.vocab, **cfg)

    def __reduce__(self):
        return (Parser, (self.vocab, self.moves, self.model), None, None)
spacy/tests/pipeline/test_analysis.py (new file, 146 lines)
@@ -0,0 +1,146 @@
# coding: utf8
from __future__ import unicode_literals

import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.compat import is_python2
from mock import Mock, ANY
import pytest


def test_component_decorator_function():
    @component(name="test")
    def test_component(doc):
        """docstring"""
        return doc

    assert test_component.name == "test"
    if not is_python2:
        assert test_component.__doc__ == "docstring"
    assert test_component("foo") == "foo"


def test_component_decorator_class():
    @component(name="test")
    class TestComponent(object):
        """docstring1"""

        foo = "bar"

        def __call__(self, doc):
            """docstring2"""
            return doc

        def custom(self, x):
            """docstring3"""
            return x

    assert TestComponent.name == "test"
    assert TestComponent.foo == "bar"
    assert hasattr(TestComponent, "custom")
    test_component = TestComponent()
    assert test_component.foo == "bar"
    assert test_component("foo") == "foo"
    assert hasattr(test_component, "custom")
    assert test_component.custom("bar") == "bar"
    if not is_python2:
        assert TestComponent.__doc__ == "docstring1"
        assert TestComponent.__call__.__doc__ == "docstring2"
        assert TestComponent.custom.__doc__ == "docstring3"
        assert test_component.__doc__ == "docstring1"
        assert test_component.__call__.__doc__ == "docstring2"
        assert test_component.custom.__doc__ == "docstring3"
def test_component_decorator_assigns():
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True

    @component("c1", assigns=["token.tag", "doc.tensor"])
    def test_component1(doc):
        return doc

    @component(
        "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
    )
    def test_component2(doc):
        return doc

    @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
    def test_component3(doc):
        return doc

    assert "c1" in Language.factories
    assert "c2" in Language.factories
    assert "c3" in Language.factories

    nlp = Language()
    nlp.add_pipe(test_component1)
    with pytest.warns(UserWarning):
        nlp.add_pipe(test_component2)
    nlp.add_pipe(test_component3)
    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
    test_component4 = nlp.create_pipe("c1")
    assert test_component4.name == "c1"
    assert test_component4.factory == "c1"
    nlp.add_pipe(test_component4, name="c4")
    assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
    assert "c4" not in Language.factories
    assert nlp.pipe_factories["c1"] == "c1"
    assert nlp.pipe_factories["c4"] == "c1"
    assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
    assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
    requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
    assert [name for name, _ in requires_pos] == ["c2"]
    assert print_summary(nlp, no_print=True)
    assert nlp("hello world")
def test_component_factories_from_nlp():
    """Test that class components can implement a from_nlp classmethod that
    gives them access to the nlp object and config via the factory."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    mock = Mock()
    mock.return_value = TestComponent5()
    TestComponent5.from_nlp = classmethod(mock)
    TestComponent5 = component("c5")(TestComponent5)

    assert "c5" in Language.factories
    nlp = Language()
    pipe = nlp.create_pipe("c5", config={"foo": "bar"})
    nlp.add_pipe(pipe)
    assert nlp("hello world")
    # The first argument here is the class itself, so we're accepting any here
    mock.assert_called_once_with(ANY, nlp, foo="bar")


def test_analysis_validate_attrs_valid():
    attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
    assert validate_attrs(attrs)
    for attr in attrs:
        assert validate_attrs([attr])
    with pytest.raises(ValueError):
        validate_attrs(["doc.sents", "doc.xyz"])


@pytest.mark.parametrize(
    "attr",
    [
        "doc",
        "doc_ents",
        "doc.xyz",
        "token.xyz",
        "token.tag_",
        "token.tag.xyz",
        "token._.xyz.abc",
    ],
)
def test_analysis_validate_attrs_invalid(attr):
    with pytest.raises(ValueError):
        validate_attrs([attr])
@@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
    cls = get_lang_class(lang)
    nlp = cls(meta=meta, **overrides)
    pipeline = meta.get("pipeline", [])
    factories = meta.get("factories", {})
    disable = overrides.get("disable", [])
    if pipeline is True:
        pipeline = nlp.Defaults.pipe_names
@@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
    for name in pipeline:
        if name not in disable:
            config = meta.get("pipeline_args", {}).get(name, {})
            component = nlp.create_pipe(name, config=config)
            factory = factories.get(name, name)
            component = nlp.create_pipe(factory, config=config)
            nlp.add_pipe(component, name=name)
    return nlp.from_disk(model_path)


@@ -368,6 +370,16 @@ def is_in_jupyter():
    return False


def get_component_name(component):
    if hasattr(component, "name"):
        return component.name
    if hasattr(component, "__name__"):
        return component.__name__
    if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"):
        return component.__class__.__name__
    return repr(component)


def get_cuda_stream(require=False):
    if CudaStream is None:
        return None
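The factories lookup above pairs with the new meta entry written in spacy/language.py (self._meta["factories"] = self.pipe_factories), so a pipe saved under a custom name can be recreated from the right factory on load. A hypothetical excerpt of such a model meta, with invented names, might look like this:

meta = {
    "pipeline": ["tagger", "custom_name"],
    # new in this commit: map each pipe name back to the factory that created it
    "factories": {"tagger": "tagger", "custom_name": "my_component"},
}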