diff --git a/spacy/errors.py b/spacy/errors.py
index 3fe53d6db..124572b0b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -63,8 +63,6 @@ class Warnings:
"have the spacy-lookups-data package installed.")
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
"the Knowledge Base.")
- W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
- "previous components in the pipeline declare that they assign it.")
W026 = ("Unable to set all sentence boundaries from dependency parses.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
diff --git a/spacy/language.py b/spacy/language.py
index 594a4b148..d1b180cef 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,7 +18,7 @@ from timeit import default_timer as timer
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
-from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .gold import Example
from .scorer import Scorer
from .util import create_default_optimizer, registry
@@ -37,8 +37,6 @@ from . import util
from . import about
-# TODO: integrate pipeline analyis
-ENABLE_PIPELINE_ANALYSIS = False
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
@@ -522,6 +520,25 @@ class Language:
return add_component(func)
return add_component
+ def analyze_pipes(
+ self,
+ *,
+ keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
+ pretty: bool = False,
+ ) -> Optional[Dict[str, Any]]:
+        """Analyze the current pipeline components, check that all
+        requirements are met, and optionally print a summary of what they
+        assign and require.
+
+        keys (List[str]): The meta values to display in the table. Corresponds
+            to values in FactoryMeta, defined by the @Language.factory
+            decorator.
+        pretty (bool): Pretty-print the results.
+        RETURNS (dict): The analysis data, keyed by "summary", "problems" and
+            "attrs".
+ """
+ analysis = analyze_pipes(self, keys=keys)
+ if pretty:
+ print_pipe_analysis(analysis, keys=keys)
+ return analysis
+
def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
"""Get a pipeline component for a given component name.
@@ -666,8 +683,6 @@ class Language:
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self.pipeline.insert(pipe_index, (name, pipe_component))
- if ENABLE_PIPELINE_ANALYSIS:
- analyze_pipes(self, name, pipe_index)
return pipe_component
def _get_pipe_index(
@@ -758,8 +773,6 @@ class Language:
self.add_pipe(factory_name, name=name)
else:
self.add_pipe(factory_name, name=name, before=pipe_index)
- if ENABLE_PIPELINE_ANALYSIS:
- analyze_all_pipes(self)
def rename_pipe(self, old_name: str, new_name: str) -> None:
"""Rename a pipeline component.
@@ -793,8 +806,6 @@ class Language:
# because factory may be used for something else
self._pipe_meta.pop(name)
self._pipe_configs.pop(name)
- if ENABLE_PIPELINE_ANALYSIS:
- analyze_all_pipes(self)
return removed
def __call__(
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index b57f1524b..008ac3384 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -1,9 +1,8 @@
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
-from wasabi import Printer
-import warnings
+from wasabi import msg
from .tokens import Doc, Token, Span
-from .errors import Errors, Warnings
+from .errors import Errors
from .util import dot_to_dict
if TYPE_CHECKING:
@@ -11,48 +10,7 @@ if TYPE_CHECKING:
from .language import Language # noqa: F401
-def analyze_pipes(
- nlp: "Language", name: str, index: int, warn: bool = True
-) -> List[str]:
- """Analyze a pipeline component with respect to its position in the current
- pipeline and the other components. Will check whether requirements are
- fulfilled (e.g. if previous components assign the attributes).
-
- nlp (Language): The current nlp object.
- name (str): The name of the pipeline component to analyze.
- index (int): The index of the component in the pipeline.
- warn (bool): Show user warning if problem is found.
- RETURNS (List[str]): The problems found for the given pipeline component.
- """
- assert nlp.pipeline[index][0] == name
- prev_pipes = nlp.pipeline[:index]
- meta = nlp.get_pipe_meta(name)
- requires = {annot: False for annot in meta.requires}
- if requires:
- for prev_name, prev_pipe in prev_pipes:
- prev_meta = nlp.get_pipe_meta(prev_name)
- for annot in prev_meta.assigns:
- requires[annot] = True
- problems = []
- for annot, fulfilled in requires.items():
- if not fulfilled:
- problems.append(annot)
- if warn:
- warnings.warn(Warnings.W025.format(name=name, attr=annot))
- return problems
-
-
-def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
- """Analyze all pipes in the pipeline in order.
-
- nlp (Language): The current nlp object.
- warn (bool): Show user warning if problem is found.
- RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
- """
- problems = {}
- for i, name in enumerate(nlp.pipe_names):
- problems[name] = analyze_pipes(nlp, name, i, warn=warn)
- return problems
+DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
@@ -101,89 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]:
return values
-def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
- assert feature in ["assigns", "requires"]
- result = []
+def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
+ """Check which components in the pipeline assign or require an attribute.
+
+ nlp (Language): The current nlp object.
+ attr (str): The attribute, e.g. "doc.tensor".
+ RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
+ mapped to a list of component names.
+ """
+ result = {"assigns": [], "requires": []}
for pipe_name in nlp.pipe_names:
meta = nlp.get_pipe_meta(pipe_name)
- pipe_assigns = getattr(meta, feature, [])
- if attr in pipe_assigns:
- result.append(pipe_name)
+ if attr in meta.assigns:
+ result["assigns"].append(pipe_name)
+ if attr in meta.requires:
+ result["requires"].append(pipe_name)
return result
-def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
- """Get all pipeline components that assign an attr, e.g. "doc.tensor".
-
- pipeline (Language): The current nlp object.
- attr (str): The attribute to check.
- RETURNS (List[str]): Names of components that require the attr.
- """
- return _get_feature_for_attr(nlp, attr, "assigns")
-
-
-def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
- """Get all pipeline components that require an attr, e.g. "doc.tensor".
-
- pipeline (Language): The current nlp object.
- attr (str): The attribute to check.
- RETURNS (List[str]): Names of components that require the attr.
- """
- return _get_feature_for_attr(nlp, attr, "requires")
-
-
-def print_summary(
- nlp: "Language", pretty: bool = True, no_print: bool = False
-) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
+def analyze_pipes(
+ nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
+) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
"""Print a formatted summary for the current nlp object's pipeline. Shows
a table with the pipeline components and why they assign and require, as
well as any problems if available.
nlp (Language): The nlp object.
- pretty (bool): Pretty-print the results (color etc).
- no_print (bool): Don't print anything, just return the data.
- RETURNS (dict): A dict with "overview" and "problems".
+ keys (List[str]): The meta keys to show in the table.
+    RETURNS (dict): A dict with "summary", "problems" and "attrs".
"""
- msg = Printer(pretty=pretty, no_print=no_print)
- overview = []
- problems = {}
+ result = {"summary": {}, "problems": {}}
+ all_attrs = set()
for i, name in enumerate(nlp.pipe_names):
meta = nlp.get_pipe_meta(name)
- overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
- problems[name] = analyze_pipes(nlp, name, i, warn=False)
+ all_attrs.update(meta.assigns)
+ all_attrs.update(meta.requires)
+ result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
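+        # Check whether each required attr is assigned by an earlier component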
+ prev_pipes = nlp.pipeline[:i]
+ requires = {annot: False for annot in meta.requires}
+ if requires:
+ for prev_name, prev_pipe in prev_pipes:
+ prev_meta = nlp.get_pipe_meta(prev_name)
+ for annot in prev_meta.assigns:
+ requires[annot] = True
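+        # Attrs still marked False are required but never assigned upstream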
+ result["problems"][name] = []
+ for annot, fulfilled in requires.items():
+ if not fulfilled:
+ result["problems"][name].append(annot)
+ result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
+ return result
+
+
+def print_pipe_analysis(
+ analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
+ *,
+ keys: List[str] = DEFAULT_KEYS,
+) -> None:
+ """Print a formatted version of the pipe analysis produced by analyze_pipes.
+
+ analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
+ keys (List[str]): The meta keys to show in the table.
+ """
msg.divider("Pipeline Overview")
- header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
- msg.table(overview, header=header, divider=True, multiline=True)
- n_problems = sum(len(p) for p in problems.values())
- if any(p for p in problems.values()):
+ header = ["#", "Component", *[key.capitalize() for key in keys]]
+ summary = analysis["summary"].items()
+ body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
+ msg.table(body, header=header, divider=True, multiline=True)
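+    # Report any unmet requirements found by the analysis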
+ n_problems = sum(len(p) for p in analysis["problems"].values())
+ if any(p for p in analysis["problems"].values()):
msg.divider(f"Problems ({n_problems})")
- for name, problem in problems.items():
+ for name, problem in analysis["problems"].items():
if problem:
msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
else:
msg.good("No problems found.")
- if no_print:
- return {"overview": overview, "problems": problems}
-
-
-def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
- """Count how many subsequent components require an annotation set by each
- component in the pipeline.
-
- nlp (Language): The current nlp object.
- RETURNS (List[int]): The interdependency counts.
- """
- pipe_assigns = []
- pipe_requires = []
- for name in nlp.pipe_names:
- meta = nlp.get_pipe_meta(name)
- pipe_assigns.append(set(meta.assigns))
- pipe_requires.append(set(meta.requires))
- counts = []
- for i, assigns in enumerate(pipe_assigns):
- count = 0
- for requires in pipe_requires[i + 1 :]:
- if assigns.intersection(requires):
- count += 1
- counts.append(count)
- return counts
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index 4e1407707..df3d7dff5 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,15 +1,10 @@
-import spacy.language
from spacy.language import Language
-from spacy.pipe_analysis import print_summary, validate_attrs
-from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.pipe_analysis import count_pipeline_interdependencies
+from spacy.pipe_analysis import get_attr_info, validate_attrs
from mock import Mock
import pytest
def test_component_decorator_assigns():
- spacy.language.ENABLE_PIPELINE_ANALYSIS = True
-
@Language.component("c1", assigns=["token.tag", "doc.tensor"])
def test_component1(doc):
return doc
@@ -32,10 +27,11 @@ def test_component_decorator_assigns():
nlp = Language()
nlp.add_pipe("c1")
- with pytest.warns(UserWarning):
- nlp.add_pipe("c2")
+ nlp.add_pipe("c2")
+ problems = nlp.analyze_pipes()["problems"]
+ assert problems["c2"] == ["token.pos"]
nlp.add_pipe("c3")
- assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
+ assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"]
nlp.add_pipe("c1", name="c4")
test_component4_meta = nlp.get_pipe_meta("c1")
assert test_component4_meta.factory == "c1"
@@ -43,9 +39,8 @@ def test_component_decorator_assigns():
assert not Language.has_factory("c4")
assert nlp.pipe_factories["c1"] == "c1"
assert nlp.pipe_factories["c4"] == "c1"
- assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
- assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
- assert print_summary(nlp, no_print=True)
+ assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"]
+ assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"]
assert nlp("hello world")
@@ -100,7 +95,6 @@ def test_analysis_validate_attrs_invalid(attr):
def test_analysis_validate_attrs_remove_pipe():
"""Test that attributes are validated correctly on remove."""
- spacy.language.ENABLE_PIPELINE_ANALYSIS = True
@Language.component("pipe_analysis_c6", assigns=["token.tag"])
def c1(doc):
@@ -112,26 +106,9 @@ def test_analysis_validate_attrs_remove_pipe():
nlp = Language()
nlp.add_pipe("pipe_analysis_c6")
- with pytest.warns(UserWarning):
- nlp.add_pipe("pipe_analysis_c7")
- with pytest.warns(None) as record:
- nlp.remove_pipe("pipe_analysis_c7")
- assert not record.list
-
-
-def test_pipe_interdependencies():
- prefix = "test_pipe_interdependencies"
-
- @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",))
- def fancifier(doc):
- return doc
-
- @Language.component(f"{prefix}.needer", requires=("doc._.fancy",))
- def needer(doc):
- return doc
-
- nlp = Language()
- nlp.add_pipe(f"{prefix}.fancifier")
- nlp.add_pipe(f"{prefix}.needer")
- counts = count_pipeline_interdependencies(nlp)
- assert counts == [1, 0]
+ nlp.add_pipe("pipe_analysis_c7")
+ problems = nlp.analyze_pipes()["problems"]
+ assert problems["pipe_analysis_c7"] == ["token.pos"]
+ nlp.remove_pipe("pipe_analysis_c7")
+ problems = nlp.analyze_pipes()["problems"]
+ assert all(p == [] for p in problems.values())
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 0662fb12a..ba62d0b13 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -98,10 +98,10 @@ decorator. For more details and examples, see the
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@@ -146,10 +146,10 @@ examples, see the
| `name` | str | The name of the component factory. |
| _keyword-only_ | | |
| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@@ -598,6 +598,97 @@ contains the information about the component and its default provided by the
| `name` | str | The pipeline component name. |
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
+## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}
+
+Analyze the current pipeline components and show a summary of the attributes
+they assign and require, and the scores they set. The data is based on the
+information provided in the [`@Language.component`](/api/language#component) and
+[`@Language.factory`](/api/language#factory) decorators. If requirements aren't
+met, e.g. if a component specifies a required property that is not set by a
+previous component, a warning is shown.
+
+<Infobox variant="warning" title="Important note">
+
+The pipeline analysis is static and does **not actually run the components**.
+This means that it relies on the information provided by the components
+themselves. If a custom component declares that it assigns an attribute but it
+doesn't, the pipeline analysis won't catch that.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> nlp = spacy.blank("en")
+> nlp.add_pipe("tagger")
+> nlp.add_pipe("entity_linker")
+> analysis = nlp.analyze_pipes()
+> ```
+
+<Accordion title="Example output" spaced>
+
+```json
+### Structured
+{
+ "summary": {
+ "tagger": {
+ "assigns": ["token.tag"],
+ "requires": [],
+ "scores": ["tag_acc", "pos_acc", "lemma_acc"],
+ "retokenizes": false
+ },
+ "entity_linker": {
+ "assigns": ["token.ent_kb_id"],
+ "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
+ "scores": [],
+ "retokenizes": false
+ }
+ },
+ "problems": {
+ "tagger": [],
+ "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
+ },
+ "attrs": {
+ "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
+ "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
+ "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
+ "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
+ "token.tag": { "assigns": ["tagger"], "requires": [] },
+ "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
+ }
+}
+```
+
+```
+### Pretty
+============================= Pipeline Overview =============================
+
+# Component Assigns Requires Scores Retokenizes
+- ------------- --------------- -------------- --------- -----------
+0 tagger token.tag tag_acc False
+ pos_acc
+ lemma_acc
+
+1 entity_linker token.ent_kb_id doc.ents False
+ doc.sents
+ token.ent_iob
+ token.ent_type
+
+
+================================ Problems (4) ================================
+⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
+token.ent_iob, token.ent_type
+```
+
+</Accordion>
+
+| Name | Type | Description |
+| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | | |
+| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
+| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. |
+| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |
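+
+For example, to display only a subset of the
+[`FactoryMeta`](/api/language#factorymeta) fields, you can pass a custom list of
+`keys` – a minimal sketch:
+
+```python
+analysis = nlp.analyze_pipes(keys=["assigns", "retokenizes"], pretty=True)
+```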
+
## Language.meta {#meta tag="property"}
Custom meta data for the Language class. If a model is loaded, contains meta
@@ -833,8 +924,8 @@ instance and factory instance.
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 486cef1be..6388529f6 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -311,6 +311,99 @@ nlp.rename_pipe("ner", "entityrecognizer")
nlp.replace_pipe("tagger", my_custom_tagger)
```
+### Analyzing pipeline components {#analysis new="3"}
+
+The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
+components in the current pipeline and outputs information about them, like the
+attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
+they retokenize the `Doc` and which scores they produce during training. It will
+also show warnings if components require values that aren't set by a previous
+component – for instance, if the entity linker is used but no component that
+runs before it sets named entities. Setting `pretty=True` will pretty-print a
+table instead of only returning the structured data.
+
+> #### ✏️ Things to try
+>
+> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
+>    The analysis should now show no problems, because requirements are met,
+>    as in the sketch at the end of this section.
+
+```python
+### {executable="true"}
+import spacy
+
+nlp = spacy.blank("en")
+nlp.add_pipe("tagger")
+# This is a problem because it needs entities and sentence boundaries
+nlp.add_pipe("entity_linker")
+analysis = nlp.analyze_pipes(pretty=True)
+```
+
+<Accordion title="Example output" spaced>
+
+```json
+### Structured
+{
+ "summary": {
+ "tagger": {
+ "assigns": ["token.tag"],
+ "requires": [],
+ "scores": ["tag_acc", "pos_acc", "lemma_acc"],
+ "retokenizes": false
+ },
+ "entity_linker": {
+ "assigns": ["token.ent_kb_id"],
+ "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
+ "scores": [],
+ "retokenizes": false
+ }
+ },
+ "problems": {
+ "tagger": [],
+ "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
+ },
+ "attrs": {
+ "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
+ "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
+ "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
+ "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
+ "token.tag": { "assigns": ["tagger"], "requires": [] },
+ "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
+ }
+}
+```
+
+```
+### Pretty
+============================= Pipeline Overview =============================
+
+# Component Assigns Requires Scores Retokenizes
+- ------------- --------------- -------------- --------- -----------
+0 tagger token.tag tag_acc False
+ pos_acc
+ lemma_acc
+
+1 entity_linker token.ent_kb_id doc.ents False
+ doc.sents
+ token.ent_iob
+ token.ent_type
+
+
+================================ Problems (4) ================================
+⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
+token.ent_iob, token.ent_type
+```
+
+</Accordion>
+
+<Infobox variant="warning" title="Important note">
+
+The pipeline analysis is static and does **not actually run the components**.
+This means that it relies on the information provided by the components
+themselves. If a custom component declares that it assigns an attribute but it
+doesn't, the pipeline analysis won't catch that.
+
+</Infobox>
+
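+As the "Things to try" note above suggests, the problems go away once the
+required attributes are assigned by components that run earlier. A minimal
+sketch using the built-in `ner` and `sentencizer` factories:
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+nlp.add_pipe("ner")          # assigns doc.ents, token.ent_iob, token.ent_type
+nlp.add_pipe("sentencizer")  # assigns doc.sents (and token.is_sent_start)
+nlp.add_pipe("entity_linker")
+analysis = nlp.analyze_pipes(pretty=True)
+assert analysis["problems"]["entity_linker"] == []
+```
+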
## Creating custom pipeline components {#custom-components}
A pipeline component is a function that receives a `Doc` object, modifies it and