mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge pull request #5851 from explosion/feature/better-pipe-analysis
This commit is contained in:
		
						commit
						e393ebd78b
					
				| 
						 | 
				
			
			@ -63,8 +63,6 @@ class Warnings:
 | 
			
		|||
            "have the spacy-lookups-data package installed.")
 | 
			
		||||
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
 | 
			
		||||
            "the Knowledge Base.")
 | 
			
		||||
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
 | 
			
		||||
            "previous components in the pipeline declare that they assign it.")
 | 
			
		||||
    W026 = ("Unable to set all sentence boundaries from dependency parses.")
 | 
			
		||||
    W027 = ("Found a large training file of {size} bytes. Note that it may "
 | 
			
		||||
            "be more efficient to split your training data into multiple "
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -18,7 +18,7 @@ from timeit import default_timer as timer
 | 
			
		|||
 | 
			
		||||
from .tokens.underscore import Underscore
 | 
			
		||||
from .vocab import Vocab, create_vocab
 | 
			
		||||
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
 | 
			
		||||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 | 
			
		||||
from .gold import Example
 | 
			
		||||
from .scorer import Scorer
 | 
			
		||||
from .util import create_default_optimizer, registry
 | 
			
		||||
| 
						 | 
				
			
			@ -37,8 +37,6 @@ from . import util
 | 
			
		|||
from . import about
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# TODO: integrate pipeline analysis
 | 
			
		||||
ENABLE_PIPELINE_ANALYSIS = False
 | 
			
		||||
# This is the base config with all settings (training etc.)
 | 
			
		||||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 | 
			
		||||
DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
 | 
			
		||||
| 
						 | 
				
			
			@ -522,6 +520,25 @@ class Language:
 | 
			
		|||
            return add_component(func)
 | 
			
		||||
        return add_component
 | 
			
		||||
 | 
			
		||||
    def analyze_pipes(
 | 
			
		||||
        self,
 | 
			
		||||
        *,
 | 
			
		||||
        keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
 | 
			
		||||
        pretty: bool = False,
 | 
			
		||||
    ) -> Optional[Dict[str, Any]]:
 | 
			
		||||
        """Analyze the current pipeline components, print a summary of what
 | 
			
		||||
        they assign or require and check that all requirements are met.
 | 
			
		||||
 | 
			
		||||
        keys (List[str]): The meta values to display in the table. Corresponds
 | 
			
		||||
            to values in FactoryMeta, defined by @Language.factory decorator.
 | 
			
		||||
        pretty (bool): Pretty-print the results.
 | 
			
		||||
        RETURNS (dict): The data.
 | 
			
		||||
        """
 | 
			
		||||
        analysis = analyze_pipes(self, keys=keys)
 | 
			
		||||
        if pretty:
 | 
			
		||||
            print_pipe_analysis(analysis, keys=keys)
 | 
			
		||||
        return analysis
 | 
			
		||||
 | 
			
		||||
    def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
 | 
			
		||||
        """Get a pipeline component for a given component name.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -666,8 +683,6 @@ class Language:
 | 
			
		|||
        pipe_index = self._get_pipe_index(before, after, first, last)
 | 
			
		||||
        self._pipe_meta[name] = self.get_factory_meta(factory_name)
 | 
			
		||||
        self.pipeline.insert(pipe_index, (name, pipe_component))
 | 
			
		||||
        if ENABLE_PIPELINE_ANALYSIS:
 | 
			
		||||
            analyze_pipes(self, name, pipe_index)
 | 
			
		||||
        return pipe_component
 | 
			
		||||
 | 
			
		||||
    def _get_pipe_index(
 | 
			
		||||
| 
						 | 
				
			
			@ -758,8 +773,6 @@ class Language:
 | 
			
		|||
            self.add_pipe(factory_name, name=name)
 | 
			
		||||
        else:
 | 
			
		||||
            self.add_pipe(factory_name, name=name, before=pipe_index)
 | 
			
		||||
        if ENABLE_PIPELINE_ANALYSIS:
 | 
			
		||||
            analyze_all_pipes(self)
 | 
			
		||||
 | 
			
		||||
    def rename_pipe(self, old_name: str, new_name: str) -> None:
 | 
			
		||||
        """Rename a pipeline component.
 | 
			
		||||
| 
						 | 
				
			
			@ -793,8 +806,6 @@ class Language:
 | 
			
		|||
        # because factory may be used for something else
 | 
			
		||||
        self._pipe_meta.pop(name)
 | 
			
		||||
        self._pipe_configs.pop(name)
 | 
			
		||||
        if ENABLE_PIPELINE_ANALYSIS:
 | 
			
		||||
            analyze_all_pipes(self)
 | 
			
		||||
        return removed
 | 
			
		||||
 | 
			
		||||
    def __call__(
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,9 +1,8 @@
 | 
			
		|||
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
 | 
			
		||||
from wasabi import Printer
 | 
			
		||||
import warnings
 | 
			
		||||
from wasabi import msg
 | 
			
		||||
 | 
			
		||||
from .tokens import Doc, Token, Span
 | 
			
		||||
from .errors import Errors, Warnings
 | 
			
		||||
from .errors import Errors
 | 
			
		||||
from .util import dot_to_dict
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
| 
						 | 
				
			
			@ -11,48 +10,7 @@ if TYPE_CHECKING:
 | 
			
		|||
    from .language import Language  # noqa: F401
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def analyze_pipes(
 | 
			
		||||
    nlp: "Language", name: str, index: int, warn: bool = True
 | 
			
		||||
) -> List[str]:
 | 
			
		||||
    """Analyze a pipeline component with respect to its position in the current
 | 
			
		||||
    pipeline and the other components. Will check whether requirements are
 | 
			
		||||
    fulfilled (e.g. if previous components assign the attributes).
 | 
			
		||||
 | 
			
		||||
    nlp (Language): The current nlp object.
 | 
			
		||||
    name (str): The name of the pipeline component to analyze.
 | 
			
		||||
    index (int): The index of the component in the pipeline.
 | 
			
		||||
    warn (bool): Show user warning if problem is found.
 | 
			
		||||
    RETURNS (List[str]): The problems found for the given pipeline component.
 | 
			
		||||
    """
 | 
			
		||||
    assert nlp.pipeline[index][0] == name
 | 
			
		||||
    prev_pipes = nlp.pipeline[:index]
 | 
			
		||||
    meta = nlp.get_pipe_meta(name)
 | 
			
		||||
    requires = {annot: False for annot in meta.requires}
 | 
			
		||||
    if requires:
 | 
			
		||||
        for prev_name, prev_pipe in prev_pipes:
 | 
			
		||||
            prev_meta = nlp.get_pipe_meta(prev_name)
 | 
			
		||||
            for annot in prev_meta.assigns:
 | 
			
		||||
                requires[annot] = True
 | 
			
		||||
    problems = []
 | 
			
		||||
    for annot, fulfilled in requires.items():
 | 
			
		||||
        if not fulfilled:
 | 
			
		||||
            problems.append(annot)
 | 
			
		||||
            if warn:
 | 
			
		||||
                warnings.warn(Warnings.W025.format(name=name, attr=annot))
 | 
			
		||||
    return problems
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
 | 
			
		||||
    """Analyze all pipes in the pipeline in order.
 | 
			
		||||
 | 
			
		||||
    nlp (Language): The current nlp object.
 | 
			
		||||
    warn (bool): Show user warning if problem is found.
 | 
			
		||||
    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
 | 
			
		||||
    """
 | 
			
		||||
    problems = {}
 | 
			
		||||
    for i, name in enumerate(nlp.pipe_names):
 | 
			
		||||
        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
 | 
			
		||||
    return problems
 | 
			
		||||
DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
 | 
			
		||||
| 
						 | 
				
			
			@ -101,89 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]:
 | 
			
		|||
    return values
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
 | 
			
		||||
    assert feature in ["assigns", "requires"]
 | 
			
		||||
    result = []
 | 
			
		||||
def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
 | 
			
		||||
    """Check which components in the pipeline assign or require an attribute.
 | 
			
		||||
 | 
			
		||||
    nlp (Language): The current nlp object.
 | 
			
		||||
    attr (str): The attribute, e.g. "doc.tensor".
 | 
			
		||||
    RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
 | 
			
		||||
        mapped to a list of component names.
 | 
			
		||||
    """
 | 
			
		||||
    result = {"assigns": [], "requires": []}
 | 
			
		||||
    for pipe_name in nlp.pipe_names:
 | 
			
		||||
        meta = nlp.get_pipe_meta(pipe_name)
 | 
			
		||||
        pipe_assigns = getattr(meta, feature, [])
 | 
			
		||||
        if attr in pipe_assigns:
 | 
			
		||||
            result.append(pipe_name)
 | 
			
		||||
        if attr in meta.assigns:
 | 
			
		||||
            result["assigns"].append(pipe_name)
 | 
			
		||||
        if attr in meta.requires:
 | 
			
		||||
            result["requires"].append(pipe_name)
 | 
			
		||||
    return result
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
 | 
			
		||||
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".
 | 
			
		||||
 | 
			
		||||
    pipeline (Language): The current nlp object.
 | 
			
		||||
    attr (str): The attribute to check.
 | 
			
		||||
    RETURNS (List[str]): Names of components that require the attr.
 | 
			
		||||
    """
 | 
			
		||||
    return _get_feature_for_attr(nlp, attr, "assigns")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
 | 
			
		||||
    """Get all pipeline components that require an attr, e.g. "doc.tensor".
 | 
			
		||||
 | 
			
		||||
    pipeline (Language): The current nlp object.
 | 
			
		||||
    attr (str): The attribute to check.
 | 
			
		||||
    RETURNS (List[str]): Names of components that require the attr.
 | 
			
		||||
    """
 | 
			
		||||
    return _get_feature_for_attr(nlp, attr, "requires")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def print_summary(
 | 
			
		||||
    nlp: "Language", pretty: bool = True, no_print: bool = False
 | 
			
		||||
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
 | 
			
		||||
def analyze_pipes(
 | 
			
		||||
    nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
 | 
			
		||||
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
 | 
			
		||||
    """Print a formatted summary for the current nlp object's pipeline. Shows
 | 
			
		||||
    a table with the pipeline components and why they assign and require, as
 | 
			
		||||
    well as any problems if available.
 | 
			
		||||
 | 
			
		||||
    nlp (Language): The nlp object.
 | 
			
		||||
    pretty (bool): Pretty-print the results (color etc).
 | 
			
		||||
    no_print (bool): Don't print anything, just return the data.
 | 
			
		||||
    RETURNS (dict): A dict with "overview" and "problems".
 | 
			
		||||
    keys (List[str]): The meta keys to show in the table.
 | 
			
		||||
    RETURNS (dict): A dict with "summary" and "problems".
 | 
			
		||||
    """
 | 
			
		||||
    msg = Printer(pretty=pretty, no_print=no_print)
 | 
			
		||||
    overview = []
 | 
			
		||||
    problems = {}
 | 
			
		||||
    result = {"summary": {}, "problems": {}}
 | 
			
		||||
    all_attrs = set()
 | 
			
		||||
    for i, name in enumerate(nlp.pipe_names):
 | 
			
		||||
        meta = nlp.get_pipe_meta(name)
 | 
			
		||||
        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
 | 
			
		||||
        problems[name] = analyze_pipes(nlp, name, i, warn=False)
 | 
			
		||||
        all_attrs.update(meta.assigns)
 | 
			
		||||
        all_attrs.update(meta.requires)
 | 
			
		||||
        result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
 | 
			
		||||
        prev_pipes = nlp.pipeline[:i]
 | 
			
		||||
        requires = {annot: False for annot in meta.requires}
 | 
			
		||||
        if requires:
 | 
			
		||||
            for prev_name, prev_pipe in prev_pipes:
 | 
			
		||||
                prev_meta = nlp.get_pipe_meta(prev_name)
 | 
			
		||||
                for annot in prev_meta.assigns:
 | 
			
		||||
                    requires[annot] = True
 | 
			
		||||
        result["problems"][name] = []
 | 
			
		||||
        for annot, fulfilled in requires.items():
 | 
			
		||||
            if not fulfilled:
 | 
			
		||||
                result["problems"][name].append(annot)
 | 
			
		||||
    result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
 | 
			
		||||
    return result
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def print_pipe_analysis(
 | 
			
		||||
    analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
 | 
			
		||||
    *,
 | 
			
		||||
    keys: List[str] = DEFAULT_KEYS,
 | 
			
		||||
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
 | 
			
		||||
    """Print a formatted version of the pipe analysis produced by analyze_pipes.
 | 
			
		||||
 | 
			
		||||
    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
 | 
			
		||||
    keys (List[str]): The meta keys to show in the table.
 | 
			
		||||
    """
 | 
			
		||||
    msg.divider("Pipeline Overview")
 | 
			
		||||
    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
 | 
			
		||||
    msg.table(overview, header=header, divider=True, multiline=True)
 | 
			
		||||
    n_problems = sum(len(p) for p in problems.values())
 | 
			
		||||
    if any(p for p in problems.values()):
 | 
			
		||||
    header = ["#", "Component", *[key.capitalize() for key in keys]]
 | 
			
		||||
    summary = analysis["summary"].items()
 | 
			
		||||
    body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
 | 
			
		||||
    msg.table(body, header=header, divider=True, multiline=True)
 | 
			
		||||
    n_problems = sum(len(p) for p in analysis["problems"].values())
 | 
			
		||||
    if any(p for p in analysis["problems"].values()):
 | 
			
		||||
        msg.divider(f"Problems ({n_problems})")
 | 
			
		||||
        for name, problem in problems.items():
 | 
			
		||||
        for name, problem in analysis["problems"].items():
 | 
			
		||||
            if problem:
 | 
			
		||||
                msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
 | 
			
		||||
    else:
 | 
			
		||||
        msg.good("No problems found.")
 | 
			
		||||
    if no_print:
 | 
			
		||||
        return {"overview": overview, "problems": problems}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
 | 
			
		||||
    """Count how many subsequent components require an annotation set by each
 | 
			
		||||
    component in the pipeline.
 | 
			
		||||
 | 
			
		||||
    nlp (Language): The current nlp object.
 | 
			
		||||
    RETURNS (List[int]): The interdependency counts.
 | 
			
		||||
    """
 | 
			
		||||
    pipe_assigns = []
 | 
			
		||||
    pipe_requires = []
 | 
			
		||||
    for name in nlp.pipe_names:
 | 
			
		||||
        meta = nlp.get_pipe_meta(name)
 | 
			
		||||
        pipe_assigns.append(set(meta.assigns))
 | 
			
		||||
        pipe_requires.append(set(meta.requires))
 | 
			
		||||
    counts = []
 | 
			
		||||
    for i, assigns in enumerate(pipe_assigns):
 | 
			
		||||
        count = 0
 | 
			
		||||
        for requires in pipe_requires[i + 1 :]:
 | 
			
		||||
            if assigns.intersection(requires):
 | 
			
		||||
                count += 1
 | 
			
		||||
        counts.append(count)
 | 
			
		||||
    return counts
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,15 +1,10 @@
 | 
			
		|||
import spacy.language
 | 
			
		||||
from spacy.language import Language
 | 
			
		||||
from spacy.pipe_analysis import print_summary, validate_attrs
 | 
			
		||||
from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
 | 
			
		||||
from spacy.pipe_analysis import count_pipeline_interdependencies
 | 
			
		||||
from spacy.pipe_analysis import get_attr_info, validate_attrs
 | 
			
		||||
from mock import Mock
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_component_decorator_assigns():
 | 
			
		||||
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
 | 
			
		||||
 | 
			
		||||
    @Language.component("c1", assigns=["token.tag", "doc.tensor"])
 | 
			
		||||
    def test_component1(doc):
 | 
			
		||||
        return doc
 | 
			
		||||
| 
						 | 
				
			
			@ -32,10 +27,11 @@ def test_component_decorator_assigns():
 | 
			
		|||
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    nlp.add_pipe("c1")
 | 
			
		||||
    with pytest.warns(UserWarning):
 | 
			
		||||
        nlp.add_pipe("c2")
 | 
			
		||||
    nlp.add_pipe("c2")
 | 
			
		||||
    problems = nlp.analyze_pipes()["problems"]
 | 
			
		||||
    assert problems["c2"] == ["token.pos"]
 | 
			
		||||
    nlp.add_pipe("c3")
 | 
			
		||||
    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
 | 
			
		||||
    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"]
 | 
			
		||||
    nlp.add_pipe("c1", name="c4")
 | 
			
		||||
    test_component4_meta = nlp.get_pipe_meta("c1")
 | 
			
		||||
    assert test_component4_meta.factory == "c1"
 | 
			
		||||
| 
						 | 
				
			
			@ -43,9 +39,8 @@ def test_component_decorator_assigns():
 | 
			
		|||
    assert not Language.has_factory("c4")
 | 
			
		||||
    assert nlp.pipe_factories["c1"] == "c1"
 | 
			
		||||
    assert nlp.pipe_factories["c4"] == "c1"
 | 
			
		||||
    assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
 | 
			
		||||
    assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
 | 
			
		||||
    assert print_summary(nlp, no_print=True)
 | 
			
		||||
    assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"]
 | 
			
		||||
    assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"]
 | 
			
		||||
    assert nlp("hello world")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -100,7 +95,6 @@ def test_analysis_validate_attrs_invalid(attr):
 | 
			
		|||
 | 
			
		||||
def test_analysis_validate_attrs_remove_pipe():
 | 
			
		||||
    """Test that attributes are validated correctly on remove."""
 | 
			
		||||
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
 | 
			
		||||
 | 
			
		||||
    @Language.component("pipe_analysis_c6", assigns=["token.tag"])
 | 
			
		||||
    def c1(doc):
 | 
			
		||||
| 
						 | 
				
			
			@ -112,26 +106,9 @@ def test_analysis_validate_attrs_remove_pipe():
 | 
			
		|||
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    nlp.add_pipe("pipe_analysis_c6")
 | 
			
		||||
    with pytest.warns(UserWarning):
 | 
			
		||||
        nlp.add_pipe("pipe_analysis_c7")
 | 
			
		||||
    with pytest.warns(None) as record:
 | 
			
		||||
        nlp.remove_pipe("pipe_analysis_c7")
 | 
			
		||||
    assert not record.list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_pipe_interdependencies():
 | 
			
		||||
    prefix = "test_pipe_interdependencies"
 | 
			
		||||
 | 
			
		||||
    @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",))
 | 
			
		||||
    def fancifier(doc):
 | 
			
		||||
        return doc
 | 
			
		||||
 | 
			
		||||
    @Language.component(f"{prefix}.needer", requires=("doc._.fancy",))
 | 
			
		||||
    def needer(doc):
 | 
			
		||||
        return doc
 | 
			
		||||
 | 
			
		||||
    nlp = Language()
 | 
			
		||||
    nlp.add_pipe(f"{prefix}.fancifier")
 | 
			
		||||
    nlp.add_pipe(f"{prefix}.needer")
 | 
			
		||||
    counts = count_pipeline_interdependencies(nlp)
 | 
			
		||||
    assert counts == [1, 0]
 | 
			
		||||
    nlp.add_pipe("pipe_analysis_c7")
 | 
			
		||||
    problems = nlp.analyze_pipes()["problems"]
 | 
			
		||||
    assert problems["pipe_analysis_c7"] == ["token.pos"]
 | 
			
		||||
    nlp.remove_pipe("pipe_analysis_c7")
 | 
			
		||||
    problems = nlp.analyze_pipes()["problems"]
 | 
			
		||||
    assert all(p == [] for p in problems.values())
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -98,10 +98,10 @@ decorator. For more details and examples, see the
 | 
			
		|||
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
 | 
			
		||||
| _keyword-only_          |                      |                                                                                                                                                                                                                             |
 | 
			
		||||
| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis.                                                                                                                                                     |
 | 
			
		||||
| `scores`                | `Iterable[str]`      | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                  |
 | 
			
		||||
| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                                                                 |
 | 
			
		||||
| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                   |
 | 
			
		||||
| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | 
			
		||||
| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -146,10 +146,10 @@ examples, see the
 | 
			
		|||
| `name`                  | str                  | The name of the component factory.                                                                                                                                                                                          |
 | 
			
		||||
| _keyword-only_          |                      |                                                                                                                                                                                                                             |
 | 
			
		||||
| `default_config`        | `Dict[str, any]`     | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
 | 
			
		||||
| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for pipeline analysis.                                                                                                                                                     |
 | 
			
		||||
| `scores`                | `Iterable[str]`      | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                  |
 | 
			
		||||
| `assigns`               | `Iterable[str]`      | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `requires`              | `Iterable[str]`      | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `retokenizes`           | bool                 | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                                                                 |
 | 
			
		||||
| `scores`                | `Iterable[str]`      | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                   |
 | 
			
		||||
| `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | 
			
		||||
| `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                               |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -598,6 +598,97 @@ contains the information about the component and its default provided by the
 | 
			
		|||
| `name`      | str                           | The pipeline component name. |
 | 
			
		||||
| **RETURNS** | [`FactoryMeta`](#factorymeta) |  The factory meta.           |
 | 
			
		||||
 | 
			
		||||
## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}
 | 
			
		||||
 | 
			
		||||
Analyze the current pipeline components and show a summary of the attributes
 | 
			
		||||
they assign and require, and the scores they set. The data is based on the
 | 
			
		||||
information provided in the [`@Language.component`](/api/language#component) and
 | 
			
		||||
[`@Language.factory`](/api/language#factory) decorator. If requirements aren't
 | 
			
		||||
met, e.g. if a component specifies a required property that is not set by a
 | 
			
		||||
previous component, a warning is shown.
 | 
			
		||||
 | 
			
		||||
<Infobox variant="warning" title="Important note">
 | 
			
		||||
 | 
			
		||||
The pipeline analysis is static and does **not actually run the components**.
 | 
			
		||||
This means that it relies on the information provided by the components
 | 
			
		||||
themselves. If a custom component declares that it assigns an attribute but it
 | 
			
		||||
doesn't, the pipeline analysis won't catch that.
 | 
			
		||||
 | 
			
		||||
</Infobox>
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
> ```python
 | 
			
		||||
> nlp = spacy.blank("en")
 | 
			
		||||
> nlp.add_pipe("tagger")
 | 
			
		||||
> nlp.add_pipe("entity_linker")
 | 
			
		||||
> analysis = nlp.analyze_pipes()
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
<Accordion title="Example output" spaced>
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
### Structured
 | 
			
		||||
{
 | 
			
		||||
  "summary": {
 | 
			
		||||
    "tagger": {
 | 
			
		||||
      "assigns": ["token.tag"],
 | 
			
		||||
      "requires": [],
 | 
			
		||||
      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
 | 
			
		||||
      "retokenizes": false
 | 
			
		||||
    },
 | 
			
		||||
    "entity_linker": {
 | 
			
		||||
      "assigns": ["token.ent_kb_id"],
 | 
			
		||||
      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
 | 
			
		||||
      "scores": [],
 | 
			
		||||
      "retokenizes": false
 | 
			
		||||
    }
 | 
			
		||||
  },
 | 
			
		||||
  "problems": {
 | 
			
		||||
    "tagger": [],
 | 
			
		||||
    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
 | 
			
		||||
  },
 | 
			
		||||
  "attrs": {
 | 
			
		||||
    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
 | 
			
		||||
    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "token.tag": { "assigns": ["tagger"], "requires": [] },
 | 
			
		||||
    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
### Pretty
 | 
			
		||||
============================= Pipeline Overview =============================
 | 
			
		||||
 | 
			
		||||
#   Component       Assigns           Requires         Scores      Retokenizes
 | 
			
		||||
-   -------------   ---------------   --------------   ---------   -----------
 | 
			
		||||
0   tagger          token.tag                          tag_acc     False
 | 
			
		||||
                                                       pos_acc
 | 
			
		||||
                                                       lemma_acc
 | 
			
		||||
 | 
			
		||||
1   entity_linker   token.ent_kb_id   doc.ents                     False
 | 
			
		||||
                                      doc.sents
 | 
			
		||||
                                      token.ent_iob
 | 
			
		||||
                                      token.ent_type
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
================================ Problems (4) ================================
 | 
			
		||||
⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
 | 
			
		||||
token.ent_iob, token.ent_type
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
</Accordion>
 | 
			
		||||
 | 
			
		||||
| Name           | Type        | Description                                                                                                                                                                                                    |
 | 
			
		||||
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| _keyword-only_ |             |                                                                                                                                                                                                                |
 | 
			
		||||
| `keys`         | `List[str]` | The values to display in the table. Correspond to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`.                                |
 | 
			
		||||
| `pretty`       | bool        | Pretty-print the results as a table. Defaults to `False`.                                                                                                                                                      |
 | 
			
		||||
| **RETURNS**    | dict        | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |
 | 
			
		||||
 | 
			
		||||
## Language.meta {#meta tag="property"}
 | 
			
		||||
 | 
			
		||||
Custom meta data for the Language class. If a model is loaded, contains meta
 | 
			
		||||
| 
						 | 
				
			
			@ -833,8 +924,8 @@ instance and factory instance.
 | 
			
		|||
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `factory`               | str                | The name of the registered component factory.                                                                                                                                                                               |
 | 
			
		||||
| `default_config`        | `Dict[str, Any]`   | The default config, describing the default values of the factory arguments.                                                                                                                                                 |
 | 
			
		||||
| `assigns`               | `Iterable[str]`    | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `requires`              | `Iterable[str]`    | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.                                                                                                                |
 | 
			
		||||
| `retokenizes`           | bool               | Whether the component changes tokenization. Used for pipeline analysis.                                                                                                                                                     |
 | 
			
		||||
| `scores`                | `Iterable[str]`    | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`.                                                                                                                                  |
 | 
			
		||||
| `assigns`               | `Iterable[str]`    | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `requires`              | `Iterable[str]`    | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                            |
 | 
			
		||||
| `retokenizes`           | bool               | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                                                                 |
 | 
			
		||||
| `scores`                | `Iterable[str]`    | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).                                                                   |
 | 
			
		||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -311,6 +311,99 @@ nlp.rename_pipe("ner", "entityrecognizer")
 | 
			
		|||
nlp.replace_pipe("tagger", my_custom_tagger)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Analyzing pipeline components {#analysis new="3"}
 | 
			
		||||
 | 
			
		||||
The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
 | 
			
		||||
components in the current pipeline and outputs information about them, like the
 | 
			
		||||
attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
 | 
			
		||||
they retokenize the `Doc` and which scores they produce during training. It will
 | 
			
		||||
also show warnings if components require values that aren't set by previous
 | 
			
		||||
components – for instance, if the entity linker is used but no component that
 | 
			
		||||
runs before it sets named entities. Setting `pretty=True` will pretty-print a
 | 
			
		||||
table instead of only returning the structured data.
 | 
			
		||||
 | 
			
		||||
> #### ✏️ Things to try
 | 
			
		||||
>
 | 
			
		||||
> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
 | 
			
		||||
>    The analysis should now show no problems, because requirements are met.
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
### {executable="true"}
 | 
			
		||||
import spacy
 | 
			
		||||
 | 
			
		||||
nlp = spacy.blank("en")
 | 
			
		||||
nlp.add_pipe("tagger")
 | 
			
		||||
# This is a problem because it needs entities and sentence boundaries
 | 
			
		||||
nlp.add_pipe("entity_linker")
 | 
			
		||||
analysis = nlp.analyze_pipes(pretty=True)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
<Accordion title="Example output">
 | 
			
		||||
 | 
			
		||||
```json
 | 
			
		||||
### Structured
 | 
			
		||||
{
 | 
			
		||||
  "summary": {
 | 
			
		||||
    "tagger": {
 | 
			
		||||
      "assigns": ["token.tag"],
 | 
			
		||||
      "requires": [],
 | 
			
		||||
      "scores": ["tag_acc", "pos_acc", "lemma_acc"],
 | 
			
		||||
      "retokenizes": false
 | 
			
		||||
    },
 | 
			
		||||
    "entity_linker": {
 | 
			
		||||
      "assigns": ["token.ent_kb_id"],
 | 
			
		||||
      "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
 | 
			
		||||
      "scores": [],
 | 
			
		||||
      "retokenizes": false
 | 
			
		||||
    }
 | 
			
		||||
  },
 | 
			
		||||
  "problems": {
 | 
			
		||||
    "tagger": [],
 | 
			
		||||
    "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
 | 
			
		||||
  },
 | 
			
		||||
  "attrs": {
 | 
			
		||||
    "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "doc.ents": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
 | 
			
		||||
    "doc.sents": { "assigns": [], "requires": ["entity_linker"] },
 | 
			
		||||
    "token.tag": { "assigns": ["tagger"], "requires": [] },
 | 
			
		||||
    "token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
### Pretty
 | 
			
		||||
============================= Pipeline Overview =============================
 | 
			
		||||
 | 
			
		||||
#   Component       Assigns           Requires         Scores      Retokenizes
 | 
			
		||||
-   -------------   ---------------   --------------   ---------   -----------
 | 
			
		||||
0   tagger          token.tag                          tag_acc     False
 | 
			
		||||
                                                       pos_acc
 | 
			
		||||
                                                       lemma_acc
 | 
			
		||||
 | 
			
		||||
1   entity_linker   token.ent_kb_id   doc.ents                     False
 | 
			
		||||
                                      doc.sents
 | 
			
		||||
                                      token.ent_iob
 | 
			
		||||
                                      token.ent_type
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
================================ Problems (4) ================================
 | 
			
		||||
⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
 | 
			
		||||
token.ent_iob, token.ent_type
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
</Accordion>
 | 
			
		||||
 | 
			
		||||
<Infobox variant="warning" title="Important note">
 | 
			
		||||
 | 
			
		||||
The pipeline analysis is static and does **not actually run the components**.
 | 
			
		||||
This means that it relies on the information provided by the components
 | 
			
		||||
themselves. If a custom component declares that it assigns an attribute but it
 | 
			
		||||
doesn't, the pipeline analysis won't catch that.
 | 
			
		||||
 | 
			
		||||
</Infobox>
 | 
			
		||||
 | 
			
		||||
## Creating custom pipeline components {#custom-components}
 | 
			
		||||
 | 
			
		||||
A pipeline component is a function that receives a `Doc` object, modifies it and
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user