From 30a76fcf6f662a3ef2d63648beba9f7a82e02150 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 31 Jul 2020 18:34:35 +0200
Subject: [PATCH 1/4] Integrate and simplify pipe analysis

---
 spacy/language.py                     | 28 ++++++++++++++++--------
 spacy/pipe_analysis.py                | 31 +++++++++++----------------
 spacy/tests/pipeline/test_analysis.py | 22 +++++++++----------
 3 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 594a4b148..6230913b4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,7 +18,7 @@ from timeit import default_timer as timer
 
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
-from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import validate_attrs, print_summary
 from .gold import Example
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
@@ -37,8 +37,6 @@ from . import util
 from . import about
 
 
-# TODO: integrate pipeline analyis
-ENABLE_PIPELINE_ANALYSIS = False
 # This is the base config with all settings (training etc.)
 DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
 DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH)
@@ -522,6 +520,24 @@ class Language:
             return add_component(func)
         return add_component
 
+    def analyze_pipes(
+        self,
+        *,
+        keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
+        pretty: bool = True,
+        no_print: bool = False,
+    ) -> Optional[Dict[str, Any]]:
+        """Analyze the current pipeline components, print a summary of what
+        they assign or require and check that all requirements are met.
+
+        keys (List[str]): The meta values to display in the table. Corresponds
+            to values in FactoryMeta, defined by the @Language.factory decorator.
+        pretty (bool): Pretty-print the results with colors and icons.
+        no_print (bool): Don't print anything and return a structured dict instead.
+        RETURNS (dict): The data, if no_print is set to True.
+        """
+        return print_summary(self, keys=keys, pretty=pretty, no_print=no_print)
+
     def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
         """Get a pipeline component for a given component name.
 
@@ -666,8 +682,6 @@
         pipe_index = self._get_pipe_index(before, after, first, last)
         self._pipe_meta[name] = self.get_factory_meta(factory_name)
         self.pipeline.insert(pipe_index, (name, pipe_component))
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_pipes(self, name, pipe_index)
         return pipe_component
 
     def _get_pipe_index(
@@ -758,8 +772,6 @@
             self.add_pipe(factory_name, name=name)
         else:
             self.add_pipe(factory_name, name=name, before=pipe_index)
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_all_pipes(self)
 
     def rename_pipe(self, old_name: str, new_name: str) -> None:
         """Rename a pipeline component.
@@ -793,8 +805,6 @@
         # because factory may be used for something else
         self._pipe_meta.pop(name)
         self._pipe_configs.pop(name)
-        if ENABLE_PIPELINE_ANALYSIS:
-            analyze_all_pipes(self)
         return removed
 
     def __call__(
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py
index b57f1524b..71f99daef 100644
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@@ -42,19 +42,6 @@ def analyze_pipes(
     return problems
 
 
-def analyze_all_pipes(nlp: "Language", warn: bool = True) -> Dict[str, List[str]]:
-    """Analyze all pipes in the pipeline in order.
-
-    nlp (Language): The current nlp object.
-    warn (bool): Show user warning if problem is found.
-    RETURNS (Dict[str, List[str]]): The problems found, keyed by component name.
-    """
-    problems = {}
-    for i, name in enumerate(nlp.pipe_names):
-        problems[name] = analyze_pipes(nlp, name, i, warn=warn)
-    return problems
-
-
 def validate_attrs(values: Iterable[str]) -> Iterable[str]:
     """Validate component attributes provided to "assigns", "requires" etc.
     Raises error for invalid attributes and formatting. Doesn't check if
@@ -133,27 +120,35 @@ def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
 
 
 def print_summary(
-    nlp: "Language", pretty: bool = True, no_print: bool = False
+    nlp: "Language",
+    *,
+    keys: List[str] = ["requires", "assigns", "scores", "retokenizes"],
+    pretty: bool = True,
+    no_print: bool = False,
 ) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
     """Print a formatted summary for the current nlp object's pipeline. Shows
     a table with the pipeline components and what they assign and require, as
     well as any problems if available.
 
     nlp (Language): The nlp object.
+    keys (List[str]): The meta keys to show in the table.
     pretty (bool): Pretty-print the results (color etc).
     no_print (bool): Don't print anything, just return the data.
     RETURNS (dict): A dict with "overview" and "problems".
     """
     msg = Printer(pretty=pretty, no_print=no_print)
-    overview = []
+    overview = {}
     problems = {}
     for i, name in enumerate(nlp.pipe_names):
         meta = nlp.get_pipe_meta(name)
-        overview.append((i, name, meta.requires, meta.assigns, meta.retokenizes))
+        overview[name] = {"i": i, "name": name}
+        for key in keys:
+            overview[name][key] = getattr(meta, key, None)
         problems[name] = analyze_pipes(nlp, name, i, warn=False)
     msg.divider("Pipeline Overview")
-    header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
-    msg.table(overview, header=header, divider=True, multiline=True)
+    header = ["#", "Component", *[key.capitalize() for key in keys]]
+    body = [[info for info in entry.values()] for entry in overview.values()]
+    msg.table(body, header=header, divider=True, multiline=True)
     n_problems = sum(len(p) for p in problems.values())
     if any(p for p in problems.values()):
         msg.divider(f"Problems ({n_problems})")
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index 4e1407707..7d22bb1a0 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,15 +1,12 @@
 import spacy.language
 from spacy.language import Language
-from spacy.pipe_analysis import print_summary, validate_attrs
 from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
-from spacy.pipe_analysis import count_pipeline_interdependencies
+from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies
 from mock import Mock
 import pytest
 
 
 def test_component_decorator_assigns():
-    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
-
     @Language.component("c1", assigns=["token.tag", "doc.tensor"])
     def test_component1(doc):
         return doc
@@ -32,8 +29,9 @@ def test_component_decorator_assigns():
 
     nlp = Language()
     nlp.add_pipe("c1")
-    with pytest.warns(UserWarning):
-        nlp.add_pipe("c2")
+    nlp.add_pipe("c2")
+    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    assert problems["c2"] == ["token.pos"]
     nlp.add_pipe("c3")
     assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
     nlp.add_pipe("c1", name="c4")
@@ -45,7 +43,6 @@ def test_component_decorator_assigns():
     assert nlp.pipe_factories["c4"] == "c1"
     assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
     assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
-    assert print_summary(nlp, no_print=True)
     assert nlp("hello world")
 
 
@@ -112,11 +109,12 @@ def test_analysis_validate_attrs_remove_pipe():
 
     nlp = Language()
     nlp.add_pipe("pipe_analysis_c6")
-    with pytest.warns(UserWarning):
-        nlp.add_pipe("pipe_analysis_c7")
-    with pytest.warns(None) as record:
-        nlp.remove_pipe("pipe_analysis_c7")
-    assert not record.list
+    nlp.add_pipe("pipe_analysis_c7")
+    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    assert problems["pipe_analysis_c7"] == ["token.pos"]
+    nlp.remove_pipe("pipe_analysis_c7")
+    problems = nlp.analyze_pipes(no_print=True)["problems"]
+    assert all(p == [] for p in problems.values())
 
 
 def test_pipe_interdependencies():

From b68c53858c0399541dc2f5611d2a7ccd8b291a84 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 31 Jul 2020 18:37:58 +0200
Subject: [PATCH 2/4] Remove global

---
 spacy/tests/pipeline/test_analysis.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index 7d22bb1a0..80987c838 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,4 +1,3 @@
-import spacy.language
 from spacy.language import Language
 from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
 from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies
@@ -97,7 +96,6 @@ def test_analysis_validate_attrs_invalid(attr):
 
 def test_analysis_validate_attrs_remove_pipe():
     """Test that attributes are validated correctly on remove."""
-    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
 
     @Language.component("pipe_analysis_c6", assigns=["token.tag"])
     def c1(doc):

From 98c6a85c8bec04dd7be7c0dbc3c18e0efd9f822e Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 31 Jul 2020 18:55:38 +0200
Subject: [PATCH 3/4] Update docs [ci skip]

---
 website/docs/api/language.md               | 59 ++++++++++++++++++++++
 website/docs/usage/processing-pipelines.md | 55 ++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 0662fb12a..608442122 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -598,6 +598,65 @@ contains the information about the component and its default provided by the
 | `name`      | str                           | The pipeline component name. |
 | **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
 
+## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}
+
+Analyze the current pipeline components and show a summary of the attributes
+they assign and require, and the scores they set. The data is based on the
+information provided in the [`@Language.component`](/api/language#component) and
+[`@Language.factory`](/api/language#factory) decorators. If requirements aren't
+met, e.g. if a component specifies a required property that is not set by a
+previous component, a warning is shown.
+
+<Infobox variant="warning" title="Important note">
+
+The pipeline analysis is static and does **not actually run the components**.
+This means that it relies on the information provided by the components
+themselves. If a custom component declares that it assigns an attribute but it
+doesn't, the pipeline analysis won't catch that.
+
+</Infobox>
+
+> #### Example
+>
+> ```python
+> nlp = spacy.blank("en")
+> nlp.add_pipe("tagger")
+> nlp.add_pipe("entity_linker")
+> nlp.analyze_pipes()
+> ```
+
+<Accordion title="Example output" spaced>
+
+```
+============================= Pipeline Overview =============================
+
+#   Component       Assigns           Requires         Scores      Retokenizes
+-   -------------   ---------------   --------------   ---------   -----------
+0   tagger          token.tag                          tag_acc     False
+                                                       pos_acc
+                                                       lemma_acc
+
+1   entity_linker   token.ent_kb_id   doc.ents                     False
+                                      doc.sents
+                                      token.ent_iob
+                                      token.ent_type
+
+
+================================ Problems (4) ================================
+⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
+token.ent_iob, token.ent_type
+```
+
+</Accordion>
+
+| Name | Type | Description |
+| -------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | | |
+| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
+| `pretty` | bool | Pretty-print the results with colors and icons. Defaults to `True`. |
+| `no_print` | bool | Don't print anything and return a structured dict instead. Defaults to `False`. |
+| **RETURNS** | dict | The analysis data as a dict, if `no_print` is set to `True`. |
+
 ## Language.meta {#meta tag="property"}
 
 Custom meta data for the Language class. If a model is loaded, contains meta
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 486cef1be..deca96840 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -311,6 +311,61 @@ nlp.rename_pipe("ner", "entityrecognizer")
 nlp.replace_pipe("tagger", my_custom_tagger)
 ```
 
+### Analyzing pipeline components {#analysis new="3"}
+
+The [`nlp.analyze_pipes`](/api/language#analyze_pipes) method analyzes the
+components in the current pipeline and outputs information about them, like the
+attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
+they retokenize the `Doc` and which scores they produce during training. It will
+also show warnings if components require values that aren't set by a previous
+component – for instance, if the entity linker is used but no component that
+runs before it sets named entities.
+
+```python
+nlp = spacy.blank("en")
+nlp.add_pipe("tagger")
+nlp.add_pipe("entity_linker")  # this is a problem, because it needs entities
+nlp.analyze_pipes()
+```
+
+```
+### Example output
+============================= Pipeline Overview =============================
+
+#   Component       Assigns           Requires         Scores      Retokenizes
+-   -------------   ---------------   --------------   ---------   -----------
+0   tagger          token.tag                          tag_acc     False
+                                                       pos_acc
+                                                       lemma_acc
+
+1   entity_linker   token.ent_kb_id   doc.ents                     False
+                                      doc.sents
+                                      token.ent_iob
+                                      token.ent_type
+
+
+================================ Problems (4) ================================
+⚠ 'entity_linker' requirements not met: doc.ents, doc.sents,
+token.ent_iob, token.ent_type
+```
+
+If you prefer a structured dictionary containing the component information and
+the problems, you can set `no_print=True`. This will return the data instead of
+printing it.
+
+```
+result = nlp.analyze_pipes(no_print=True)
+```
+
+<Infobox variant="warning" title="Important note">
+
+The pipeline analysis is static and does **not actually run the components**.
+This means that it relies on the information provided by the components
+themselves. If a custom component declares that it assigns an attribute but it
+doesn't, the pipeline analysis won't catch that.
+
+</Infobox>
+
 ## Creating custom pipeline components {#custom-components}
 
 A pipeline component is a function that receives a `Doc` object, modifies it and

From b40f44419b03010d7eb14d255f9bfc99c3cad637 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 1 Aug 2020 13:40:06 +0200
Subject: [PATCH 4/4] Simplify pipe analysis

- remove unused code
- don't print by default
- integrate attrs info into analysis output
---
 spacy/errors.py                            |   2 -
 spacy/language.py                          |  15 +-
 spacy/pipe_analysis.py                     | 159 +++++++--------------
 spacy/tests/pipeline/test_analysis.py      |  33 +----
 website/docs/api/language.md               |  72 +++++++---
 website/docs/usage/processing-pipelines.md |  60 ++++++--
 6 files changed, 171 insertions(+), 170 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 3fe53d6db..124572b0b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -63,8 +63,6 @@ class Warnings:
             "have the spacy-lookups-data package installed.")
     W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
             "the Knowledge Base.")
-    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
-            "previous components in the pipeline declare that they assign it.")
     W026 = ("Unable to set all sentence boundaries from dependency parses.")
     W027 = ("Found a large training file of {size} bytes. Note that it may "
             "be more efficient to split your training data into multiple "
diff --git a/spacy/language.py b/spacy/language.py
index 6230913b4..d1b180cef 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -18,7 +18,7 @@ from timeit import default_timer as timer
 
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
-from .pipe_analysis import validate_attrs, print_summary
+from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
 from .gold import Example
 from .scorer import Scorer
 from .util import create_default_optimizer, registry
@@ -524,19 +524,20 @@ class Language:
         self,
         *,
         keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
-        pretty: bool = True,
-        no_print: bool = False,
-    ) -> Optional[Dict[str, Any]]:
-        """Analyze the current pipeline components, print a summary of what
-        they assign or require and check that all requirements are met.
+        pretty: bool = False,
+    ) -> Dict[str, Any]:
+        """Analyze the current pipeline components, check that all
+        requirements are met and return the structured analysis data.
 
         keys (List[str]): The meta values to display in the table. Corresponds
             to values in FactoryMeta, defined by the @Language.factory decorator.
-        pretty (bool): Pretty-print the results with colors and icons.
-        no_print (bool): Don't print anything and return a structured dict instead.
-        RETURNS (dict): The data, if no_print is set to True.
+        pretty (bool): Also print a formatted summary of the results.
+        RETURNS (dict): The analysis data.
         """
-        return print_summary(self, keys=keys, pretty=pretty, no_print=no_print)
+        analysis = analyze_pipes(self, keys=keys)
+        if pretty:
+            print_pipe_analysis(analysis, keys=keys)
+        return analysis
 
     def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
         """Get a pipeline component for a given component name.
diff --git a/spacy/pipe_analysis.py b/spacy/pipe_analysis.py index 71f99daef..008ac3384 100644 --- a/spacy/pipe_analysis.py +++ b/spacy/pipe_analysis.py @@ -1,9 +1,8 @@ from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING -from wasabi import Printer -import warnings +from wasabi import msg from .tokens import Doc, Token, Span -from .errors import Errors, Warnings +from .errors import Errors from .util import dot_to_dict if TYPE_CHECKING: @@ -11,35 +10,7 @@ if TYPE_CHECKING: from .language import Language # noqa: F401 -def analyze_pipes( - nlp: "Language", name: str, index: int, warn: bool = True -) -> List[str]: - """Analyze a pipeline component with respect to its position in the current - pipeline and the other components. Will check whether requirements are - fulfilled (e.g. if previous components assign the attributes). - - nlp (Language): The current nlp object. - name (str): The name of the pipeline component to analyze. - index (int): The index of the component in the pipeline. - warn (bool): Show user warning if problem is found. - RETURNS (List[str]): The problems found for the given pipeline component. - """ - assert nlp.pipeline[index][0] == name - prev_pipes = nlp.pipeline[:index] - meta = nlp.get_pipe_meta(name) - requires = {annot: False for annot in meta.requires} - if requires: - for prev_name, prev_pipe in prev_pipes: - prev_meta = nlp.get_pipe_meta(prev_name) - for annot in prev_meta.assigns: - requires[annot] = True - problems = [] - for annot, fulfilled in requires.items(): - if not fulfilled: - problems.append(annot) - if warn: - warnings.warn(Warnings.W025.format(name=name, attr=annot)) - return problems +DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"] def validate_attrs(values: Iterable[str]) -> Iterable[str]: @@ -88,97 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]: return values -def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]: - assert feature in ["assigns", "requires"] - result = [] +def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]: + """Check which components in the pipeline assign or require an attribute. + + nlp (Language): The current nlp object. + attr (str): The attribute, e.g. "doc.tensor". + RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires", + mapped to a list of component names. + """ + result = {"assigns": [], "requires": []} for pipe_name in nlp.pipe_names: meta = nlp.get_pipe_meta(pipe_name) - pipe_assigns = getattr(meta, feature, []) - if attr in pipe_assigns: - result.append(pipe_name) + if attr in meta.assigns: + result["assigns"].append(pipe_name) + if attr in meta.requires: + result["requires"].append(pipe_name) return result -def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]: - """Get all pipeline components that assign an attr, e.g. "doc.tensor". - - pipeline (Language): The current nlp object. - attr (str): The attribute to check. - RETURNS (List[str]): Names of components that require the attr. - """ - return _get_feature_for_attr(nlp, attr, "assigns") - - -def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]: - """Get all pipeline components that require an attr, e.g. "doc.tensor". - - pipeline (Language): The current nlp object. - attr (str): The attribute to check. - RETURNS (List[str]): Names of components that require the attr. 
- """ - return _get_feature_for_attr(nlp, attr, "requires") - - -def print_summary( - nlp: "Language", - *, - keys: List[str] = ["requires", "assigns", "scores", "retokenizes"], - pretty: bool = True, - no_print: bool = False, -) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: +def analyze_pipes( + nlp: "Language", *, keys: List[str] = DEFAULT_KEYS, +) -> Dict[str, Union[List[str], Dict[str, List[str]]]]: """Print a formatted summary for the current nlp object's pipeline. Shows a table with the pipeline components and why they assign and require, as well as any problems if available. nlp (Language): The nlp object. keys (List[str]): The meta keys to show in the table. - pretty (bool): Pretty-print the results (color etc). - no_print (bool): Don't print anything, just return the data. - RETURNS (dict): A dict with "overview" and "problems". + RETURNS (dict): A dict with "summary" and "problems". """ - msg = Printer(pretty=pretty, no_print=no_print) - overview = {} - problems = {} + result = {"summary": {}, "problems": {}} + all_attrs = set() for i, name in enumerate(nlp.pipe_names): meta = nlp.get_pipe_meta(name) - overview[name] = {"i": i, "name": name} - for key in keys: - overview[name][key] = getattr(meta, key, None) - problems[name] = analyze_pipes(nlp, name, i, warn=False) + all_attrs.update(meta.assigns) + all_attrs.update(meta.requires) + result["summary"][name] = {key: getattr(meta, key, None) for key in keys} + prev_pipes = nlp.pipeline[:i] + requires = {annot: False for annot in meta.requires} + if requires: + for prev_name, prev_pipe in prev_pipes: + prev_meta = nlp.get_pipe_meta(prev_name) + for annot in prev_meta.assigns: + requires[annot] = True + result["problems"][name] = [] + for annot, fulfilled in requires.items(): + if not fulfilled: + result["problems"][name].append(annot) + result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs} + return result + + +def print_pipe_analysis( + analysis: Dict[str, Union[List[str], Dict[str, List[str]]]], + *, + keys: List[str] = DEFAULT_KEYS, +) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]: + """Print a formatted version of the pipe analysis produced by analyze_pipes. + + analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis. + keys (List[str]): The meta keys to show in the table. + """ msg.divider("Pipeline Overview") header = ["#", "Component", *[key.capitalize() for key in keys]] - body = [[info for info in entry.values()] for entry in overview.values()] + summary = analysis["summary"].items() + body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)] msg.table(body, header=header, divider=True, multiline=True) - n_problems = sum(len(p) for p in problems.values()) - if any(p for p in problems.values()): + n_problems = sum(len(p) for p in analysis["problems"].values()) + if any(p for p in analysis["problems"].values()): msg.divider(f"Problems ({n_problems})") - for name, problem in problems.items(): + for name, problem in analysis["problems"].items(): if problem: msg.warn(f"'{name}' requirements not met: {', '.join(problem)}") else: msg.good("No problems found.") - if no_print: - return {"overview": overview, "problems": problems} - - -def count_pipeline_interdependencies(nlp: "Language") -> List[int]: - """Count how many subsequent components require an annotation set by each - component in the pipeline. - - nlp (Language): The current nlp object. - RETURNS (List[int]): The interdependency counts. 
- """ - pipe_assigns = [] - pipe_requires = [] - for name in nlp.pipe_names: - meta = nlp.get_pipe_meta(name) - pipe_assigns.append(set(meta.assigns)) - pipe_requires.append(set(meta.requires)) - counts = [] - for i, assigns in enumerate(pipe_assigns): - count = 0 - for requires in pipe_requires[i + 1 :]: - if assigns.intersection(requires): - count += 1 - counts.append(count) - return counts diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 80987c838..df3d7dff5 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,6 +1,5 @@ from spacy.language import Language -from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies +from spacy.pipe_analysis import get_attr_info, validate_attrs from mock import Mock import pytest @@ -29,10 +28,10 @@ def test_component_decorator_assigns(): nlp = Language() nlp.add_pipe("c1") nlp.add_pipe("c2") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert problems["c2"] == ["token.pos"] nlp.add_pipe("c3") - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"] + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"] nlp.add_pipe("c1", name="c4") test_component4_meta = nlp.get_pipe_meta("c1") assert test_component4_meta.factory == "c1" @@ -40,8 +39,8 @@ def test_component_decorator_assigns(): assert not Language.has_factory("c4") assert nlp.pipe_factories["c1"] == "c1" assert nlp.pipe_factories["c4"] == "c1" - assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"] - assert get_requires_for_attr(nlp, "token.pos") == ["c2"] + assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"] + assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"] assert nlp("hello world") @@ -108,26 +107,8 @@ def test_analysis_validate_attrs_remove_pipe(): nlp = Language() nlp.add_pipe("pipe_analysis_c6") nlp.add_pipe("pipe_analysis_c7") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert problems["pipe_analysis_c7"] == ["token.pos"] nlp.remove_pipe("pipe_analysis_c7") - problems = nlp.analyze_pipes(no_print=True)["problems"] + problems = nlp.analyze_pipes()["problems"] assert all(p == [] for p in problems.values()) - - -def test_pipe_interdependencies(): - prefix = "test_pipe_interdependencies" - - @Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",)) - def fancifier(doc): - return doc - - @Language.component(f"{prefix}.needer", requires=("doc._.fancy",)) - def needer(doc): - return doc - - nlp = Language() - nlp.add_pipe(f"{prefix}.fancifier") - nlp.add_pipe(f"{prefix}.needer") - counts = count_pipeline_interdependencies(nlp) - assert counts == [1, 0] diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 608442122..ba62d0b13 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -98,10 +98,10 @@ decorator. For more details and examples, see the | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `name` | str | The name of the component factory. 
|
 | _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
 
@@ -146,10 +146,10 @@ examples, see the
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
 | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
-| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
+| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
 
@@ -622,12 +622,45 @@ doesn't, the pipeline analysis won't catch that.
> nlp = spacy.blank("en") > nlp.add_pipe("tagger") > nlp.add_pipe("entity_linker") -> nlp.analyze_pipes() +> analysis = nlp.analyze_pipes() > ``` +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} ``` + +``` +### Pretty ============================= Pipeline Overview ============================= # Component Assigns Requires Scores Retokenizes @@ -649,13 +682,12 @@ token.ent_iob, token.ent_type -| Name | Type | Description | -| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | | -| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | -| `pretty` | bool | Pretty-print the results with colors and icons. Defaults to `True`. | -| `no_print` | bool | Don't print anything and return a structured dict instead. Defaults to `False`. | -| **RETURNS** | dict | Optional dict, if `no_print` is set to `True`. | +| Name | Type | Description | +| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | +| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. | +| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). | ## Language.meta {#meta tag="property"} @@ -892,8 +924,8 @@ instance and factory instance. | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory` | str | The name of the registered component factory. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | -| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. 
| -| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.  | -| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.  | -| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | +| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | +| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).  | +| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis).  | +| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index deca96840..6388529f6 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -319,17 +319,61 @@ attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether they retokenize the `Doc` and which scores they produce during training. It will also show warnings if components require values that aren't set by previous component – for instance, if the entity linker is used but no component that -runs before it sets named entities. +runs before it sets named entities. Setting `pretty=True` will pretty-print a +table instead of only returning the structured data. + +> #### ✏️ Things to try +> +> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker. +> The analysis should now show no problems, because requirements are met. 
```python +### {executable="true"} +import spacy + nlp = spacy.blank("en") nlp.add_pipe("tagger") -nlp.add_pipe("entity_linker") # this is a problem, because it needs entities -nlp.analyze_pipes() +# This is a problem because it needs entities and sentence boundaries +nlp.add_pipe("entity_linker") +analysis = nlp.analyze_pipes(pretty=True) +``` + + + +```json +### Structured +{ + "summary": { + "tagger": { + "assigns": ["token.tag"], + "requires": [], + "scores": ["tag_acc", "pos_acc", "lemma_acc"], + "retokenizes": false + }, + "entity_linker": { + "assigns": ["token.ent_kb_id"], + "requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], + "scores": [], + "retokenizes": false + } + }, + "problems": { + "tagger": [], + "entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"] + }, + "attrs": { + "token.ent_iob": { "assigns": [], "requires": ["entity_linker"] }, + "doc.ents": { "assigns": [], "requires": ["entity_linker"] }, + "token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] }, + "doc.sents": { "assigns": [], "requires": ["entity_linker"] }, + "token.tag": { "assigns": ["tagger"], "requires": [] }, + "token.ent_type": { "assigns": [], "requires": ["entity_linker"] } + } +} ``` ``` -### Example output +### Pretty ============================= Pipeline Overview ============================= # Component Assigns Requires Scores Retokenizes @@ -349,13 +393,7 @@ nlp.analyze_pipes() token.ent_iob, token.ent_type ``` -If you prefer a structured dictionary containing the component information and -the problems, you can set `no_print=True`. This will return the data instead of -printing it. - -``` -result = nlp.analyze_pipes(no_print=True) -``` +
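
---

As a usage sketch of the API this series ends up with (illustrative only, not part of the diffs above; it assumes spaCy at the state of PATCH 4/4, with the built-in `tagger` and `entity_linker` factories available, as in the docs examples):

```python
import spacy

# Build a deliberately misordered pipeline: the entity linker requires
# entities and sentence boundaries, but nothing before it assigns them.
nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")

# After PATCH 4/4, analyze_pipes() returns the structured data by default
# and only prints the summary table when pretty=True.
analysis = nlp.analyze_pipes(pretty=True)

# "problems" maps each component to the attributes it requires that no
# earlier component declares it assigns.
print(analysis["problems"]["entity_linker"])
# ['doc.ents', 'doc.sents', 'token.ent_iob', 'token.ent_type']

# "attrs" gives the inverse view: for each attribute, which components
# assign it and which require it.
print(analysis["attrs"]["token.tag"])
# {'assigns': ['tagger'], 'requires': []}
```

Note that the analysis is static: it only inspects the `assigns`/`requires` declarations from the component meta, so the check is cheap enough to run after every `add_pipe` call if desired.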