mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Simplify pipe analysis
- remove unused code - don't print by default - integrate attrs info into analysis output
This commit is contained in:
parent
98c6a85c8b
commit
b40f44419b
|
@ -63,8 +63,6 @@ class Warnings:
|
||||||
"have the spacy-lookups-data package installed.")
|
"have the spacy-lookups-data package installed.")
|
||||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||||
"the Knowledge Base.")
|
"the Knowledge Base.")
|
||||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
|
||||||
"previous components in the pipeline declare that they assign it.")
|
|
||||||
W026 = ("Unable to set all sentence boundaries from dependency parses.")
|
W026 = ("Unable to set all sentence boundaries from dependency parses.")
|
||||||
W027 = ("Found a large training file of {size} bytes. Note that it may "
|
W027 = ("Found a large training file of {size} bytes. Note that it may "
|
||||||
"be more efficient to split your training data into multiple "
|
"be more efficient to split your training data into multiple "
|
||||||
|
|
|
@ -18,7 +18,7 @@ from timeit import default_timer as timer
|
||||||
|
|
||||||
from .tokens.underscore import Underscore
|
from .tokens.underscore import Underscore
|
||||||
from .vocab import Vocab, create_vocab
|
from .vocab import Vocab, create_vocab
|
||||||
from .pipe_analysis import validate_attrs, print_summary
|
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||||
from .gold import Example
|
from .gold import Example
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from .util import create_default_optimizer, registry
|
from .util import create_default_optimizer, registry
|
||||||
|
@ -524,19 +524,20 @@ class Language:
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
|
keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
|
||||||
pretty: bool = True,
|
pretty: bool = False,
|
||||||
no_print: bool = False,
|
|
||||||
) -> Optional[Dict[str, Any]]:
|
) -> Optional[Dict[str, Any]]:
|
||||||
"""Analyze the current pipeline components, print a summary of what
|
"""Analyze the current pipeline components, print a summary of what
|
||||||
they assign or require and check that all requirements are met.
|
they assign or require and check that all requirements are met.
|
||||||
|
|
||||||
keys (List[str]): The meta values to display in the table. Corresponds
|
keys (List[str]): The meta values to display in the table. Corresponds
|
||||||
to values in FactoryMeta, defined by @Language.factory decorator.
|
to values in FactoryMeta, defined by @Language.factory decorator.
|
||||||
pretty (bool): Pretty-print the results with colors and icons.
|
pretty (bool): Pretty-print the results.
|
||||||
no_print (bool): Don't print anything and return structured dict instead.
|
RETURNS (dict): The data.
|
||||||
RETURNS (dict): The data, if no_print is set to True.
|
|
||||||
"""
|
"""
|
||||||
return print_summary(self, keys=keys, pretty=pretty, no_print=no_print)
|
analysis = analyze_pipes(self, keys=keys)
|
||||||
|
if pretty:
|
||||||
|
print_pipe_analysis(analysis, keys=keys)
|
||||||
|
return analysis
|
||||||
|
|
||||||
def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
|
def get_pipe(self, name: str) -> Callable[[Doc], Doc]:
|
||||||
"""Get a pipeline component for a given component name.
|
"""Get a pipeline component for a given component name.
|
||||||
|
|
|
@ -1,9 +1,8 @@
|
||||||
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
|
from typing import List, Dict, Iterable, Optional, Union, TYPE_CHECKING
|
||||||
from wasabi import Printer
|
from wasabi import msg
|
||||||
import warnings
|
|
||||||
|
|
||||||
from .tokens import Doc, Token, Span
|
from .tokens import Doc, Token, Span
|
||||||
from .errors import Errors, Warnings
|
from .errors import Errors
|
||||||
from .util import dot_to_dict
|
from .util import dot_to_dict
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -11,35 +10,7 @@ if TYPE_CHECKING:
|
||||||
from .language import Language # noqa: F401
|
from .language import Language # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def analyze_pipes(
|
DEFAULT_KEYS = ["requires", "assigns", "scores", "retokenizes"]
|
||||||
nlp: "Language", name: str, index: int, warn: bool = True
|
|
||||||
) -> List[str]:
|
|
||||||
"""Analyze a pipeline component with respect to its position in the current
|
|
||||||
pipeline and the other components. Will check whether requirements are
|
|
||||||
fulfilled (e.g. if previous components assign the attributes).
|
|
||||||
|
|
||||||
nlp (Language): The current nlp object.
|
|
||||||
name (str): The name of the pipeline component to analyze.
|
|
||||||
index (int): The index of the component in the pipeline.
|
|
||||||
warn (bool): Show user warning if problem is found.
|
|
||||||
RETURNS (List[str]): The problems found for the given pipeline component.
|
|
||||||
"""
|
|
||||||
assert nlp.pipeline[index][0] == name
|
|
||||||
prev_pipes = nlp.pipeline[:index]
|
|
||||||
meta = nlp.get_pipe_meta(name)
|
|
||||||
requires = {annot: False for annot in meta.requires}
|
|
||||||
if requires:
|
|
||||||
for prev_name, prev_pipe in prev_pipes:
|
|
||||||
prev_meta = nlp.get_pipe_meta(prev_name)
|
|
||||||
for annot in prev_meta.assigns:
|
|
||||||
requires[annot] = True
|
|
||||||
problems = []
|
|
||||||
for annot, fulfilled in requires.items():
|
|
||||||
if not fulfilled:
|
|
||||||
problems.append(annot)
|
|
||||||
if warn:
|
|
||||||
warnings.warn(Warnings.W025.format(name=name, attr=annot))
|
|
||||||
return problems
|
|
||||||
|
|
||||||
|
|
||||||
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
||||||
|
@ -88,97 +59,77 @@ def validate_attrs(values: Iterable[str]) -> Iterable[str]:
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
def _get_feature_for_attr(nlp: "Language", attr: str, feature: str) -> List[str]:
|
def get_attr_info(nlp: "Language", attr: str) -> Dict[str, List[str]]:
|
||||||
assert feature in ["assigns", "requires"]
|
"""Check which components in the pipeline assign or require an attribute.
|
||||||
result = []
|
|
||||||
|
nlp (Language): The current nlp object.
|
||||||
|
attr (str): The attribute, e.g. "doc.tensor".
|
||||||
|
RETURNS (Dict[str, List[str]]): A dict keyed by "assigns" and "requires",
|
||||||
|
mapped to a list of component names.
|
||||||
|
"""
|
||||||
|
result = {"assigns": [], "requires": []}
|
||||||
for pipe_name in nlp.pipe_names:
|
for pipe_name in nlp.pipe_names:
|
||||||
meta = nlp.get_pipe_meta(pipe_name)
|
meta = nlp.get_pipe_meta(pipe_name)
|
||||||
pipe_assigns = getattr(meta, feature, [])
|
if attr in meta.assigns:
|
||||||
if attr in pipe_assigns:
|
result["assigns"].append(pipe_name)
|
||||||
result.append(pipe_name)
|
if attr in meta.requires:
|
||||||
|
result["requires"].append(pipe_name)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_assigns_for_attr(nlp: "Language", attr: str) -> List[str]:
|
def analyze_pipes(
|
||||||
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
|
nlp: "Language", *, keys: List[str] = DEFAULT_KEYS,
|
||||||
|
) -> Dict[str, Union[List[str], Dict[str, List[str]]]]:
|
||||||
pipeline (Language): The current nlp object.
|
|
||||||
attr (str): The attribute to check.
|
|
||||||
RETURNS (List[str]): Names of components that require the attr.
|
|
||||||
"""
|
|
||||||
return _get_feature_for_attr(nlp, attr, "assigns")
|
|
||||||
|
|
||||||
|
|
||||||
def get_requires_for_attr(nlp: "Language", attr: str) -> List[str]:
|
|
||||||
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
|
|
||||||
|
|
||||||
pipeline (Language): The current nlp object.
|
|
||||||
attr (str): The attribute to check.
|
|
||||||
RETURNS (List[str]): Names of components that require the attr.
|
|
||||||
"""
|
|
||||||
return _get_feature_for_attr(nlp, attr, "requires")
|
|
||||||
|
|
||||||
|
|
||||||
def print_summary(
|
|
||||||
nlp: "Language",
|
|
||||||
*,
|
|
||||||
keys: List[str] = ["requires", "assigns", "scores", "retokenizes"],
|
|
||||||
pretty: bool = True,
|
|
||||||
no_print: bool = False,
|
|
||||||
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
|
|
||||||
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
"""Print a formatted summary for the current nlp object's pipeline. Shows
|
||||||
a table with the pipeline components and why they assign and require, as
|
a table with the pipeline components and why they assign and require, as
|
||||||
well as any problems if available.
|
well as any problems if available.
|
||||||
|
|
||||||
nlp (Language): The nlp object.
|
nlp (Language): The nlp object.
|
||||||
keys (List[str]): The meta keys to show in the table.
|
keys (List[str]): The meta keys to show in the table.
|
||||||
pretty (bool): Pretty-print the results (color etc).
|
RETURNS (dict): A dict with "summary" and "problems".
|
||||||
no_print (bool): Don't print anything, just return the data.
|
|
||||||
RETURNS (dict): A dict with "overview" and "problems".
|
|
||||||
"""
|
"""
|
||||||
msg = Printer(pretty=pretty, no_print=no_print)
|
result = {"summary": {}, "problems": {}}
|
||||||
overview = {}
|
all_attrs = set()
|
||||||
problems = {}
|
|
||||||
for i, name in enumerate(nlp.pipe_names):
|
for i, name in enumerate(nlp.pipe_names):
|
||||||
meta = nlp.get_pipe_meta(name)
|
meta = nlp.get_pipe_meta(name)
|
||||||
overview[name] = {"i": i, "name": name}
|
all_attrs.update(meta.assigns)
|
||||||
for key in keys:
|
all_attrs.update(meta.requires)
|
||||||
overview[name][key] = getattr(meta, key, None)
|
result["summary"][name] = {key: getattr(meta, key, None) for key in keys}
|
||||||
problems[name] = analyze_pipes(nlp, name, i, warn=False)
|
prev_pipes = nlp.pipeline[:i]
|
||||||
|
requires = {annot: False for annot in meta.requires}
|
||||||
|
if requires:
|
||||||
|
for prev_name, prev_pipe in prev_pipes:
|
||||||
|
prev_meta = nlp.get_pipe_meta(prev_name)
|
||||||
|
for annot in prev_meta.assigns:
|
||||||
|
requires[annot] = True
|
||||||
|
result["problems"][name] = []
|
||||||
|
for annot, fulfilled in requires.items():
|
||||||
|
if not fulfilled:
|
||||||
|
result["problems"][name].append(annot)
|
||||||
|
result["attrs"] = {attr: get_attr_info(nlp, attr) for attr in all_attrs}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def print_pipe_analysis(
|
||||||
|
analysis: Dict[str, Union[List[str], Dict[str, List[str]]]],
|
||||||
|
*,
|
||||||
|
keys: List[str] = DEFAULT_KEYS,
|
||||||
|
) -> Optional[Dict[str, Union[List[str], Dict[str, List[str]]]]]:
|
||||||
|
"""Print a formatted version of the pipe analysis produced by analyze_pipes.
|
||||||
|
|
||||||
|
analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
|
||||||
|
keys (List[str]): The meta keys to show in the table.
|
||||||
|
"""
|
||||||
msg.divider("Pipeline Overview")
|
msg.divider("Pipeline Overview")
|
||||||
header = ["#", "Component", *[key.capitalize() for key in keys]]
|
header = ["#", "Component", *[key.capitalize() for key in keys]]
|
||||||
body = [[info for info in entry.values()] for entry in overview.values()]
|
summary = analysis["summary"].items()
|
||||||
|
body = [[i, n, *[v for v in m.values()]] for i, (n, m) in enumerate(summary)]
|
||||||
msg.table(body, header=header, divider=True, multiline=True)
|
msg.table(body, header=header, divider=True, multiline=True)
|
||||||
n_problems = sum(len(p) for p in problems.values())
|
n_problems = sum(len(p) for p in analysis["problems"].values())
|
||||||
if any(p for p in problems.values()):
|
if any(p for p in analysis["problems"].values()):
|
||||||
msg.divider(f"Problems ({n_problems})")
|
msg.divider(f"Problems ({n_problems})")
|
||||||
for name, problem in problems.items():
|
for name, problem in analysis["problems"].items():
|
||||||
if problem:
|
if problem:
|
||||||
msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
|
msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
|
||||||
else:
|
else:
|
||||||
msg.good("No problems found.")
|
msg.good("No problems found.")
|
||||||
if no_print:
|
|
||||||
return {"overview": overview, "problems": problems}
|
|
||||||
|
|
||||||
|
|
||||||
def count_pipeline_interdependencies(nlp: "Language") -> List[int]:
|
|
||||||
"""Count how many subsequent components require an annotation set by each
|
|
||||||
component in the pipeline.
|
|
||||||
|
|
||||||
nlp (Language): The current nlp object.
|
|
||||||
RETURNS (List[int]): The interdependency counts.
|
|
||||||
"""
|
|
||||||
pipe_assigns = []
|
|
||||||
pipe_requires = []
|
|
||||||
for name in nlp.pipe_names:
|
|
||||||
meta = nlp.get_pipe_meta(name)
|
|
||||||
pipe_assigns.append(set(meta.assigns))
|
|
||||||
pipe_requires.append(set(meta.requires))
|
|
||||||
counts = []
|
|
||||||
for i, assigns in enumerate(pipe_assigns):
|
|
||||||
count = 0
|
|
||||||
for requires in pipe_requires[i + 1 :]:
|
|
||||||
if assigns.intersection(requires):
|
|
||||||
count += 1
|
|
||||||
counts.append(count)
|
|
||||||
return counts
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
|
from spacy.pipe_analysis import get_attr_info, validate_attrs
|
||||||
from spacy.pipe_analysis import validate_attrs, count_pipeline_interdependencies
|
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -29,10 +28,10 @@ def test_component_decorator_assigns():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("c1")
|
nlp.add_pipe("c1")
|
||||||
nlp.add_pipe("c2")
|
nlp.add_pipe("c2")
|
||||||
problems = nlp.analyze_pipes(no_print=True)["problems"]
|
problems = nlp.analyze_pipes()["problems"]
|
||||||
assert problems["c2"] == ["token.pos"]
|
assert problems["c2"] == ["token.pos"]
|
||||||
nlp.add_pipe("c3")
|
nlp.add_pipe("c3")
|
||||||
assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2"]
|
assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2"]
|
||||||
nlp.add_pipe("c1", name="c4")
|
nlp.add_pipe("c1", name="c4")
|
||||||
test_component4_meta = nlp.get_pipe_meta("c1")
|
test_component4_meta = nlp.get_pipe_meta("c1")
|
||||||
assert test_component4_meta.factory == "c1"
|
assert test_component4_meta.factory == "c1"
|
||||||
|
@ -40,8 +39,8 @@ def test_component_decorator_assigns():
|
||||||
assert not Language.has_factory("c4")
|
assert not Language.has_factory("c4")
|
||||||
assert nlp.pipe_factories["c1"] == "c1"
|
assert nlp.pipe_factories["c1"] == "c1"
|
||||||
assert nlp.pipe_factories["c4"] == "c1"
|
assert nlp.pipe_factories["c4"] == "c1"
|
||||||
assert get_assigns_for_attr(nlp, "doc.tensor") == ["c1", "c2", "c4"]
|
assert get_attr_info(nlp, "doc.tensor")["assigns"] == ["c1", "c2", "c4"]
|
||||||
assert get_requires_for_attr(nlp, "token.pos") == ["c2"]
|
assert get_attr_info(nlp, "token.pos")["requires"] == ["c2"]
|
||||||
assert nlp("hello world")
|
assert nlp("hello world")
|
||||||
|
|
||||||
|
|
||||||
|
@ -108,26 +107,8 @@ def test_analysis_validate_attrs_remove_pipe():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe("pipe_analysis_c6")
|
nlp.add_pipe("pipe_analysis_c6")
|
||||||
nlp.add_pipe("pipe_analysis_c7")
|
nlp.add_pipe("pipe_analysis_c7")
|
||||||
problems = nlp.analyze_pipes(no_print=True)["problems"]
|
problems = nlp.analyze_pipes()["problems"]
|
||||||
assert problems["pipe_analysis_c7"] == ["token.pos"]
|
assert problems["pipe_analysis_c7"] == ["token.pos"]
|
||||||
nlp.remove_pipe("pipe_analysis_c7")
|
nlp.remove_pipe("pipe_analysis_c7")
|
||||||
problems = nlp.analyze_pipes(no_print=True)["problems"]
|
problems = nlp.analyze_pipes()["problems"]
|
||||||
assert all(p == [] for p in problems.values())
|
assert all(p == [] for p in problems.values())
|
||||||
|
|
||||||
|
|
||||||
def test_pipe_interdependencies():
|
|
||||||
prefix = "test_pipe_interdependencies"
|
|
||||||
|
|
||||||
@Language.component(f"{prefix}.fancifier", assigns=("doc._.fancy",))
|
|
||||||
def fancifier(doc):
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@Language.component(f"{prefix}.needer", requires=("doc._.fancy",))
|
|
||||||
def needer(doc):
|
|
||||||
return doc
|
|
||||||
|
|
||||||
nlp = Language()
|
|
||||||
nlp.add_pipe(f"{prefix}.fancifier")
|
|
||||||
nlp.add_pipe(f"{prefix}.needer")
|
|
||||||
counts = count_pipeline_interdependencies(nlp)
|
|
||||||
assert counts == [1, 0]
|
|
||||||
|
|
|
@ -98,10 +98,10 @@ decorator. For more details and examples, see the
|
||||||
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `name` | str | The name of the component factory. |
|
| `name` | str | The name of the component factory. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
||||||
|
|
||||||
|
@ -146,10 +146,10 @@ examples, see the
|
||||||
| `name` | str | The name of the component factory. |
|
| `name` | str | The name of the component factory. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
||||||
|
|
||||||
|
@ -622,12 +622,45 @@ doesn't, the pipeline analysis won't catch that.
|
||||||
> nlp = spacy.blank("en")
|
> nlp = spacy.blank("en")
|
||||||
> nlp.add_pipe("tagger")
|
> nlp.add_pipe("tagger")
|
||||||
> nlp.add_pipe("entity_linker")
|
> nlp.add_pipe("entity_linker")
|
||||||
> nlp.analyze_pipes()
|
> analysis = nlp.analyze_pipes()
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
<Accordion title="Example output" spaced>
|
<Accordion title="Example output" spaced>
|
||||||
|
|
||||||
|
```json
|
||||||
|
### Structured
|
||||||
|
{
|
||||||
|
"summary": {
|
||||||
|
"tagger": {
|
||||||
|
"assigns": ["token.tag"],
|
||||||
|
"requires": [],
|
||||||
|
"scores": ["tag_acc", "pos_acc", "lemma_acc"],
|
||||||
|
"retokenizes": false
|
||||||
|
},
|
||||||
|
"entity_linker": {
|
||||||
|
"assigns": ["token.ent_kb_id"],
|
||||||
|
"requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||||
|
"scores": [],
|
||||||
|
"retokenizes": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"problems": {
|
||||||
|
"tagger": [],
|
||||||
|
"entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
|
||||||
|
},
|
||||||
|
"attrs": {
|
||||||
|
"token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"doc.ents": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
|
||||||
|
"doc.sents": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"token.tag": { "assigns": ["tagger"], "requires": [] },
|
||||||
|
"token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
### Pretty
|
||||||
============================= Pipeline Overview =============================
|
============================= Pipeline Overview =============================
|
||||||
|
|
||||||
# Component Assigns Requires Scores Retokenizes
|
# Component Assigns Requires Scores Retokenizes
|
||||||
|
@ -649,13 +682,12 @@ token.ent_iob, token.ent_type
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
|
| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
|
||||||
| `pretty` | bool | Pretty-print the results with colors and icons. Defaults to `True`. |
|
| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. |
|
||||||
| `no_print` | bool | Don't print anything and return a structured dict instead. Defaults to `False`. |
|
| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |
|
||||||
| **RETURNS** | dict | Optional dict, if `no_print` is set to `True`. |
|
|
||||||
|
|
||||||
## Language.meta {#meta tag="property"}
|
## Language.meta {#meta tag="property"}
|
||||||
|
|
||||||
|
@ -892,8 +924,8 @@ instance and factory instance.
|
||||||
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `factory` | str | The name of the registered component factory. |
|
| `factory` | str | The name of the registered component factory. |
|
||||||
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
|
|
|
@ -319,17 +319,61 @@ attributes they set on the [`Doc`](/api/doc) and [`Token`](/api/token), whether
|
||||||
they retokenize the `Doc` and which scores they produce during training. It will
|
they retokenize the `Doc` and which scores they produce during training. It will
|
||||||
also show warnings if components require values that aren't set by a previous
|
also show warnings if components require values that aren't set by a previous
|
||||||
component – for instance, if the entity linker is used but no component that
|
component – for instance, if the entity linker is used but no component that
|
||||||
runs before it sets named entities.
|
runs before it sets named entities. Setting `pretty=True` will pretty-print a
|
||||||
|
table instead of only returning the structured data.
|
||||||
|
|
||||||
|
> #### ✏️ Things to try
|
||||||
|
>
|
||||||
|
> 1. Add the components `"ner"` and `"sentencizer"` _before_ the entity linker.
|
||||||
|
> The analysis should now show no problems, because requirements are met.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
import spacy
|
||||||
|
|
||||||
nlp = spacy.blank("en")
|
nlp = spacy.blank("en")
|
||||||
nlp.add_pipe("tagger")
|
nlp.add_pipe("tagger")
|
||||||
nlp.add_pipe("entity_linker") # this is a problem, because it needs entities
|
# This is a problem because it needs entities and sentence boundaries
|
||||||
nlp.analyze_pipes()
|
nlp.add_pipe("entity_linker")
|
||||||
|
analysis = nlp.analyze_pipes(pretty=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
<Accordion title="Example output">
|
||||||
|
|
||||||
|
```json
|
||||||
|
### Structured
|
||||||
|
{
|
||||||
|
"summary": {
|
||||||
|
"tagger": {
|
||||||
|
"assigns": ["token.tag"],
|
||||||
|
"requires": [],
|
||||||
|
"scores": ["tag_acc", "pos_acc", "lemma_acc"],
|
||||||
|
"retokenizes": false
|
||||||
|
},
|
||||||
|
"entity_linker": {
|
||||||
|
"assigns": ["token.ent_kb_id"],
|
||||||
|
"requires": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
|
||||||
|
"scores": [],
|
||||||
|
"retokenizes": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"problems": {
|
||||||
|
"tagger": [],
|
||||||
|
"entity_linker": ["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"]
|
||||||
|
},
|
||||||
|
"attrs": {
|
||||||
|
"token.ent_iob": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"doc.ents": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"token.ent_kb_id": { "assigns": ["entity_linker"], "requires": [] },
|
||||||
|
"doc.sents": { "assigns": [], "requires": ["entity_linker"] },
|
||||||
|
"token.tag": { "assigns": ["tagger"], "requires": [] },
|
||||||
|
"token.ent_type": { "assigns": [], "requires": ["entity_linker"] }
|
||||||
|
}
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
### Example output
|
### Pretty
|
||||||
============================= Pipeline Overview =============================
|
============================= Pipeline Overview =============================
|
||||||
|
|
||||||
# Component Assigns Requires Scores Retokenizes
|
# Component Assigns Requires Scores Retokenizes
|
||||||
|
@ -349,13 +393,7 @@ nlp.analyze_pipes()
|
||||||
token.ent_iob, token.ent_type
|
token.ent_iob, token.ent_type
|
||||||
```
|
```
|
||||||
|
|
||||||
If you prefer a structured dictionary containing the component information and
|
</Accordion>
|
||||||
the problems, you can set `no_print=True`. This will return the data instead of
|
|
||||||
printing it.
|
|
||||||
|
|
||||||
```
|
|
||||||
result = nlp.analyze_pipes(no_print=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
<Infobox variant="warning" title="Important note">
|
<Infobox variant="warning" title="Important note">
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user