From b278f31ee684e5d402a1891a0445a9c7c1c1f644 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 12 Aug 2021 12:50:03 +0200 Subject: [PATCH] Document scorers in registry and components from #8766 (#8929) * Document scorers in registry and components from #8766 * Update spacy/pipeline/lemmatizer.py Co-authored-by: Sofie Van Landeghem * Update website/docs/api/dependencyparser.md Co-authored-by: Sofie Van Landeghem * Reformat Co-authored-by: Sofie Van Landeghem --- spacy/pipeline/attributeruler.py | 8 +++++--- spacy/pipeline/dep_parser.pyx | 1 + spacy/pipeline/entity_linker.py | 3 +++ spacy/pipeline/entityruler.py | 2 ++ spacy/pipeline/lemmatizer.py | 2 ++ spacy/pipeline/morphologizer.pyx | 3 +++ spacy/pipeline/ner.pyx | 2 ++ spacy/pipeline/sentencizer.pyx | 3 ++- spacy/pipeline/senter.pyx | 2 ++ spacy/pipeline/spancat.py | 16 +++++++++++++++ spacy/pipeline/tagger.pyx | 2 ++ spacy/pipeline/textcat.py | 3 +++ spacy/pipeline/transition_parser.pyx | 1 + website/docs/api/attributeruler.md | 28 +++++++------------------- website/docs/api/dependencyparser.md | 16 +-------------- website/docs/api/entitylinker.md | 16 +-------------- website/docs/api/entityrecognizer.md | 16 +-------------- website/docs/api/entityruler.md | 1 + website/docs/api/lemmatizer.md | 12 ++++++----- website/docs/api/morphologizer.md | 12 ++++++----- website/docs/api/pipe.md | 10 +++++---- website/docs/api/scorer.md | 20 ++++++++++++++---- website/docs/api/sentencerecognizer.md | 27 +++++++------------------ website/docs/api/sentencizer.md | 26 ++++++------------------ website/docs/api/spancategorizer.md | 17 +--------------- website/docs/api/tagger.md | 27 +++++++------------------ website/docs/api/textcategorizer.md | 15 +++++++------- website/docs/api/top-level.md | 1 + 28 files changed, 121 insertions(+), 171 deletions(-) diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 733a65199..b1a2f3e9c 100644 --- a/spacy/pipeline/attributeruler.py +++ 
b/spacy/pipeline/attributeruler.py @@ -36,9 +36,7 @@ def make_attribute_ruler( return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) -def attribute_ruler_score( - examples: Iterable[Example], **kwargs -) -> Dict[str, Any]: +def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: def morph_key_getter(token, attr): return getattr(token, attr).key @@ -84,6 +82,10 @@ class AttributeRuler(Pipe): vocab (Vocab): The vocab. name (str): The pipe name. Defaults to "attribute_ruler". + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attributes "tag", "pos", "morph" and + "lemma" and Scorer.score_token_attr_per_feat for the attribute + "morph". RETURNS (AttributeRuler): The AttributeRuler component. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 59364326b..50c57ee5b 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -102,6 +102,7 @@ def make_parser( primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. + scorer (Optional[Callable]): The scoring method. """ return DependencyParser( nlp.vocab, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 919d1fe6d..a17eed13c 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -83,6 +83,7 @@ def make_entity_linker( entity_vector_length (int): Size of encoding vectors in the KB. get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. """ return EntityLinker( nlp.vocab, @@ -142,6 +143,8 @@ class EntityLinker(TrainablePipe): entity_vector_length (int): Size of encoding vectors in the KB. 
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_links. DOCS: https://spacy.io/api/entitylinker#init """ diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d2a0c5045..ad67a7a1f 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -106,6 +106,8 @@ class EntityRuler(Pipe): overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. + scorer (Optional[Callable]): The scoring method. Defaults to + spacy.scorer.get_ner_prf. DOCS: https://spacy.io/api/entityruler#init """ diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 1bf513661..74f502d80 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -90,6 +90,8 @@ class Lemmatizer(Pipe): mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". overwrite (bool): Whether to overwrite existing lemmas. Defaults to `False`. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attribute "lemma". DOCS: https://spacy.io/api/lemmatizer#init """ diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index c5293e860..f476f02c4 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -95,6 +95,9 @@ class Morphologizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attributes "pos" and "morph" and + Scorer.score_token_attr_per_feat for the attribute "morph". 
DOCS: https://spacy.io/api/morphologizer#init """ diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 857e3c088..4835a8c4b 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -82,6 +82,7 @@ def make_ner( incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. """ return EntityRecognizer( nlp.vocab, @@ -158,6 +159,7 @@ def make_beam_ner( and are faster to compute. incorrect_spans_key (Optional[str]): Optional key into span groups of entities known to be non-entities. + scorer (Optional[Callable]): The scoring method. """ return EntityRecognizer( nlp.vocab, diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index c535c7e48..5e64246ad 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -55,7 +55,8 @@ class Sentencizer(Pipe): punct_chars (list): Punctuation characters to split on. Will be serialized with the nlp object. - RETURNS (Sentencizer): The sentencizer component. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". DOCS: https://spacy.io/api/sentencizer#init """ diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 3eeb9b5da..b1fb2ec37 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -69,6 +69,8 @@ class SentenceRecognizer(Tagger): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the attribute "sents". 
DOCS: https://spacy.io/api/sentencerecognizer#init """ diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index a143ac4eb..6bc93d693 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -181,6 +181,22 @@ class SpanCategorizer(TrainablePipe): scorer: Optional[Callable] = spancat_score, ) -> None: """Initialize the span categorizer. + vocab (Vocab): The shared vocabulary. + model (thinc.api.Model): The Thinc Model powering the pipeline component. + name (str): The component instance name, used to add entries to the + losses during training. + spans_key (str): Key of the Doc.spans dict to save the spans under. + During initialization and training, the component will look for + spans on the reference document under the same key. Defaults to + `"spans"`. + threshold (float): Minimum probability to consider a prediction + positive. Spans with a positive prediction will be saved on the Doc. + Defaults to 0.5. + max_positive (Optional[int]): Maximum number of labels to consider + positive per span. Defaults to None, indicating no limit. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. DOCS: https://spacy.io/api/spancategorizer#init """ diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 327a18f25..16d16b497 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -78,6 +78,8 @@ class Tagger(TrainablePipe): model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_token_attr for the attribute "tag". 
DOCS: https://spacy.io/api/tagger#init """ diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 5ede18424..6956a919d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -104,6 +104,7 @@ def make_textcat( model (Model[List[Doc], List[Floats2d]]): A model instance that predicts scores for each category. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. """ return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) @@ -144,6 +145,8 @@ class TextCategorizer(TrainablePipe): name (str): The component instance name, used to add entries to the losses during training. threshold (float): Cutoff to consider a prediction "positive". + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_cats for the attribute "cats". DOCS: https://spacy.io/api/textcategorizer#init """ diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index fa2b28aa5..2571af102 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -87,6 +87,7 @@ cdef class Parser(TrainablePipe): incorrect_spans_key (Optional[str]): Identifies spans that are known to be incorrect entity annotations. The incorrect entity annotations can be stored in the span group, under this key. + scorer (Optional[Callable]): The scoring method. Defaults to None. """ self.vocab = vocab self.name = name diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index a253ca9f8..965bffbcc 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -48,12 +48,13 @@ Initialize the attribute ruler. 
> ruler = nlp.add_pipe("attribute_ruler") > ``` -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | -| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | -| _keyword-only_ | | -| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | +| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | +| _keyword-only_ | | +| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ | ## AttributeRuler.\_\_call\_\_ {#call tag="method"} @@ -175,21 +176,6 @@ Load attribute ruler patterns from morph rules. 
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | -## AttributeRuler.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = ruler.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ | - ## AttributeRuler.to_disk {#to_disk tag="method"} Serialize the pipe to disk. diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index fa02a6f99..3d326a41b 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -91,6 +91,7 @@ shortcut for this and instantiate the component using its string name and | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. 
~~bool~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ## DependencyParser.\_\_call\_\_ {#call tag="method"} @@ -259,21 +260,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## DependencyParser.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = parser.score(examples) -> ``` - -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## DependencyParser.create_optimizer {#create_optimizer tag="method"} Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 2994d934b..764f63a1a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -50,6 +50,7 @@ architectures and their arguments and hyperparameters. | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. 
Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -259,21 +260,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | -## EntityLinker.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = entity_linker.score(examples) -> ``` - -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 601b644c1..68c048428 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -48,6 +48,7 @@ architectures and their arguments and hyperparameters. | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. 
The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/ner.pyx @@ -251,21 +252,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. ~~StateClass~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## EntityRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = ner.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 93b5da45a..63b4d1823 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -40,6 +40,7 @@ how the component should be configured. You can override its settings via the | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. 
Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | +| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entityruler.py diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 279821e71..b67673599 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -47,11 +47,13 @@ data format used by the lookup and rule-based lemmatizers, see > nlp.add_pipe("lemmatizer", config=config) > ``` -| Setting | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | -| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | -| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| Setting | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | +| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | +| `model` | **Not yet implemented:** the model to use. ~~Model~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ | Many languages specify a default lemmatizer mode other than `lookup` if a better lemmatizer is available. 
The lemmatizer modes `rule` and `pos_lookup` require diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index d2dd28ac2..d2a927f4b 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -61,11 +61,13 @@ shortcut for this and instantiate the component using its string name and > morphologizer = Morphologizer(nlp.vocab, model) > ``` -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. 
~~Optional[Callable]~~ | ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index 2f856c667..263942e3e 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -297,10 +297,12 @@ Score a batch of examples. > scores = pipe.score(examples) > ``` -| Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `\*\*kwargs` | Any additional settings to pass on to the scorer. ~~Any~~ | +| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | ## TrainablePipe.create_optimizer {#create_optimizer tag="method"} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index ad908f204..da7da5f82 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,9 +27,13 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Description | -| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. 
~~Language~~ | +| Name | Description | +| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ | +| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ | +| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[str]~~ | +| _keyword-only_ | | +| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ | ## Scorer.score {#score tag="method"} @@ -80,7 +84,7 @@ Docs with `has_unknown_spaces` are skipped during scoring. > ``` | Name | Description | -| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| ----------- | ------------------------------------------------------------------------------------------------------------------- | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | @@ -253,3 +257,11 @@ entities that overlap between the gold reference and the predictions. | _keyword-only_ | | | `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). 
~~Iterable[str]~~ | | **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ | + +## get_ner_prf {#get_ner_prf new="3"} + +Compute micro-PRF and per-entity PRF scores. + +| Name | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index e82a4bef6..d6d82c259 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -60,11 +60,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | -------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. 
~~Optional[Callable]~~ | ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} @@ -238,21 +240,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SentenceRecognizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = senter.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ | - ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 75a253fc0..4570e8746 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -28,7 +28,7 @@ how the component should be configured. You can override its settings via the > ``` | Setting | Description | -| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | ```python @@ -50,10 +50,11 @@ Initialize the sentencizer. 
> sentencizer = Sentencizer() > ``` -| Name | Description | -| -------------- | ----------------------------------------------------------------------------------------------------------------------- | -| _keyword-only_ | | -| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| Name | Description | +| -------------- | ---------------------------------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | +| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ | ```python ### punct_chars defaults @@ -112,21 +113,6 @@ applied to the `Doc` in order. | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ | -## Sentencizer.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = sentencizer.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]~~ | - ## Sentencizer.to_disk {#to_disk tag="method"} Save the sentencizer settings (punctuation characters) to a directory. 
Will diff --git a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md index 57395846d..8748b23a2 100644 --- a/website/docs/api/spancategorizer.md +++ b/website/docs/api/spancategorizer.md @@ -43,6 +43,7 @@ architectures and their arguments and hyperparameters. | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/spancat.py @@ -241,22 +242,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## SpanCategorizer.score {#score tag="method"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = spancat.score(examples) -> ``` - -| Name | Description | -| -------------- | ---------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | - ## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. 
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 3002aff7b..c37483ca4 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -54,11 +54,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Description | -| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. 
~~Optional[Callable]~~ | ## Tagger.\_\_call\_\_ {#call tag="method"} @@ -249,21 +251,6 @@ predicted scores. | `scores` | Scores representing the model's predictions. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | -## Tagger.score {#score tag="method" new="3"} - -Score a batch of examples. - -> #### Example -> -> ```python -> scores = tagger.score(examples) -> ``` - -| Name | Description | -| ----------- | --------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ | - ## Tagger.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 923da0048..4b1348fa4 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -96,13 +96,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| -------------- | -------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. 
~~float~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | +| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ | ## TextCategorizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 8190d9f78..be45add72 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -373,6 +373,7 @@ factories. | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | +| `scorers` | Registry for functions that create scoring methods for use with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `**kwargs` and return scores as `Dict[str, Any]`. | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | ### spacy-transformers registry {#registry-transformers}