Document scorers in registry and components from #8766 (#8929)

* Document scorers in registry and components from #8766

* Update spacy/pipeline/lemmatizer.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update website/docs/api/dependencyparser.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Reformat

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Adriane Boyd 2021-08-12 12:50:03 +02:00 committed by GitHub
parent 944ad6b1d4
commit b278f31ee6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
28 changed files with 121 additions and 171 deletions

View File

@ -36,9 +36,7 @@ def make_attribute_ruler(
return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer) return AttributeRuler(nlp.vocab, name, validate=validate, scorer=scorer)
def attribute_ruler_score( def attribute_ruler_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
examples: Iterable[Example], **kwargs
) -> Dict[str, Any]:
def morph_key_getter(token, attr): def morph_key_getter(token, attr):
return getattr(token, attr).key return getattr(token, attr).key
@ -84,6 +82,10 @@ class AttributeRuler(Pipe):
vocab (Vocab): The vocab. vocab (Vocab): The vocab.
name (str): The pipe name. Defaults to "attribute_ruler". name (str): The pipe name. Defaults to "attribute_ruler".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "tag", "pos", "morph" and
"lemma" and Scorer.score_token_attr_per_feat for the attribute
"morph".
RETURNS (AttributeRuler): The AttributeRuler component. RETURNS (AttributeRuler): The AttributeRuler component.

View File

@ -102,6 +102,7 @@ def make_parser(
primarily affects the label accuracy, it can also affect the attachment primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity structure, as the labels are used to represent the pseudo-projectivity
transformation. transformation.
scorer (Optional[Callable]): The scoring method.
""" """
return DependencyParser( return DependencyParser(
nlp.vocab, nlp.vocab,

View File

@ -83,6 +83,7 @@ def make_entity_linker(
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityLinker( return EntityLinker(
nlp.vocab, nlp.vocab,
@ -142,6 +143,8 @@ class EntityLinker(TrainablePipe):
entity_vector_length (int): Size of encoding vectors in the KB. entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention. produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init DOCS: https://spacy.io/api/entitylinker#init
""" """

View File

@ -106,6 +106,8 @@ class EntityRuler(Pipe):
overwrite_ents (bool): If existing entities are present, e.g. entities overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary. added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs. ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
spacy.scorer.get_ner_prf.
DOCS: https://spacy.io/api/entityruler#init DOCS: https://spacy.io/api/entityruler#init
""" """

View File

@ -90,6 +90,8 @@ class Lemmatizer(Pipe):
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
overwrite (bool): Whether to overwrite existing lemmas. Defaults to overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`. `False`.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "lemma".
DOCS: https://spacy.io/api/lemmatizer#init DOCS: https://spacy.io/api/lemmatizer#init
""" """

View File

@ -95,6 +95,9 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attributes "pos" and "morph" and
Scorer.score_token_attr_per_feat for the attribute "morph".
DOCS: https://spacy.io/api/morphologizer#init DOCS: https://spacy.io/api/morphologizer#init
""" """

View File

@ -82,6 +82,7 @@ def make_ner(
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,
@ -158,6 +159,7 @@ def make_beam_ner(
and are faster to compute. and are faster to compute.
incorrect_spans_key (Optional[str]): Optional key into span groups of incorrect_spans_key (Optional[str]): Optional key into span groups of
entities known to be non-entities. entities known to be non-entities.
scorer (Optional[Callable]): The scoring method.
""" """
return EntityRecognizer( return EntityRecognizer(
nlp.vocab, nlp.vocab,

View File

@ -55,7 +55,8 @@ class Sentencizer(Pipe):
punct_chars (list): Punctuation characters to split on. Will be punct_chars (list): Punctuation characters to split on. Will be
serialized with the nlp object. serialized with the nlp object.
RETURNS (Sentencizer): The sentencizer component. scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencizer#init DOCS: https://spacy.io/api/sentencizer#init
""" """

View File

@ -69,6 +69,8 @@ class SentenceRecognizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the attribute "sents".
DOCS: https://spacy.io/api/sentencerecognizer#init DOCS: https://spacy.io/api/sentencerecognizer#init
""" """

View File

@ -181,6 +181,22 @@ class SpanCategorizer(TrainablePipe):
scorer: Optional[Callable] = spancat_score, scorer: Optional[Callable] = spancat_score,
) -> None: ) -> None:
"""Initialize the span categorizer. """Initialize the span categorizer.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
spans_key (str): Key of the Doc.spans dict to save the spans under.
During initialization and training, the component will look for
spans on the reference document under the same key. Defaults to
`"spans"`.
threshold (float): Minimum probability to consider a prediction
positive. Spans with a positive prediction will be saved on the Doc.
Defaults to 0.5.
max_positive (Optional[int]): Maximum number of labels to consider
positive per span. Defaults to None, indicating no limit.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
spans allowed.
DOCS: https://spacy.io/api/spancategorizer#init DOCS: https://spacy.io/api/spancategorizer#init
""" """

View File

@ -78,6 +78,8 @@ class Tagger(TrainablePipe):
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_token_attr for the attribute "tag".
DOCS: https://spacy.io/api/tagger#init DOCS: https://spacy.io/api/tagger#init
""" """

View File

@ -104,6 +104,7 @@ def make_textcat(
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category. scores for each category.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method.
""" """
return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer) return TextCategorizer(nlp.vocab, model, name, threshold=threshold, scorer=scorer)
@ -144,6 +145,8 @@ class TextCategorizer(TrainablePipe):
name (str): The component instance name, used to add entries to the name (str): The component instance name, used to add entries to the
losses during training. losses during training.
threshold (float): Cutoff to consider a prediction "positive". threshold (float): Cutoff to consider a prediction "positive".
scorer (Optional[Callable]): The scoring method. Defaults to
Scorer.score_cats for the attribute "cats".
DOCS: https://spacy.io/api/textcategorizer#init DOCS: https://spacy.io/api/textcategorizer#init
""" """

View File

@ -87,6 +87,7 @@ cdef class Parser(TrainablePipe):
incorrect_spans_key (Optional[str]): Identifies spans that are known incorrect_spans_key (Optional[str]): Identifies spans that are known
to be incorrect entity annotations. The incorrect entity annotations to be incorrect entity annotations. The incorrect entity annotations
can be stored in the span group, under this key. can be stored in the span group, under this key.
scorer (Optional[Callable]): The scoring method. Defaults to None.
""" """
self.vocab = vocab self.vocab = vocab
self.name = name self.name = name

View File

@ -49,11 +49,12 @@ Initialize the attribute ruler.
> ``` > ```
| Name | Description | | Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ | | `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ | | `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag`", `"pos"`, `"morph"` and `"lemma"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
## AttributeRuler.\_\_call\_\_ {#call tag="method"} ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
@ -175,21 +176,6 @@ Load attribute ruler patterns from morph rules.
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
## AttributeRuler.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = ruler.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"tag"`, `"pos"`, `"morph"` and `"lemma"` if present in any of the target token attributes. ~~Dict[str, float]~~ |
## AttributeRuler.to_disk {#to_disk tag="method"} ## AttributeRuler.to_disk {#to_disk tag="method"}
Serialize the pipe to disk. Serialize the pipe to disk.

View File

@ -91,6 +91,7 @@ shortcut for this and instantiate the component using its string name and
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_deps`](/api/scorer#score_deps) for the attribute `"dep"` ignoring the labels `p` and `punct` and [`Scorer.score_spans`](/api/scorer/#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
## DependencyParser.\_\_call\_\_ {#call tag="method"} ## DependencyParser.\_\_call\_\_ {#call tag="method"}
@ -259,21 +260,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ | | `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## DependencyParser.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = parser.score(examples)
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## DependencyParser.create_optimizer {#create_optimizer tag="method"} ## DependencyParser.create_optimizer {#create_optimizer tag="method"}
Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline

View File

@ -50,6 +50,7 @@ architectures and their arguments and hyperparameters.
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py %%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -259,21 +260,6 @@ pipe's entity linking model and context encoder. Delegates to
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityLinker.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = entity_linker.score(examples)
> ```
| Name | Description |
| ----------- | ---------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ |
## EntityLinker.create_optimizer {#create_optimizer tag="method"} ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -48,6 +48,7 @@ architectures and their arguments and hyperparameters.
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
| `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ | | `incorrect_spans_key` | This key refers to a `SpanGroup` in `doc.spans` that specifies incorrect spans. The NER wiill learn not to predict (exactly) those spans. Defaults to `None`. ~~Optional[str]~~ |
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/ner.pyx %%GITHUB_SPACY/spacy/pipeline/ner.pyx
@ -251,21 +252,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. ~~StateClass~~ | | `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EntityRecognizer.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = ner.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -40,6 +40,7 @@ how the component should be configured. You can override its settings via the
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `scorer` | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/entityruler.py %%GITHUB_SPACY/spacy/pipeline/entityruler.py

View File

@ -48,10 +48,12 @@ data format used by the lookup and rule-based lemmatizers, see
> ``` > ```
| Setting | Description | | Setting | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ | | `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `lookup` if no language-specific lemmatizer is available (see the following table). ~~str~~ |
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ | | `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
| `model` | **Not yet implemented:** the model to use. ~~Model~~ | | `model` | **Not yet implemented:** the model to use. ~~Model~~ |
| _keyword-only_ | |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"lemma"`. ~~Optional[Callable]~~ |
Many languages specify a default lemmatizer mode other than `lookup` if a better Many languages specify a default lemmatizer mode other than `lookup` if a better
lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require

View File

@ -62,10 +62,12 @@ shortcut for this and instantiate the component using its string name and
> ``` > ```
| Name | Description | | Name | Description |
| ------- | -------------------------------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"` and `"morph"` and [`Scorer.score_token_attr_per_feat`](/api/scorer#score_token_attr_per_feat) for the attribute `"morph"`. ~~Optional[Callable]~~ |
## Morphologizer.\_\_call\_\_ {#call tag="method"} ## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@ -298,8 +298,10 @@ Score a batch of examples.
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| _keyword-only_ |
| `\*\*kwargs` | Any additional settings to pass on to the scorer. ~~Any~~ |
| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ | | **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## TrainablePipe.create_optimizer {#create_optimizer tag="method"} ## TrainablePipe.create_optimizer {#create_optimizer tag="method"}

View File

@ -28,8 +28,12 @@ Create a new `Scorer`.
> ``` > ```
| Name | Description | | Name | Description |
| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ | | `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline is constructed using the `default_lang` and `default_pipeline` settings. ~~Optional[Language]~~ |
| `default_lang` | The language to use for a default pipeline if `nlp` is not provided. Defaults to `xx`. ~~str~~ |
| `default_pipeline` | The pipeline components to use for a default pipeline if `nlp` is not provided. Defaults to `("senter", "tagger", "morphologizer", "parser", "ner", "textcat")`. ~~Iterable[string]~~ |
| _keyword-only_ | |
| `\*\*kwargs` | Any additional settings to pass on to the individual scoring methods. ~~Any~~ |
## Scorer.score {#score tag="method"} ## Scorer.score {#score tag="method"}
@ -80,7 +84,7 @@ Docs with `has_unknown_spaces` are skipped during scoring.
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ | | **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]]~~ |
@ -253,3 +257,11 @@ entities that overlap between the gold reference and the predictions.
| _keyword-only_ | | | _keyword-only_ | |
| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ | | `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ |
| **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ | | **RETURNS** | A dictionary containing the scores. ~~Dict[str, Optional[float]]~~ |
## get_ner_prf {#get_ner_prf new="3"}
Compute micro-PRF and per-entity PRF scores.
| Name | Description |
| ---------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |

View File

@ -61,10 +61,12 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description | | Name | Description |
| ------- | -------------------------------------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Optional[Callable]~~ |
## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"}
@ -238,21 +240,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## SentenceRecognizer.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = senter.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ |
## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -28,7 +28,7 @@ how the component should be configured. You can override its settings via the
> ``` > ```
| Setting | Description | | Setting | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------ | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ | `None` |
```python ```python
@ -51,9 +51,10 @@ Initialize the sentencizer.
> ``` > ```
| Name | Description | | Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | _keyword-only_ | |
| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"` ~~Optional[Callable]~~ |
```python ```python
### punct_chars defaults ### punct_chars defaults
@ -112,21 +113,6 @@ applied to the `Doc` in order.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Sentencizer.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = sentencizer.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]~~ |
## Sentencizer.to_disk {#to_disk tag="method"} ## Sentencizer.to_disk {#to_disk tag="method"}
Save the sentencizer settings (punctuation characters) to a directory. Will Save the sentencizer settings (punctuation characters) to a directory. Will

View File

@ -43,6 +43,7 @@ architectures and their arguments and hyperparameters.
| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ | | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"spans"`. ~~str~~ |
| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | | `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | | `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
```python ```python
%%GITHUB_SPACY/spacy/pipeline/spancat.py %%GITHUB_SPACY/spacy/pipeline/spancat.py
@ -241,22 +242,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## SpanCategorizer.score {#score tag="method"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = spancat.score(examples)
> ```
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| _keyword-only_ | |
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## SpanCategorizer.create_optimizer {#create_optimizer tag="method"} ## SpanCategorizer.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -55,10 +55,12 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description | | Name | Description |
| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `scorer` | The scoring method. Defaults to [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Optional[Callable]~~ |
## Tagger.\_\_call\_\_ {#call tag="method"} ## Tagger.\_\_call\_\_ {#call tag="method"}
@ -249,21 +251,6 @@ predicted scores.
| `scores` | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## Tagger.score {#score tag="method" new="3"}
Score a batch of examples.
> #### Example
>
> ```python
> scores = tagger.score(examples)
> ```
| Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attribute `"tag"`. ~~Dict[str, float]~~ |
## Tagger.create_optimizer {#create_optimizer tag="method"} ## Tagger.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component. Create an optimizer for the pipeline component.

View File

@ -97,12 +97,13 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe). [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Description | | Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------- | | -------------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ | | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~ |
| `scorer` | The scoring method. Defaults to [`Scorer.score_cats`](/api/scorer#score_cats) for the attribute `"cats"`. ~~Optional[Callable]~~ |
## TextCategorizer.\_\_call\_\_ {#call tag="method"} ## TextCategorizer.\_\_call\_\_ {#call tag="method"}

View File

@ -373,6 +373,7 @@ factories.
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). | | `readers` | Registry for file and data readers, including training and evaluation data readers like [`Corpus`](/api/corpus). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `scorers` | Registry for functions that create scoring methods for user with the [`Scorer`](/api/scorer). Scoring methods are called with `Iterable[Example]` and arbitrary `\*\*kwargs` and return scores as `Dict[str, Any]`. |
| `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. | | `tokenizers` | Registry for tokenizer factories. Registered functions should return a callback that receives the `nlp` object and returns a [`Tokenizer`](/api/tokenizer) or a custom callable. |
### spacy-transformers registry {#registry-transformers} ### spacy-transformers registry {#registry-transformers}