2023-09-12 09:50:01 +03:00
|
|
|
# cython: infer_types=True, binding=True
|
2023-06-26 12:41:03 +03:00
|
|
|
from itertools import islice
|
|
|
|
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
|
|
|
|
|
2020-07-22 14:42:59 +03:00
|
|
|
import numpy
|
2024-01-16 16:54:26 +03:00
|
|
|
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
from thinc.types import Floats2d, Ints1d
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2023-06-26 12:41:03 +03:00
|
|
|
from ..tokens.doc cimport Doc
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2023-06-26 12:41:03 +03:00
|
|
|
from .. import util
|
2023-07-19 13:03:31 +03:00
|
|
|
from ..errors import Errors
|
2020-07-22 14:42:59 +03:00
|
|
|
from ..language import Language
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
from ..scorer import Scorer
|
2020-10-08 22:33:49 +03:00
|
|
|
from ..training import validate_examples, validate_get_examples
|
2021-08-10 16:13:39 +03:00
|
|
|
from ..util import registry
|
2023-06-26 12:41:03 +03:00
|
|
|
from .trainable_pipe import TrainablePipe
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
|
|
|
|
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
|
|
|
|
2020-07-22 14:42:59 +03:00
|
|
|
default_model_config = """
|
|
|
|
[model]
|
2022-03-15 16:15:31 +03:00
|
|
|
@architectures = "spacy.Tagger.v2"
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
[model.tok2vec]
|
2021-04-22 11:04:15 +03:00
|
|
|
@architectures = "spacy.HashEmbedCNN.v2"
|
2020-07-22 14:42:59 +03:00
|
|
|
pretrained_vectors = null
|
|
|
|
width = 96
|
|
|
|
depth = 4
|
|
|
|
embed_size = 2000
|
|
|
|
window_size = 1
|
|
|
|
maxout_pieces = 3
|
|
|
|
subword_features = true
|
|
|
|
"""
|
|
|
|
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|
|
|
|
|
|
|
|
|
|
|
@Language.factory(
|
|
|
|
"tagger",
|
|
|
|
assigns=["token.tag"],
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
default_config={
|
|
|
|
"model": DEFAULT_TAGGER_MODEL,
|
|
|
|
"overwrite": False,
|
|
|
|
"scorer": {"@scorers": "spacy.tagger_scorer.v1"},
|
|
|
|
"neg_prefix": "!",
|
2023-06-12 16:57:10 +03:00
|
|
|
"label_smoothing": 0.0,
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
"save_activations": False,
|
|
|
|
},
|
2020-07-27 13:27:40 +03:00
|
|
|
default_score_weights={"tag_acc": 1.0},
|
2020-07-22 14:42:59 +03:00
|
|
|
)
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
def make_tagger(
|
|
|
|
nlp: Language,
|
|
|
|
name: str,
|
|
|
|
model: Model,
|
|
|
|
overwrite: bool,
|
|
|
|
scorer: Optional[Callable],
|
2021-12-06 20:04:44 +03:00
|
|
|
neg_prefix: str,
|
2023-03-22 14:17:56 +03:00
|
|
|
label_smoothing: float,
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
save_activations: bool,
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
):
|
2020-08-09 16:09:31 +03:00
|
|
|
"""Construct a part-of-speech tagger component.
|
|
|
|
|
|
|
|
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
|
|
|
|
the tag probabilities. The output vectors should match the number of tags
|
|
|
|
in size, and be normalized as probabilities (all scores between 0 and 1,
|
|
|
|
with the rows summing to 1).
|
|
|
|
"""
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix,
|
2023-06-12 16:57:10 +03:00
|
|
|
label_smoothing=label_smoothing, save_activations=save_activations)
|
2021-08-10 16:13:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
def tagger_score(examples, **kwargs):
|
|
|
|
return Scorer.score_token_attr(examples, "tag", **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
@registry.scorers("spacy.tagger_scorer.v1")
|
|
|
|
def make_tagger_scorer():
|
|
|
|
return tagger_score
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
|
2020-10-08 22:33:49 +03:00
|
|
|
class Tagger(TrainablePipe):
|
2020-07-22 14:42:59 +03:00
|
|
|
"""Pipeline component for part-of-speech tagging.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger
|
2020-07-22 14:42:59 +03:00
|
|
|
"""
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
vocab,
|
|
|
|
model,
|
|
|
|
name="tagger",
|
|
|
|
*,
|
2023-02-02 16:13:38 +03:00
|
|
|
overwrite=False,
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
scorer=tagger_score,
|
2021-12-06 20:04:44 +03:00
|
|
|
neg_prefix="!",
|
2023-03-22 14:17:56 +03:00
|
|
|
label_smoothing=0.0,
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
save_activations: bool = False,
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Initialize a part-of-speech tagger.
|
|
|
|
|
|
|
|
vocab (Vocab): The shared vocabulary.
|
|
|
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
|
|
|
name (str): The component instance name, used to add entries to the
|
|
|
|
losses during training.
|
2023-02-02 16:13:38 +03:00
|
|
|
overwrite (bool): Whether to overwrite existing annotations.
|
2021-08-12 13:50:03 +03:00
|
|
|
scorer (Optional[Callable]): The scoring method. Defaults to
|
|
|
|
Scorer.score_token_attr for the attribute "tag".
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
save_activations (bool): save model activations in Doc when annotating.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#init
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-07-22 14:42:59 +03:00
|
|
|
self.vocab = vocab
|
|
|
|
self.model = model
|
|
|
|
self.name = name
|
|
|
|
self._rehearsal_model = None
|
2023-06-12 16:57:10 +03:00
|
|
|
cfg = {
|
|
|
|
"labels": [],
|
|
|
|
"overwrite": overwrite,
|
|
|
|
"neg_prefix": neg_prefix,
|
|
|
|
"label_smoothing": label_smoothing
|
|
|
|
}
|
2020-07-22 14:42:59 +03:00
|
|
|
self.cfg = dict(sorted(cfg.items()))
|
2021-08-10 16:13:39 +03:00
|
|
|
self.scorer = scorer
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
self.save_activations = save_activations
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
@property
|
|
|
|
def labels(self):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""The labels currently added to the component. Note that even for a
|
|
|
|
blank component, this will always include the built-in coarse-grained
|
|
|
|
part-of-speech tags by default.
|
|
|
|
|
|
|
|
RETURNS (Tuple[str]): The labels.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#labels
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-08-07 16:27:13 +03:00
|
|
|
return tuple(self.cfg["labels"])
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2020-09-29 17:22:13 +03:00
|
|
|
@property
|
|
|
|
def label_data(self):
|
2020-09-29 19:30:38 +03:00
|
|
|
"""Data about the labels currently added to the component."""
|
2020-09-29 17:22:13 +03:00
|
|
|
return tuple(self.cfg["labels"])
|
|
|
|
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
def predict(self, docs) -> ActivationsT:
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
|
|
|
|
|
|
|
docs (Iterable[Doc]): The documents to predict.
|
|
|
|
RETURNS: The models prediction for each document.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#predict
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-07-22 14:42:59 +03:00
|
|
|
if not any(len(doc) for doc in docs):
|
|
|
|
# Handle cases where there are no tokens in any docs.
|
|
|
|
n_labels = len(self.labels)
|
|
|
|
guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs]
|
|
|
|
assert len(guesses) == len(docs)
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
return {"probabilities": guesses, "label_ids": guesses}
|
2020-07-22 14:42:59 +03:00
|
|
|
scores = self.model.predict(docs)
|
|
|
|
assert len(scores) == len(docs), (len(scores), len(docs))
|
|
|
|
guesses = self._scores2guesses(scores)
|
|
|
|
assert len(guesses) == len(docs)
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
return {"probabilities": scores, "label_ids": guesses}
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
def _scores2guesses(self, scores):
|
|
|
|
guesses = []
|
|
|
|
for doc_scores in scores:
|
|
|
|
doc_guesses = doc_scores.argmax(axis=1)
|
|
|
|
if not isinstance(doc_guesses, numpy.ndarray):
|
|
|
|
doc_guesses = doc_guesses.get()
|
|
|
|
guesses.append(doc_guesses)
|
|
|
|
return guesses
|
|
|
|
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Modify a batch of documents, using pre-computed scores.
|
|
|
|
|
|
|
|
docs (Iterable[Doc]): The documents to modify.
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#set_annotations
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
batch_tag_ids = activations["label_ids"]
|
2020-07-22 14:42:59 +03:00
|
|
|
if isinstance(docs, Doc):
|
|
|
|
docs = [docs]
|
|
|
|
cdef Doc doc
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
cdef bint overwrite = self.cfg["overwrite"]
|
2021-11-30 13:58:59 +03:00
|
|
|
labels = self.labels
|
2020-07-22 14:42:59 +03:00
|
|
|
for i, doc in enumerate(docs):
|
Store activations in `Doc`s when `save_activations` is enabled (#11002)
* Store activations in Doc when `store_activations` is enabled
This change adds the new `activations` attribute to `Doc`. This
attribute can be used by trainable pipes to store their activations,
probabilities, and guesses for downstream users.
As an example, this change modifies the `tagger` and `senter` pipes to
add an `store_activations` option. When this option is enabled, the
probabilities and guesses are stored in `set_annotations`.
* Change type of `store_activations` to `Union[bool, List[str]]`
When the value is:
- A bool: all activations are stored when set to `True`.
- A List[str]: the activations named in the list are stored
* Formatting fixes in Tagger
* Support store_activations in spancat and morphologizer
* Make Doc.activations type visible to MyPy
* textcat/textcat_multilabel: add store_activations option
* trainable_lemmatizer/entity_linker: add store_activations option
* parser/ner: do not currently support returning activations
* Extend tagger and senter tests
So that they, like the other tests, also check that we get no
activations if no activations were requested.
* Document `Doc.activations` and `store_activations` in the relevant pipes
* Start errors/warnings at higher numbers to avoid merge conflicts
Between the master and v4 branches.
* Add `store_activations` to docstrings.
* Replace store_activations setter by set_store_activations method
Setters that take a different type than what the getter returns are still
problematic for MyPy. Replace the setter by a method, so that type inference
works everywhere.
* Use dict comprehension suggested by @svlandeg
* Revert "Use dict comprehension suggested by @svlandeg"
This reverts commit 6e7b958f7060397965176c69649e5414f1f24988.
* EntityLinker: add type annotations to _add_activations
* _store_activations: make kwarg-only, remove doc_scores_lens arg
* set_annotations: add type annotations
* Apply suggestions from code review
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* TextCat.predict: return dict
* Make the `TrainablePipe.store_activations` property a bool
This means that we can also bring back `store_activations` setter.
* Remove `TrainablePipe.activations`
We do not need to enumerate the activations anymore since `store_activations` is
`bool`.
* Add type annotations for activations in predict/set_annotations
* Rename `TrainablePipe.store_activations` to `save_activations`
* Error E1400 is not used anymore
This error was used when activations were still `Union[bool, List[str]]`.
* Change wording in API docs after store -> save change
* docs: tag (save_)activations as new in spaCy 4.0
* Fix copied line in morphologizer activations test
* Don't train in any test_save_activations test
* Rename activations
- "probs" -> "probabilities"
- "guesses" -> "label_ids", except in the edit tree lemmatizer, where
"guesses" -> "tree_ids".
* Remove unused W400 warning.
This warning was used when we still allowed the user to specify
which activations to save.
* Formatting fixes
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Replace "kb_ids" by a constant
* spancat: replace a cast by an assertion
* Fix EOF spacing
* Fix comments in test_save_activations tests
* Do not set RNG seed in activation saving tests
* Revert "spancat: replace a cast by an assertion"
This reverts commit 0bd5730d16432443a2b247316928d4f789ad8741.
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2022-09-13 10:51:12 +03:00
|
|
|
if self.save_activations:
|
|
|
|
doc.activations[self.name] = {}
|
|
|
|
for act_name, acts in activations.items():
|
|
|
|
doc.activations[self.name][act_name] = acts[i]
|
2020-07-22 14:42:59 +03:00
|
|
|
doc_tag_ids = batch_tag_ids[i]
|
|
|
|
if hasattr(doc_tag_ids, "get"):
|
|
|
|
doc_tag_ids = doc_tag_ids.get()
|
|
|
|
for j, tag_id in enumerate(doc_tag_ids):
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
if doc.c[j].tag == 0 or overwrite:
|
2021-11-30 13:58:59 +03:00
|
|
|
doc.c[j].tag = self.vocab.strings[labels[tag_id]]
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2021-01-20 03:49:25 +03:00
|
|
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Learn from a batch of documents and gold-standard information,
|
2021-01-25 17:18:45 +03:00
|
|
|
updating the pipe's model. Delegates to predict and get_loss.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
|
|
|
examples (Iterable[Example]): A batch of Example objects.
|
|
|
|
drop (float): The dropout rate.
|
|
|
|
sgd (thinc.api.Optimizer): The optimizer.
|
|
|
|
losses (Dict[str, float]): Optional record of the loss during training.
|
|
|
|
Updated using the component name as the key.
|
|
|
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#update
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-07-22 14:42:59 +03:00
|
|
|
if losses is None:
|
|
|
|
losses = {}
|
|
|
|
losses.setdefault(self.name, 0.0)
|
2020-08-12 00:29:31 +03:00
|
|
|
validate_examples(examples, "Tagger.update")
|
|
|
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
|
|
|
# Handle cases where there are no tokens in any docs.
|
2020-10-14 16:00:49 +03:00
|
|
|
return losses
|
2020-07-22 14:42:59 +03:00
|
|
|
set_dropout_rate(self.model, drop)
|
2020-08-12 00:29:31 +03:00
|
|
|
tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
|
2020-07-22 14:42:59 +03:00
|
|
|
for sc in tag_scores:
|
|
|
|
if self.model.ops.xp.isnan(sc.sum()):
|
2020-08-12 00:29:31 +03:00
|
|
|
raise ValueError(Errors.E940)
|
2020-07-22 14:42:59 +03:00
|
|
|
loss, d_tag_scores = self.get_loss(examples, tag_scores)
|
|
|
|
bp_tag_scores(d_tag_scores)
|
|
|
|
if sgd not in (None, False):
|
2020-10-05 17:23:33 +03:00
|
|
|
self.finish_update(sgd)
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
losses[self.name] += loss
|
|
|
|
return losses
|
|
|
|
|
2020-07-27 19:11:45 +03:00
|
|
|
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
|
|
|
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
|
|
|
teach the current model to make predictions similar to an initial model,
|
|
|
|
to try to address the "catastrophic forgetting" problem. This feature is
|
|
|
|
experimental.
|
|
|
|
|
|
|
|
examples (Iterable[Example]): A batch of Example objects.
|
|
|
|
drop (float): The dropout rate.
|
|
|
|
sgd (thinc.api.Optimizer): The optimizer.
|
|
|
|
losses (Dict[str, float]): Optional record of the loss during training.
|
|
|
|
Updated using the component name as the key.
|
|
|
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#rehearse
|
2020-07-22 14:42:59 +03:00
|
|
|
"""
|
2020-10-14 16:11:34 +03:00
|
|
|
if losses is None:
|
|
|
|
losses = {}
|
|
|
|
losses.setdefault(self.name, 0.0)
|
2020-08-12 00:29:31 +03:00
|
|
|
validate_examples(examples, "Tagger.rehearse")
|
|
|
|
docs = [eg.predicted for eg in examples]
|
2020-07-22 14:42:59 +03:00
|
|
|
if self._rehearsal_model is None:
|
2020-10-14 16:11:34 +03:00
|
|
|
return losses
|
2020-07-22 14:42:59 +03:00
|
|
|
if not any(len(doc) for doc in docs):
|
|
|
|
# Handle cases where there are no tokens in any docs.
|
2020-10-14 16:00:49 +03:00
|
|
|
return losses
|
2020-07-22 14:42:59 +03:00
|
|
|
set_dropout_rate(self.model, drop)
|
2022-02-23 18:10:05 +03:00
|
|
|
tag_scores, bp_tag_scores = self.model.begin_update(docs)
|
|
|
|
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
|
2023-01-16 12:25:53 +03:00
|
|
|
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
|
2022-02-23 18:10:05 +03:00
|
|
|
bp_tag_scores(grads)
|
2023-01-16 12:25:53 +03:00
|
|
|
if sgd is not None:
|
|
|
|
self.finish_update(sgd)
|
2022-02-23 18:10:05 +03:00
|
|
|
losses[self.name] += loss
|
2020-10-14 16:00:49 +03:00
|
|
|
return losses
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2023-01-16 12:25:53 +03:00
|
|
|
def get_teacher_student_loss(
|
|
|
|
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
|
|
|
|
) -> Tuple[float, List[Floats2d]]:
|
|
|
|
"""Calculate the loss and its gradient for a batch of student
|
|
|
|
scores, relative to teacher scores.
|
|
|
|
|
|
|
|
teacher_scores: Scores representing the teacher model's predictions.
|
|
|
|
student_scores: Scores representing the student model's predictions.
|
|
|
|
|
|
|
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
2023-07-19 17:37:31 +03:00
|
|
|
|
2023-01-16 12:25:53 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
|
|
|
|
"""
|
2024-01-16 16:54:26 +03:00
|
|
|
loss_func = SequenceCategoricalCrossentropy(normalize=False)
|
2023-01-16 12:25:53 +03:00
|
|
|
d_scores, loss = loss_func(student_scores, teacher_scores)
|
|
|
|
if self.model.ops.xp.isnan(loss):
|
|
|
|
raise ValueError(Errors.E910.format(name=self.name))
|
|
|
|
return float(loss), d_scores
|
|
|
|
|
2020-07-22 14:42:59 +03:00
|
|
|
def get_loss(self, examples, scores):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Find the loss and gradient of loss for the batch of documents and
|
|
|
|
their predicted scores.
|
|
|
|
|
|
|
|
examples (Iterable[Examples]): The batch of examples.
|
|
|
|
scores: Scores representing the model's predictions.
|
2020-10-05 15:58:56 +03:00
|
|
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#get_loss
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-08-12 00:29:31 +03:00
|
|
|
validate_examples(examples, "Tagger.get_loss")
|
2024-01-16 16:54:26 +03:00
|
|
|
loss_func = SequenceCategoricalCrossentropy(
|
2023-06-12 16:57:10 +03:00
|
|
|
names=self.labels,
|
|
|
|
normalize=False,
|
|
|
|
neg_prefix=self.cfg["neg_prefix"],
|
|
|
|
label_smoothing=self.cfg["label_smoothing"]
|
|
|
|
)
|
2021-01-10 03:30:37 +03:00
|
|
|
# Convert empty tag "" to missing value None so that both misaligned
|
|
|
|
# tokens and tokens with missing annotation have the default missing
|
|
|
|
# value None.
|
|
|
|
truths = []
|
|
|
|
for eg in examples:
|
|
|
|
eg_truths = [tag if tag is not "" else None for tag in eg.get_aligned("TAG", as_string=True)]
|
|
|
|
truths.append(eg_truths)
|
2020-07-22 14:42:59 +03:00
|
|
|
d_scores, loss = loss_func(scores, truths)
|
|
|
|
if self.model.ops.xp.isnan(loss):
|
2020-10-04 12:16:31 +03:00
|
|
|
raise ValueError(Errors.E910.format(name=self.name))
|
2020-07-22 14:42:59 +03:00
|
|
|
return float(loss), d_scores
|
|
|
|
|
2020-09-29 17:48:44 +03:00
|
|
|
def initialize(self, get_examples, *, nlp=None, labels=None):
|
2020-09-08 23:44:25 +03:00
|
|
|
"""Initialize the pipe for training, using a representative set
|
|
|
|
of data examples.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2020-09-08 23:44:25 +03:00
|
|
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
|
|
|
returns a representative sample of gold-standard Example objects..
|
2020-09-29 13:20:26 +03:00
|
|
|
nlp (Language): The current nlp object the component is part of.
|
2020-10-01 18:38:17 +03:00
|
|
|
labels: The labels to add to the component, typically generated by the
|
|
|
|
`init labels` command. If no labels are provided, the get_examples
|
|
|
|
callback is used to extract the labels from the data.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#initialize
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-10-08 22:33:49 +03:00
|
|
|
validate_get_examples(get_examples, "Tagger.initialize")
|
2021-03-19 12:45:16 +03:00
|
|
|
util.check_lexeme_norms(self.vocab, "tagger")
|
2020-09-29 17:48:44 +03:00
|
|
|
if labels is not None:
|
|
|
|
for tag in labels:
|
|
|
|
self.add_label(tag)
|
|
|
|
else:
|
|
|
|
tags = set()
|
|
|
|
for example in get_examples():
|
|
|
|
for token in example.y:
|
|
|
|
if token.tag_:
|
|
|
|
tags.add(token.tag_)
|
|
|
|
for tag in sorted(tags):
|
|
|
|
self.add_label(tag)
|
2020-08-27 04:21:03 +03:00
|
|
|
doc_sample = []
|
2020-09-08 23:44:25 +03:00
|
|
|
label_sample = []
|
|
|
|
for example in islice(get_examples(), 10):
|
|
|
|
doc_sample.append(example.x)
|
|
|
|
gold_tags = example.get_aligned("TAG", as_string=True)
|
|
|
|
gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
|
|
|
|
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
|
2020-12-18 13:51:47 +03:00
|
|
|
self._require_labels()
|
2020-09-08 23:44:25 +03:00
|
|
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
|
|
|
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
|
|
|
|
self.model.initialize(X=doc_sample, Y=label_sample)
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2020-08-07 16:27:13 +03:00
|
|
|
def add_label(self, label):
|
2020-07-27 19:11:45 +03:00
|
|
|
"""Add a new label to the pipe.
|
|
|
|
|
|
|
|
label (str): The label to add.
|
2020-07-28 14:37:31 +03:00
|
|
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
2020-07-27 19:11:45 +03:00
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger#add_label
|
2020-07-27 19:11:45 +03:00
|
|
|
"""
|
2020-07-22 14:42:59 +03:00
|
|
|
if not isinstance(label, str):
|
|
|
|
raise ValueError(Errors.E187)
|
|
|
|
if label in self.labels:
|
|
|
|
return 0
|
2020-09-08 23:44:25 +03:00
|
|
|
self._allow_extra_label()
|
2020-08-07 16:27:13 +03:00
|
|
|
self.cfg["labels"].append(label)
|
2020-10-10 19:55:07 +03:00
|
|
|
self.vocab.strings.add(label)
|
2020-07-22 14:42:59 +03:00
|
|
|
return 1
|