2020-07-22 14:42:59 +03:00
|
|
|
# cython: infer_types=True, profile=True, binding=True
|
2022-08-05 15:09:30 +03:00
|
|
|
from typing import Callable, Dict, Iterable, List, Optional, Union
|
2020-07-22 14:42:59 +03:00
|
|
|
import numpy
|
|
|
|
import srsly
|
|
|
|
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
|
2022-08-05 15:09:30 +03:00
|
|
|
from thinc.types import Floats2d, Ints1d
|
2020-07-22 14:42:59 +03:00
|
|
|
import warnings
|
2020-09-08 23:44:25 +03:00
|
|
|
from itertools import islice
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
from ..tokens.doc cimport Doc
|
|
|
|
from ..morphology cimport Morphology
|
|
|
|
from ..vocab cimport Vocab
|
|
|
|
|
2020-10-08 22:33:49 +03:00
|
|
|
from .trainable_pipe import TrainablePipe
|
|
|
|
from .pipe import deserialize_config
|
2020-07-22 14:42:59 +03:00
|
|
|
from ..language import Language
|
|
|
|
from ..attrs import POS, ID
|
|
|
|
from ..parts_of_speech import X
|
2020-10-04 12:16:31 +03:00
|
|
|
from ..errors import Errors, Warnings
|
Refactor the Scorer to improve flexibility (#5731)
* Refactor the Scorer to improve flexibility
Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.
* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
* with a provided pipeline containing components to be scored
* with a default pipeline containing the built-in statistical
components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline
Significant differences:
* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100
* Add kwargs to Morphologizer.evaluate
* Create generalized scoring methods in Scorer
* Generalized static scoring methods are added to `Scorer`
* Methods require an attribute (either on Token or Doc) that is
used to key the returned scores
Naming differences:
* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`
Scoring differences:
* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)
* Simplify / extend hasattr check for eval method
* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring
* Reset Example alignment if docs are set
Reset the Example alignment if either doc is set in case the
tokenization has changed.
* Add PRF tokenization scoring for tokens as spans
Add PRF scores for tokens as character spans. The scores are:
* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))
* Add docstring to Scorer.score_tokenization
* Rename component.evaluate() to component.score()
* Update Scorer API docs
* Update scoring for positive_label in textcat
* Fix TextCategorizer.score kwargs
* Update Language.evaluate docs
* Update score names in default config
2020-07-25 13:53:02 +03:00
|
|
|
from ..scorer import Scorer
|
2020-10-08 22:33:49 +03:00
|
|
|
from ..training import validate_examples, validate_get_examples
|
2021-08-10 16:13:39 +03:00
|
|
|
from ..util import registry
|
2020-07-22 14:42:59 +03:00
|
|
|
from .. import util
|
|
|
|
|
2022-08-30 11:07:33 +03:00
|
|
|
|
|
|
|
# Type of the dict returned by Tagger.predict and consumed by
# Tagger.set_annotations: activation name -> one array per doc.
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
|
|
|
|
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
# Default for the `overwrite` setting when a Tagger is constructed directly
# through Tagger.__init__ without an explicit value. See #9050.
BACKWARD_OVERWRITE = False
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
# Default model settings for the "tagger" factory, written as a config string
# that is parsed with thinc's Config below.
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Only the [model] section is kept; it is the factory's default "model" entry.
DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|
|
|
|
|
|
|
|
|
|
|
@Language.factory(
    "tagger",
    assigns=["token.tag"],
    default_config={
        "model": DEFAULT_TAGGER_MODEL,
        "overwrite": False,
        "scorer": {"@scorers": "spacy.tagger_scorer.v1"},
        "neg_prefix": "!",
        "save_activations": False,
    },
    default_score_weights={"tag_acc": 1.0},
)
def make_tagger(
    nlp: Language,
    name: str,
    model: Model,
    overwrite: bool,
    scorer: Optional[Callable],
    neg_prefix: str,
    save_activations: bool,
):
    """Factory for the "tagger" pipeline component.

    nlp (Language): The pipeline object; its vocab is shared with the tagger.
    name (str): The component instance name.
    model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
        the tag probabilities. The output vectors should match the number of
        tags in size, and be normalized as probabilities (all scores between
        0 and 1, with the rows summing to 1).
    overwrite (bool): Forwarded to Tagger as its `overwrite` setting.
    scorer (Optional[Callable]): Forwarded to Tagger as its scoring method.
    neg_prefix (str): Forwarded to Tagger as its `neg_prefix` setting.
    save_activations (bool): Forwarded to Tagger as `save_activations`.
    RETURNS (Tagger): The newly constructed component.
    """
    # All settings are keyword-only on Tagger.__init__, so collect them once.
    settings = {
        "overwrite": overwrite,
        "scorer": scorer,
        "neg_prefix": neg_prefix,
        "save_activations": save_activations,
    }
    return Tagger(nlp.vocab, model, name, **settings)
|
2021-08-10 16:13:39 +03:00
|
|
|
|
|
|
|
|
|
|
|
def tagger_score(examples, **kwargs):
    """Score the "tag" token attribute for a batch of examples.

    examples: The examples to score.
    **kwargs: Passed through unchanged to Scorer.score_token_attr.
    RETURNS: The result of Scorer.score_token_attr for the attribute "tag".
    """
    scored_attr = "tag"
    return Scorer.score_token_attr(examples, scored_attr, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
@registry.scorers("spacy.tagger_scorer.v1")
def make_tagger_scorer():
    """Return the tagger scoring function.

    Registered under "spacy.tagger_scorer.v1", which is the scorer named in
    the default config of the "tagger" factory.

    RETURNS (Callable): The tagger_score function.
    """
    return tagger_score
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
|
2020-10-08 22:33:49 +03:00
|
|
|
class Tagger(TrainablePipe):
|
2020-07-22 14:42:59 +03:00
|
|
|
"""Pipeline component for part-of-speech tagging.
|
|
|
|
|
2021-01-30 12:09:38 +03:00
|
|
|
DOCS: https://spacy.io/api/tagger
|
2020-07-22 14:42:59 +03:00
|
|
|
"""
|
Add overwrite settings for more components (#9050)
* Add overwrite settings for more components
For pipeline components where it's relevant and not already implemented,
add an explicit `overwrite` setting that controls whether
`set_annotations` overwrites existing annotation.
For the `morphologizer`, add an additional setting `extend`, which
controls whether the existing features are preserved.
* +overwrite, +extend: overwrite values of existing features, add any new
features
* +overwrite, -extend: overwrite completely, removing any existing
features
* -overwrite, +extend: keep values of existing features, add any new
features
* -overwrite, -extend: do not modify the existing value if set
In all cases an unset value will be set by `set_annotations`.
Preserve current overwrite defaults:
* True: morphologizer, entity linker
* False: tagger, sentencizer, senter
* Add backwards compat overwrite settings
* Put empty line back
Removed by accident in last commit
* Set backwards-compatible defaults in __init__
Because the `TrainablePipe` serialization methods update `cfg`, there's
no straightforward way to detect whether models serialized with a
previous version are missing the overwrite settings.
It would be possible in the sentencizer due to its separate
serialization methods, however to keep the changes parallel, this also
sets the default in `__init__`.
* Remove traces
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2021-09-30 16:35:55 +03:00
|
|
|
def __init__(
    self,
    vocab,
    model,
    name="tagger",
    *,
    overwrite=BACKWARD_OVERWRITE,
    scorer=tagger_score,
    neg_prefix="!",
    save_activations: bool = False,
):
    """Initialize a part-of-speech tagger.

    vocab (Vocab): The shared vocabulary.
    model (thinc.api.Model): The Thinc Model powering the pipeline component.
    name (str): The component instance name, used to add entries to the
        losses during training.
    overwrite (bool): Whether set_annotations replaces tags that are
        already set on a token.
    scorer (Optional[Callable]): The scoring method. Defaults to
        Scorer.score_token_attr for the attribute "tag".
    neg_prefix (str): Stored in the config and handed to the loss function
        in get_loss.
    save_activations (bool): save model activations in Doc when annotating.

    DOCS: https://spacy.io/api/tagger#init
    """
    self.vocab = vocab
    self.model = model
    self.name = name
    self._rehearsal_model = None
    self.scorer = scorer
    self.save_activations = save_activations
    # The config is stored with its keys in sorted order.
    settings = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
    self.cfg = {key: settings[key] for key in sorted(settings)}
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
@property
def labels(self):
    """The labels currently added to the component, in the order they are
    stored in the component config.

    RETURNS (Tuple[str]): The labels.

    DOCS: https://spacy.io/api/tagger#labels
    """
    stored = self.cfg["labels"]
    return tuple(stored)
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2020-09-29 17:22:13 +03:00
|
|
|
@property
def label_data(self):
    """Data about the labels currently added to the component.

    RETURNS (Tuple[str]): The stored labels, as a tuple.
    """
    stored = self.cfg["labels"]
    return tuple(stored)
|
|
|
|
|
2022-08-30 11:07:33 +03:00
|
|
|
def predict(self, docs) -> ActivationsT:
    """Apply the pipeline's model to a batch of docs, without modifying them.

    docs (Iterable[Doc]): The documents to predict.
    RETURNS (ActivationsT): A dict with the keys "probabilities" (the model
        output for each doc) and "label_ids" (the highest-scoring label
        index per row for each doc). If no doc contains any token, both keys
        hold the same list of empty (0, n_labels) arrays.

    DOCS: https://spacy.io/api/tagger#predict
    """
    has_tokens = any(len(doc) for doc in docs)
    if has_tokens:
        scores = self.model.predict(docs)
        assert len(scores) == len(docs), (len(scores), len(docs))
        label_ids = self._scores2guesses(scores)
        assert len(label_ids) == len(docs)
        return {"probabilities": scores, "label_ids": label_ids}
    # No tokens in any doc: skip the model and return one empty array per doc.
    width = len(self.labels)
    empty = [self.model.ops.alloc((0, width)) for _ in docs]
    assert len(empty) == len(docs)
    return {"probabilities": empty, "label_ids": empty}
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
def _scores2guesses(self, scores):
|
|
|
|
guesses = []
|
|
|
|
for doc_scores in scores:
|
|
|
|
doc_guesses = doc_scores.argmax(axis=1)
|
|
|
|
if not isinstance(doc_guesses, numpy.ndarray):
|
|
|
|
doc_guesses = doc_guesses.get()
|
|
|
|
guesses.append(doc_guesses)
|
|
|
|
return guesses
|
|
|
|
|
2022-08-30 11:07:33 +03:00
|
|
|
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT):
    """Modify a batch of documents, using pre-computed scores.

    Writes the predicted tag onto each token. A token whose tag is already
    set (non-zero) is only changed when the "overwrite" setting is enabled.
    If save_activations is enabled, every entry of `activations` for a doc
    is also stored in doc.activations under the component name.

    docs (Iterable[Doc]): The documents to modify.
    activations (ActivationsT): The activations used for setting annotations, produced by Tagger.predict.

    DOCS: https://spacy.io/api/tagger#set_annotations
    """
    # Only the label ids are needed to set the tags.
    batch_tag_ids = activations["label_ids"]
    # Accept a single Doc as well as a batch.
    if isinstance(docs, Doc):
        docs = [docs]
    cdef Doc doc
    # NOTE(review): `vocab` is declared here but the loop below reads
    # self.vocab directly.
    cdef Vocab vocab = self.vocab
    cdef bint overwrite = self.cfg["overwrite"]
    # Read the labels property once instead of once per token.
    labels = self.labels
    for i, doc in enumerate(docs):
        if self.save_activations:
            # Start from a fresh dict for this component on this doc.
            doc.activations[self.name] = {}
            for act_name, acts in activations.items():
                doc.activations[self.name][act_name] = acts[i]
        doc_tag_ids = batch_tag_ids[i]
        # Presumably a device (GPU) array exposing get() to copy the data to
        # host memory -- TODO confirm.
        if hasattr(doc_tag_ids, "get"):
            doc_tag_ids = doc_tag_ids.get()
        for j, tag_id in enumerate(doc_tag_ids):
            # Fill in unset tags (0); replace existing ones only if allowed.
            if doc.c[j].tag == 0 or overwrite:
                doc.c[j].tag = self.vocab.strings[labels[tag_id]]
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2021-01-20 03:49:25 +03:00
|
|
|
def update(self, examples, *, drop=0., sgd=None, losses=None):
    """Learn from a batch of documents and gold-standard information,
    updating the pipe's model. Runs the model forward, computes the loss
    with get_loss and backpropagates the gradient.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer. If None or False, gradients
        are backpropagated but finish_update is not called.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/tagger#update
    """
    losses = {} if losses is None else losses
    losses.setdefault(self.name, 0.0)
    validate_examples(examples, "Tagger.update")
    docs = [eg.predicted for eg in examples]
    if not any(len(doc) if doc else 0 for doc in docs):
        # Handle cases where there are no tokens in any docs.
        return losses
    set_dropout_rate(self.model, drop)
    tag_scores, backprop = self.model.begin_update(docs)
    xp = self.model.ops.xp
    # Refuse to continue once the model output contains NaN values.
    if any(xp.isnan(doc_scores.sum()) for doc_scores in tag_scores):
        raise ValueError(Errors.E940)
    loss, d_tag_scores = self.get_loss(examples, tag_scores)
    backprop(d_tag_scores)
    if sgd not in (None, False):
        self.finish_update(sgd)

    losses[self.name] += loss
    return losses
|
|
|
|
|
2020-07-27 19:11:45 +03:00
|
|
|
def rehearse(self, examples, *, drop=0., sgd=None, losses=None):
    """Perform a "rehearsal" update from a batch of data. Rehearsal updates
    teach the current model to make predictions similar to an initial model,
    to try to address the "catastrophic forgetting" problem. This feature is
    experimental.

    Nothing is updated if no rehearsal model has been set, or if none of the
    predicted docs contains any token.

    examples (Iterable[Example]): A batch of Example objects.
    drop (float): The dropout rate.
    sgd (thinc.api.Optimizer): The optimizer. If None or False, gradients
        are backpropagated but finish_update is not called, matching the
        behavior of update.
    losses (Dict[str, float]): Optional record of the loss during training.
        Updated using the component name as the key.
    RETURNS (Dict[str, float]): The updated losses dictionary.

    DOCS: https://spacy.io/api/tagger#rehearse
    """
    if losses is None:
        losses = {}
    losses.setdefault(self.name, 0.0)
    validate_examples(examples, "Tagger.rehearse")
    docs = [eg.predicted for eg in examples]
    if self._rehearsal_model is None:
        return losses
    if not any(len(doc) for doc in docs):
        # Handle cases where there are no tokens in any docs.
        return losses
    set_dropout_rate(self.model, drop)
    tag_scores, bp_tag_scores = self.model.begin_update(docs)
    # The rehearsal model only provides the target scores; its backprop
    # callback is discarded.
    tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
    # Built only once it is certain that a loss will be computed.
    loss_func = SequenceCategoricalCrossentropy()
    grads, loss = loss_func(tag_scores, tutor_tag_scores)
    bp_tag_scores(grads)
    # Same guard as in update: skip the optimizer step when none was given.
    if sgd not in (None, False):
        self.finish_update(sgd)
    losses[self.name] += loss
    return losses
|
2020-07-22 14:42:59 +03:00
|
|
|
|
|
|
|
def get_loss(self, examples, scores):
    """Find the loss and gradient of loss for the batch of documents and
    their predicted scores.

    examples (Iterable[Example]): The batch of examples.
    scores: Scores representing the model's predictions.
    RETURNS (Tuple[float, Any]): The loss as a float, and the gradient of
        the loss with respect to the scores.
    RAISES (ValueError): If the computed loss is NaN.

    DOCS: https://spacy.io/api/tagger#get_loss
    """
    validate_examples(examples, "Tagger.get_loss")
    loss_func = SequenceCategoricalCrossentropy(
        names=self.labels,
        normalize=False,
        neg_prefix=self.cfg["neg_prefix"],
    )
    # Convert empty tag "" to missing value None so that both misaligned
    # tokens and tokens with missing annotation have the default missing
    # value None.
    truths = []
    for eg in examples:
        # Compare by value: an identity check against the literal "" only
        # works when the empty string happens to be interned.
        eg_truths = [tag if tag != "" else None for tag in eg.get_aligned("TAG", as_string=True)]
        truths.append(eg_truths)
    d_scores, loss = loss_func(scores, truths)
    if self.model.ops.xp.isnan(loss):
        raise ValueError(Errors.E910.format(name=self.name))
    return float(loss), d_scores
|
|
|
|
|
2020-09-29 17:48:44 +03:00
|
|
|
def initialize(self, get_examples, *, nlp=None, labels=None):
    """Initialize the pipe for training, using a representative set
    of data examples.

    get_examples (Callable[[], Iterable[Example]]): Function that
        returns a representative sample of gold-standard Example objects.
    nlp (Language): The current nlp object the component is part of.
    labels: The labels to add to the component, typically generated by the
        `init labels` command. If no labels are provided, the get_examples
        callback is used to extract the labels from the data.

    DOCS: https://spacy.io/api/tagger#initialize
    """
    validate_get_examples(get_examples, "Tagger.initialize")
    util.check_lexeme_norms(self.vocab, "tagger")
    if labels is None:
        # No labels given: collect every non-empty tag found in the
        # reference docs and add them in sorted order.
        seen_tags = {
            token.tag_
            for example in get_examples()
            for token in example.y
            if token.tag_
        }
        labels = sorted(seen_tags)
    for tag in labels:
        self.add_label(tag)
    # Build a small sample (at most 10 examples) of inputs and one-hot gold
    # outputs for initializing the model.
    label_names = self.labels
    ops = self.model.ops
    doc_sample = []
    label_sample = []
    for example in islice(get_examples(), 10):
        doc_sample.append(example.x)
        gold_tags = example.get_aligned("TAG", as_string=True)
        one_hot = [
            [1.0 if name == gold_tag else 0.0 for name in label_names]
            for gold_tag in gold_tags
        ]
        label_sample.append(ops.asarray(one_hot, dtype="float32"))
    self._require_labels()
    assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
    assert len(label_sample) > 0, Errors.E923.format(name=self.name)
    self.model.initialize(X=doc_sample, Y=label_sample)
|
2020-07-22 14:42:59 +03:00
|
|
|
|
2020-08-07 16:27:13 +03:00
|
|
|
def add_label(self, label):
    """Add a new label to the pipe.

    The label is appended to the component config and added to the vocab's
    string store.

    label (str): The label to add.
    RETURNS (int): 0 if label is already present, otherwise 1.
    RAISES (ValueError): If the label is not a string.

    DOCS: https://spacy.io/api/tagger#add_label
    """
    if not isinstance(label, str):
        raise ValueError(Errors.E187)
    already_known = label in self.labels
    if already_known:
        return 0
    self._allow_extra_label()
    self.cfg["labels"].append(label)
    self.vocab.strings.add(label)
    return 1
|