diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 32d22d1bc..fbdb91ab9 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -295,7 +295,11 @@ def train_while_improving(
                 nlp.rehearse(raw_batch, sgd=optimizer, losses=losses, exclude=exclude)
         # TODO: refactor this so we don't have to run it separately in here
         for name, proc in nlp.pipeline:
-            if name not in exclude and hasattr(proc, "model"):
+            if (
+                name not in exclude
+                and hasattr(proc, "model")
+                and proc.model not in (True, False, None)
+            ):
                 proc.model.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
diff --git a/spacy/errors.py b/spacy/errors.py
index 8e9a8d4b4..c4eb4af28 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -482,6 +482,15 @@ class Errors:
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
     # TODO: fix numbering after merging develop into master
+    E930 = ("Received invalid get_examples callback in {name}.begin_training. "
+            "Expected function that returns an iterable of Example objects but "
+            "got: {obj}")
+    E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
+            "'{name}'. If the component is trainable and you want to use this "
+            "method, make sure it's overridden on the subclass. If your "
+            "component isn't trainable, add a method that does nothing or "
+            "don't use the Pipe base class.")
+    E940 = ("Found NaN values in scores.")
     E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
             "model from a shortcut, which is deprecated as of spaCy v3.0. To "
             "load the model, use its full name instead:\n\n"
@@ -578,8 +587,7 @@ class Errors:
             "but received None.")
     E977 = ("Can not compare a MorphAnalysis with a string object. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E978 = ("The '{method}' method of {name} takes a list of Example objects, "
-            "but found {types} instead.")
+    E978 = ("The {name} method takes a list of Example objects, but got: {types}")
     E979 = ("Cannot convert {type} to an Example object.")
     E980 = ("Each link annotation should refer to a dictionary with at most one "
             "identifier mapping to 1.0, and all others to 0.0.")
diff --git a/spacy/gold/__init__.py b/spacy/gold/__init__.py
index 142c6b3a7..4d71eae09 100644
--- a/spacy/gold/__init__.py
+++ b/spacy/gold/__init__.py
@@ -1,5 +1,5 @@
 from .corpus import Corpus  # noqa: F401
-from .example import Example  # noqa: F401
+from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 6093d2346..3344704bf 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -1,5 +1,5 @@
+from collections.abc import Iterable as IterableInstance
 import warnings
-
 import numpy
 
 from ..tokens.doc cimport Doc
@@ -26,6 +26,22 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
    return output
 
 
+def validate_examples(examples, method):
+    """Check that a batch of examples received during processing is valid.
+    This function lives here to prevent circular imports.
+
+    examples (Iterable[Example]): A batch of examples.
+    method (str): The method name to show in error messages.
+ """ + if not isinstance(examples, IterableInstance): + err = Errors.E978.format(name=method, types=type(examples)) + raise TypeError(err) + wrong = set([type(eg) for eg in examples if not isinstance(eg, Example)]) + if wrong: + err = Errors.E978.format(name=method, types=wrong) + raise TypeError(err) + + cdef class Example: def __init__(self, Doc predicted, Doc reference, *, alignment=None): if predicted is None: @@ -263,12 +279,10 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.morphology.add(v) for v in value]) else: attrs.append(key) - try: - values.append([vocab.strings.add(v) for v in value]) - except TypeError: - types= set([type(v) for v in value]) + if not all(isinstance(v, str) for v in value): + types = set([type(v) for v in value]) raise TypeError(Errors.E969.format(field=key, types=types)) from None - + values.append([vocab.strings.add(v) for v in value]) array = numpy.asarray(values, dtype="uint64") return attrs, array.T diff --git a/spacy/language.py b/spacy/language.py index 85aac15ef..efdf633d4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,7 +5,6 @@ import random import itertools import weakref import functools -from collections import Iterable as IterableInstance from contextlib import contextmanager from copy import copy, deepcopy from pathlib import Path @@ -19,7 +18,7 @@ from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis -from .gold import Example +from .gold import Example, validate_examples from .scorer import Scorer from .util import create_default_optimizer, registry from .util import SimpleFrozenDict, combine_score_weights @@ -935,17 +934,7 @@ class Language: losses = {} if len(examples) == 0: return losses - if not isinstance(examples, IterableInstance): - raise TypeError( - Errors.E978.format( - name="language", method="update", types=type(examples) - ) - ) - wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) - if wrong_types: - raise TypeError( - Errors.E978.format(name="language", method="update", types=wrong_types) - ) + validate_examples(examples, "Language.update") if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() @@ -962,7 +951,11 @@ class Language: proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) if sgd not in (None, False): for name, proc in self.pipeline: - if name not in exclude and hasattr(proc, "model"): + if ( + name not in exclude + and hasattr(proc, "model") + and proc.model not in (True, False, None) + ): proc.model.finish_update(sgd) return losses @@ -999,19 +992,7 @@ class Language: """ if len(examples) == 0: return - if not isinstance(examples, IterableInstance): - raise TypeError( - Errors.E978.format( - name="language", method="rehearse", types=type(examples) - ) - ) - wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) - if wrong_types: - raise TypeError( - Errors.E978.format( - name="language", method="rehearse", types=wrong_types - ) - ) + validate_examples(examples, "Language.rehearse") if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() @@ -1060,7 +1041,15 @@ class Language: if get_examples is None: get_examples = lambda: [] else: # Populate vocab + if not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name="Language", obj=type(get_examples)) + raise ValueError(err) for example in 
get_examples(): + if not isinstance(example, Example): + err = Errors.E978.format( + name="Language.begin_training", types=type(example) + ) + raise ValueError(err) for word in [t.text for t in example.reference]: _ = self.vocab[word] # noqa: F841 if device >= 0: # TODO: do we need this here? @@ -1133,17 +1122,7 @@ class Language: DOCS: https://spacy.io/api/language#evaluate """ - if not isinstance(examples, IterableInstance): - err = Errors.E978.format( - name="language", method="evaluate", types=type(examples) - ) - raise TypeError(err) - wrong_types = set([type(eg) for eg in examples if not isinstance(eg, Example)]) - if wrong_types: - err = Errors.E978.format( - name="language", method="evaluate", types=wrong_types - ) - raise TypeError(err) + validate_examples(examples, "Language.evaluate") if component_cfg is None: component_cfg = {} if scorer_cfg is None: @@ -1663,7 +1642,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None: else: raise ValueError(Errors.E092) for name, proc in nlp.pipeline: - if not hasattr(proc, "cfg"): + if not hasattr(proc, "cfg") or not isinstance(proc.cfg, dict): continue proc.cfg.setdefault("deprecation_fixes", {}) proc.cfg["deprecation_fixes"]["vectors_name"] = nlp.vocab.vectors.name diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index 801229af5..76f58df58 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -9,6 +9,7 @@ from .functions import merge_subtokens from ..language import Language from ._parser_internals import nonproj from ..scorer import Scorer +from ..gold import validate_examples default_model_config = """ @@ -147,6 +148,7 @@ cdef class DependencyParser(Parser): DOCS: https://spacy.io/api/dependencyparser#score """ + validate_examples(examples, "DependencyParser.score") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 080273f57..35bf2906e 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -11,7 +11,7 @@ from ..tokens import Doc from .pipe import Pipe, deserialize_config from ..language import Language from ..vocab import Vocab -from ..gold import Example +from ..gold import Example, validate_examples from ..errors import Errors, Warnings from .. 
import util @@ -142,7 +142,7 @@ class EntityLinker(Pipe): def begin_training( self, - get_examples: Callable[[], Iterable[Example]] = lambda: [], + get_examples: Callable[[], Iterable[Example]], *, pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, @@ -197,14 +197,9 @@ class EntityLinker(Pipe): losses.setdefault(self.name, 0.0) if not examples: return losses + validate_examples(examples, "EntityLinker.update") sentence_docs = [] - try: - docs = [eg.predicted for eg in examples] - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError( - Errors.E978.format(name="EntityLinker", method="update", types=types) - ) from None + docs = [eg.predicted for eg in examples] if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( @@ -250,6 +245,7 @@ class EntityLinker(Pipe): return losses def get_loss(self, examples: Iterable[Example], sentence_encodings): + validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index bef97ec46..4f4e0fdd5 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -9,6 +9,7 @@ from ..util import ensure_path, to_disk, from_disk from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import Scorer +from ..gold import validate_examples DEFAULT_ENT_ID_SEP = "||" @@ -312,6 +313,7 @@ class EntityRuler: return label def score(self, examples, **kwargs): + validate_examples(examples, "EntityRuler.score") return Scorer.score_spans(examples, "ents", **kwargs) def from_bytes( diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index f2028772f..76aea8f99 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -1,5 +1,4 @@ from typing import Optional, List, Dict, Any - from thinc.api import Model from .pipe import Pipe @@ -9,6 +8,7 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab +from ..gold import validate_examples from .. import util @@ -135,10 +135,10 @@ class Lemmatizer(Pipe): elif self.mode == "rule": self.lemmatize = self.rule_lemmatize else: - try: - self.lemmatize = getattr(self, f"{self.mode}_lemmatize") - except AttributeError: + mode_attr = f"{self.mode}_lemmatize" + if not hasattr(self, mode_attr): raise ValueError(Errors.E1003.format(mode=mode)) + self.lemmatize = getattr(self, mode_attr) self.cache = {} @property @@ -271,6 +271,7 @@ class Lemmatizer(Pipe): DOCS: https://spacy.io/api/lemmatizer#score """ + validate_examples(examples, "Lemmatizer.score") return Scorer.score_token_attr(examples, "lemma", **kwargs) def to_disk(self, path, *, exclude=tuple()): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index efc494181..329a05f90 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -6,15 +6,16 @@ from thinc.api import SequenceCategoricalCrossentropy, Model, Config from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology + from ..parts_of_speech import IDS as POS_IDS from ..symbols import POS - from ..language import Language from ..errors import Errors from .pipe import deserialize_config from .tagger import Tagger from .. 
import util from ..scorer import Scorer +from ..gold import validate_examples default_model_config = """ @@ -126,7 +127,7 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] return 1 - def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None): + def begin_training(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using data examples if available. get_examples (Callable[[], Iterable[Example]]): Optional function that @@ -140,6 +141,9 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#begin_training """ + if not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name="Morphologizer", obj=type(get_examples)) + raise ValueError(err) for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ @@ -192,6 +196,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#get_loss """ + validate_examples(examples, "Morphologizer.get_loss") loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) truths = [] for eg in examples: @@ -228,6 +233,7 @@ class Morphologizer(Tagger): DOCS: https://spacy.io/api/morphologizer#score """ + validate_examples(examples, "Morphologizer.score") results = {} results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 84ed19b0d..3ef85c821 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -8,6 +8,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger +from ..gold import validate_examples from ..language import Language from ._parser_internals import nonproj from ..attrs import POS, ID @@ -80,10 +81,11 @@ class MultitaskObjective(Tagger): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None): - gold_examples = nonproj.preprocess_training_data(get_examples()) - # for raw_text, doc_annot in gold_tuples: - for example in gold_examples: + def begin_training(self, get_examples, pipeline=None, sgd=None): + if not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples)) + raise ValueError(err) + for example in get_examples(): for token in example.y: label = self.make_label(token) if label is not None and label not in self.labels: @@ -175,7 +177,7 @@ class ClozeMultitask(Pipe): def set_annotations(self, docs, dep_ids): pass - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None): + def begin_training(self, get_examples, pipeline=None, sgd=None): self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) @@ -189,6 +191,7 @@ class ClozeMultitask(Pipe): return tokvecs, vectors def get_loss(self, examples, vectors, prediction): + validate_examples(examples, "ClozeMultitask.get_loss") # The simplest way to implement this would be to vstack the # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, @@ -206,18 +209,16 @@ class ClozeMultitask(Pipe): if losses is not None and self.name not in losses: losses[self.name] = 0. 
set_dropout_rate(self.model, drop) - try: - predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None + validate_examples(examples, "ClozeMultitask.rehearse") + docs = [eg.predicted for eg in examples] + predictions, bp_predictions = self.model.begin_update() loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: self.model.finish_update(sgd) - if losses is not None: losses[self.name] += loss + return losses def add_label(self, label): raise NotImplementedError diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index a3bc3d920..631b5ae72 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -7,6 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer +from ..gold import validate_examples default_model_config = """ @@ -50,7 +51,7 @@ def make_ner( ): """Create a transition-based EntityRecognizer component. The entity recognizer identifies non-overlapping labelled spans of tokens. - + The transition-based algorithm used encodes certain assumptions that are effective for "traditional" named entity recognition tasks, but may not be a good fit for every span identification problem. Specifically, the loss @@ -120,4 +121,5 @@ cdef class EntityRecognizer(Parser): DOCS: https://spacy.io/api/entityrecognizer#score """ + validate_examples(examples, "EntityRecognizer.score") return Scorer.score_spans(examples, "ents", **kwargs) diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd index bb97f79d0..bca94d528 100644 --- a/spacy/pipeline/pipe.pxd +++ b/spacy/pipeline/pipe.pxd @@ -1,2 +1,5 @@ cdef class Pipe: + cdef public object vocab + cdef public object model cdef public str name + cdef public object cfg diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 96a8b5944..51251dacc 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -1,9 +1,10 @@ # cython: infer_types=True, profile=True import srsly +from thinc.api import set_dropout_rate, Model from ..tokens.doc cimport Doc -from ..util import create_default_optimizer +from ..gold import validate_examples from ..errors import Errors from .. import util @@ -16,7 +17,6 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe """ - def __init__(self, vocab, model, name, **cfg): """Initialize a pipeline component. @@ -27,7 +27,10 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe#init """ - raise NotImplementedError + self.vocab = vocab + self.model = model + self.name = name + self.cfg = dict(cfg) def __call__(self, Doc doc): """Apply the pipe to one document. The document is modified in place, @@ -68,7 +71,7 @@ cdef class Pipe: DOCS: https://spacy.io/api/pipe#predict """ - raise NotImplementedError + raise NotImplementedError(Errors.E931.format(method="predict", name=self.name)) def set_annotations(self, docs, scores): """Modify a batch of documents, using pre-computed scores. 
@@ -78,7 +81,43 @@ cdef class Pipe:
 
         DOCS: https://spacy.io/api/pipe#set_annotations
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="set_annotations", name=self.name))
+
+    def update(self, examples, *, drop=0.0, set_annotations=False, sgd=None, losses=None):
+        """Learn from a batch of documents and gold-standard information,
+        updating the pipe's model. Delegates to predict and get_loss.
+
+        examples (Iterable[Example]): A batch of Example objects.
+        drop (float): The dropout rate.
+        set_annotations (bool): Whether or not to update the Example objects
+            with the predictions.
+        sgd (thinc.api.Optimizer): The optimizer.
+        losses (Dict[str, float]): Optional record of the loss during training.
+            Updated using the component name as the key.
+        RETURNS (Dict[str, float]): The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/pipe#update
+        """
+        if losses is None:
+            losses = {}
+        if not hasattr(self, "model") or self.model in (None, True, False):
+            return losses
+        losses.setdefault(self.name, 0.0)
+        validate_examples(examples, "Pipe.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return losses
+        set_dropout_rate(self.model, drop)
+        scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_loss(examples, scores)
+        bp_scores(d_scores)
+        if sgd not in (None, False):
+            self.model.finish_update(sgd)
+        losses[self.name] += loss
+        if set_annotations:
+            docs = [eg.predicted for eg in examples]
+            self.set_annotations(docs, scores=scores)
+        return losses
 
     def rehearse(self, examples, *, sgd=None, losses=None, **config):
         """Perform a "rehearsal" update from a batch of data. Rehearsal updates
@@ -107,7 +146,7 @@ cdef class Pipe:
 
         DOCS: https://spacy.io/api/pipe#get_loss
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="get_loss", name=self.name))
 
     def add_label(self, label):
         """Add an output label, to be predicted by the model. It's possible to
@@ -119,7 +158,7 @@ cdef class Pipe:
 
         DOCS: https://spacy.io/api/pipe#add_label
         """
-        raise NotImplementedError
+        raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
 
     def create_optimizer(self):
         """Create an optimizer for the pipeline component.
@@ -128,9 +167,9 @@ cdef class Pipe:
 
         DOCS: https://spacy.io/api/pipe#create_optimizer
         """
-        return create_default_optimizer()
+        return util.create_default_optimizer()
 
-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx
index be4351212..46d599497 100644
--- a/spacy/pipeline/sentencizer.pyx
+++ b/spacy/pipeline/sentencizer.pyx
@@ -7,6 +7,7 @@ from ..tokens.doc cimport Doc
 from .pipe import Pipe
 from ..language import Language
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util
 
 
@@ -58,7 +59,7 @@ class Sentencizer(Pipe):
         else:
             self.punct_chars = set(self.default_punct_chars)
 
-    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None):
+    def begin_training(self, get_examples, pipeline=None, sgd=None):
         pass
 
     def __call__(self, doc):
@@ -158,6 +159,7 @@ class Sentencizer(Pipe):
 
         DOCS: https://spacy.io/api/sentencizer#score
         """
+        validate_examples(examples, "Sentencizer.score")
         results = Scorer.score_spans(examples, "sents", **kwargs)
         del results["sents_per_type"]
         return results
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index cf7479c29..e82225d27 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -9,6 +9,7 @@ from .tagger import Tagger
 from ..language import Language
 from ..errors import Errors
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util
 
 
@@ -102,6 +103,7 @@ class SentenceRecognizer(Tagger):
 
         DOCS: https://spacy.io/api/sentencerecognizer#get_loss
         """
+        validate_examples(examples, "SentenceRecognizer.get_loss")
         labels = self.labels
         loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
         truths = []
@@ -121,7 +123,7 @@ class SentenceRecognizer(Tagger):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
@@ -151,6 +153,7 @@ class SentenceRecognizer(Tagger):
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans.
         DOCS: https://spacy.io/api/sentencerecognizer#score
         """
+        validate_examples(examples, "SentenceRecognizer.score")
         results = Scorer.score_spans(examples, "sents", **kwargs)
         del results["sents_per_type"]
         return results
diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py
index 4965b2e13..5f3addbd7 100644
--- a/spacy/pipeline/simple_ner.py
+++ b/spacy/pipeline/simple_ner.py
@@ -1,4 +1,4 @@
-from typing import List, Iterable, Optional, Dict, Tuple, Callable
+from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
 from thinc.types import Floats2d
 from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
 from thinc.api import Optimizer, Config
@@ -6,6 +6,7 @@ from thinc.util import to_numpy
 
 from ..errors import Errors
 from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
+from ..gold import validate_examples
 from ..tokens import Doc
 from ..language import Language
 from ..vocab import Vocab
@@ -127,6 +128,7 @@ class SimpleNER(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault("ner", 0.0)
+        validate_examples(examples, "SimpleNER.update")
         if not any(_has_ner(eg) for eg in examples):
             return losses
         docs = [eg.predicted for eg in examples]
@@ -142,6 +144,7 @@ class SimpleNER(Pipe):
         return losses
 
     def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
+        validate_examples(examples, "SimpleNER.get_loss")
         truths = []
         for eg in examples:
             tags = eg.get_aligned_ner()
@@ -161,14 +164,17 @@ class SimpleNER(Pipe):
 
     def begin_training(
         self,
-        get_examples: Callable,
+        get_examples: Callable[[], Iterable[Example]],
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ):
+        all_labels = set()
         if not hasattr(get_examples, "__call__"):
-            gold_tuples = get_examples
-            get_examples = lambda: gold_tuples
-        for label in _get_labels(get_examples()):
+            err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
+            raise ValueError(err)
+        for example in get_examples():
+            all_labels.update(_get_labels(example))
+        for label in sorted(all_labels):
             self.add_label(label)
         labels = self.labels
         n_actions = self.model.attrs["get_num_actions"](len(labels))
@@ -185,6 +191,7 @@ class SimpleNER(Pipe):
         pass
 
     def score(self, examples, **kwargs):
+        validate_examples(examples, "SimpleNER.score")
         return Scorer.score_spans(examples, "ents", **kwargs)
 
 
@@ -196,10 +203,9 @@ def _has_ner(example: Example) -> bool:
         return False
 
 
-def _get_labels(examples: List[Example]) -> List[str]:
+def _get_labels(example: Example) -> Set[str]:
     labels = set()
-    for eg in examples:
-        for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
-            if ner_tag != "O" and ner_tag != "-":
-                labels.add(ner_tag)
-    return list(sorted(labels))
+    for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
+        if ner_tag != "O" and ner_tag != "-":
+            labels.add(ner_tag)
+    return labels
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 937290d5f..9070329e8 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -16,6 +16,7 @@ from ..attrs import POS, ID
 from ..parts_of_speech import X
 from ..errors import Errors, TempErrors, Warnings
 from ..scorer import Scorer
+from ..gold import validate_examples
 from .. import util
 
 
@@ -187,19 +188,15 @@ class Tagger(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
-        try:
-            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-                # Handle cases where there are no tokens in any docs.
-                return
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None
+        validate_examples(examples, "Tagger.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return
         set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update(
-            [eg.predicted for eg in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update([eg.predicted for eg in examples])
         for sc in tag_scores:
             if self.model.ops.xp.isnan(sc.sum()):
-                raise ValueError("nan value in scores")
+                raise ValueError(Errors.E940)
         loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores)
         if sgd not in (None, False):
@@ -226,11 +223,8 @@ class Tagger(Pipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        try:
-            docs = [eg.predicted for eg in examples]
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None
+        validate_examples(examples, "Tagger.rehearse")
+        docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
             return
         if not any(len(doc) for doc in docs):
@@ -256,6 +250,7 @@ class Tagger(Pipe):
 
         DOCS: https://spacy.io/api/tagger#get_loss
         """
+        validate_examples(examples, "Tagger.get_loss")
         loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
         truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
         d_scores, loss = loss_func(scores, truths)
@@ -263,7 +258,7 @@ class Tagger(Pipe):
             raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
-    def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
+    def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
 
         get_examples (Callable[[], Iterable[Example]]): Optional function that
@@ -277,13 +272,12 @@ class Tagger(Pipe):
 
         DOCS: https://spacy.io/api/tagger#begin_training
         """
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="Tagger", obj=type(get_examples))
+            raise ValueError(err)
         tags = set()
         for example in get_examples():
-            try:
-                y = example.y
-            except AttributeError:
-                raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None
-            for token in y:
+            for token in example.y:
                 tags.add(token.tag_)
         for tag in sorted(tags):
             self.add_label(tag)
@@ -318,6 +312,7 @@ class Tagger(Pipe):
 
         DOCS: https://spacy.io/api/tagger#score
         """
+        validate_examples(examples, "Tagger.score")
         return Scorer.score_token_attr(examples, "tag", **kwargs)
 
     def to_bytes(self, *, exclude=tuple()):
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 7b9cc1e24..ce4f286e5 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -5,7 +5,7 @@ import numpy
 
 from .pipe import Pipe
 from ..language import Language
-from ..gold import Example
+from ..gold import Example, validate_examples
 from ..errors import Errors
 from ..scorer import Scorer
 from .. import util
@@ -209,15 +209,10 @@ class TextCategorizer(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
-        try:
-            if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
-                # Handle cases where there are no tokens in any docs.
-                return losses
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            raise TypeError(
-                Errors.E978.format(name="TextCategorizer", method="update", types=types)
-            ) from None
+        validate_examples(examples, "TextCategorizer.update")
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
+            # Handle cases where there are no tokens in any docs.
+            return losses
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
         loss, d_scores = self.get_loss(examples, scores)
@@ -252,19 +247,12 @@ class TextCategorizer(Pipe):
 
         DOCS: https://spacy.io/api/textcategorizer#rehearse
         """
-
         if losses is not None:
             losses.setdefault(self.name, 0.0)
         if self._rehearsal_model is None:
             return losses
-        try:
-            docs = [eg.predicted for eg in examples]
-        except AttributeError:
-            types = set([type(eg) for eg in examples])
-            err = Errors.E978.format(
-                name="TextCategorizer", method="rehearse", types=types
-            )
-            raise TypeError(err) from None
+        validate_examples(examples, "TextCategorizer.rehearse")
+        docs = [eg.predicted for eg in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return losses
@@ -303,6 +291,7 @@ class TextCategorizer(Pipe):
 
         DOCS: https://spacy.io/api/textcategorizer#get_loss
         """
+        validate_examples(examples, "TextCategorizer.get_loss")
         truths, not_missing = self._examples_to_truth(examples)
         not_missing = self.model.ops.asarray(not_missing)
         d_scores = (scores - truths) / scores.shape[0]
@@ -338,7 +327,7 @@ class TextCategorizer(Pipe):
 
     def begin_training(
         self,
-        get_examples: Callable[[], Iterable[Example]] = lambda: [],
+        get_examples: Callable[[], Iterable[Example]],
         *,
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
@@ -356,21 +345,20 @@ class TextCategorizer(Pipe):
 
         DOCS: https://spacy.io/api/textcategorizer#begin_training
         """
-        # TODO: begin_training is not guaranteed to see all data / labels ?
-        examples = list(get_examples())
-        for example in examples:
-            try:
-                y = example.y
-            except AttributeError:
-                err = Errors.E978.format(
-                    name="TextCategorizer", method="update", types=type(example)
-                )
-                raise TypeError(err) from None
-            for cat in y.cats:
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
+            raise ValueError(err)
+        subbatch = []  # Select a subbatch of examples to initialize the model
+        for example in get_examples():
+            if len(subbatch) < 2:
+                subbatch.append(example)
+            for cat in example.y.cats:
                 self.add_label(cat)
         self.require_labels()
-        docs = [Doc(self.vocab, words=["hello"])]
-        truths, _ = self._examples_to_truth(examples)
+        docs = [eg.reference for eg in subbatch]
+        if not docs:  # need at least one doc
+            docs = [Doc(self.vocab, words=["hello"])]
+        truths, _ = self._examples_to_truth(subbatch)
         self.set_output(len(self.labels))
         self.model.initialize(X=docs, Y=truths)
         if sgd is None:
@@ -392,6 +380,7 @@ class TextCategorizer(Pipe):
 
         DOCS: https://spacy.io/api/textcategorizer#score
         """
+        validate_examples(examples, "TextCategorizer.score")
         return Scorer.score_cats(
             examples,
             "cats",
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index c9f0a99e9..44cd457e4 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -2,7 +2,7 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List,
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
 
 from .pipe import Pipe
-from ..gold import Example
+from ..gold import Example, validate_examples
 from ..tokens import Doc
 from ..vocab import Vocab
 from ..language import Language
@@ -166,9 +166,8 @@ class Tok2Vec(Pipe):
         """
         if losses is None:
             losses = {}
+        validate_examples(examples, "Tok2Vec.update")
         docs = [eg.predicted for eg in examples]
-        if isinstance(docs, Doc):
-            docs = [docs]
         set_dropout_rate(self.model, drop)
         tokvecs, bp_tokvecs = self.model.begin_update(docs)
         d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
@@ -204,7 +203,7 @@ class Tok2Vec(Pipe):
 
     def begin_training(
         self,
-        get_examples: Callable[[], Iterable[Example]] = lambda: [],
+        get_examples: Callable[[], Iterable[Example]],
         *,
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
diff --git a/spacy/pipeline/transition_parser.pxd b/spacy/pipeline/transition_parser.pxd
index e594a3098..67bc01f97 100644
--- a/spacy/pipeline/transition_parser.pxd
+++ b/spacy/pipeline/transition_parser.pxd
@@ -8,11 +8,8 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
 
 
 cdef class Parser(Pipe):
-    cdef readonly Vocab vocab
-    cdef public object model
     cdef public object _rehearsal_model
     cdef readonly TransitionSystem moves
-    cdef readonly object cfg
     cdef public object _multitasks
 
     cdef void _parseC(self, StateC** states,
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 9829e764d..443f7f6a0 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -8,22 +8,21 @@ from libc.string cimport memset
 from libc.stdlib cimport calloc, free
 
 import srsly
+from thinc.api import set_dropout_rate
+import numpy.random
+import numpy
+import warnings
 
 from ._parser_internals.stateclass cimport StateClass
 from ..ml.parser_model cimport alloc_activations, free_activations
 from ..ml.parser_model cimport predict_states, arg_max_if_valid
 from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
 from ..ml.parser_model cimport get_c_weights, get_c_sizes
-
 from ..tokens.doc cimport Doc
+
+from ..gold import validate_examples
 from ..errors import Errors, Warnings
 from .. import util
-from ..util import create_default_optimizer
-
-from thinc.api import set_dropout_rate
-import numpy.random
-import numpy
-import warnings
 
 
 cdef class Parser(Pipe):
@@ -266,6 +265,7 @@ cdef class Parser(Pipe):
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
+        validate_examples(examples, "Parser.update")
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
@@ -329,7 +329,7 @@ cdef class Parser(Pipe):
         if self._rehearsal_model is None:
             return None
         losses.setdefault(self.name, 0.)
-
+        validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
         states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
@@ -398,21 +398,18 @@ cdef class Parser(Pipe):
             losses[self.name] += (d_scores**2).sum()
         return d_scores
 
-    def create_optimizer(self):
-        return create_default_optimizer()
-
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
 
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
+            raise ValueError(err)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
             langs = ", ".join(util.LEXEME_NORM_LANGS)
             warnings.warn(Warnings.W033.format(model="parser or NER", langs=langs))
-        if not hasattr(get_examples, '__call__'):
-            gold_tuples = get_examples
-            get_examples = lambda: gold_tuples
         actions = self.moves.get_actions(
             examples=get_examples(),
             min_freq=self.cfg['min_action_freq'],
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 5fb5f0914..d6e345336 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -18,7 +18,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert len(list(doc.ents)) == 0
     assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
@@ -41,7 +41,7 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     ner(doc)
     assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
     doc.ents = list(doc.ents)
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index 88dfabdc8..fce5f679f 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)
 
     for i in range(5):
@@ -75,7 +75,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training([])
+    ner1.begin_training(lambda: [])
     ner2 = EntityRecognizer(Vocab(), model, **config)
 
     # the second model needs to be resized before we can call from_bytes
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 939181419..594498b0b 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -28,7 +28,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training([], **parser.cfg)
+    parser.begin_training(lambda: [], **parser.cfg)
     sgd = Adam(0.001)
 
     for i in range(10):
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index bb93cf118..b3fb6d0fc 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -136,7 +136,7 @@ def test_kb_undefined(nlp):
     """Test that the EL can't train without defining a KB"""
     entity_linker = nlp.add_pipe("entity_linker", config={})
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])
 
 
 def test_kb_empty(nlp):
@@ -145,7 +145,7 @@ def test_kb_empty(nlp):
     entity_linker = nlp.add_pipe("entity_linker", config=config)
     assert len(entity_linker.kb) == 0
     with pytest.raises(ValueError):
-        entity_linker.begin_training()
+        entity_linker.begin_training(lambda: [])
 
 
 def test_candidate_generation(nlp):
@@ -249,7 +249,7 @@ def test_preserving_links_asdoc(nlp):
     ruler.add_patterns(patterns)
     el_config = {"kb": {"@assets": "myLocationsKB.v1"}, "incl_prior": False}
     el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    el_pipe.begin_training()
+    el_pipe.begin_training(lambda: [])
     el_pipe.incl_context = False
     el_pipe.incl_prior = True
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 17add7391..d5dd39cac 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -54,7 +54,7 @@ def test_textcat_learns_multilabel():
     textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
         textcat.add_label(letter)
-    optimizer = textcat.begin_training()
+    optimizer = textcat.begin_training(lambda: [])
     for i in range(30):
         losses = {}
         examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index cf43e1a17..2b0f9f427 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -20,7 +20,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    tagger.begin_training()
+    tagger.begin_training(lambda: [])
     doc = nlp("hello world")
     assert doc.is_tagged
     docs = nlp.pipe(["hello", "world"])
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index 423015106..2fae3484b 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -303,7 +303,7 @@ def test_issue4313():
     config = {}
     ner = nlp.create_pipe("ner", config=config)
     ner.add_label("SOME_LABEL")
-    ner.begin_training([])
+    ner.begin_training(lambda: [])
     # add a new label to the doc
     doc = nlp("What do you think about Apple ?")
     assert len(ner.labels) == 1
diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py
index 4c6504f6b..93069d9a3 100644
--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@@ -62,7 +62,7 @@ def tagger():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    tagger.begin_training(pipeline=nlp.pipeline)
+    tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
     return tagger
 
 
@@ -81,7 +81,7 @@ def entity_linker():
     # need to add model for two reasons:
     # 1. no model leads to error in serialization,
     # 2. the affected line is the one for model serialization
-    entity_linker.begin_training(pipeline=nlp.pipeline)
+    entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
     return entity_linker
 
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index ce7b5cb1c..a13299fff 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -24,6 +24,7 @@ from .util import registry
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
+from .gold import validate_examples
 
 
 cdef class Tokenizer:
@@ -712,6 +713,7 @@ cdef class Tokenizer:
         return tokens
 
     def score(self, examples, **kwargs):
+        validate_examples(examples, "Tokenizer.score")
         return Scorer.score_tokenization(examples)
 
     def to_disk(self, path, **kwargs):
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index 81ecc5faf..8302c2e8b 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -45,18 +45,12 @@
 Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-<Infobox variant="danger">
-
-This method needs to be overwritten with your own custom `__init__` method.
-
-</Infobox>
-
-| Name    | Type                                       | Description                                                                                  |
-| ------- | ------------------------------------------ | -------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab`                                    | The shared vocabulary.                                                                       |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.        |
-| `name`  | str                                        | String name of the component instance. Used to add entries to the `losses` during training. |
-| `**cfg` |                                            | Additional config parameters and settings.                                                  |
+| Name    | Type                                       | Description                                                                                                                      |
+| ------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab`                                    | The shared vocabulary.                                                                                                           |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component.                                            |
+| `name`  | str                                        | String name of the component instance. Used to add entries to the `losses` during training.                                      |
+| `**cfg` |                                            | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
 
 ## Pipe.\_\_call\_\_ {#call tag="method"}
@@ -182,12 +176,6 @@ method.
 Learn from a batch of [`Example`](/api/example) objects containing the
 predictions and gold-standard annotations, and update the component's model.
 
-<Infobox variant="danger">
-
-This method needs to be overwritten with your own custom `update` method.
-
-</Infobox>
-
 > #### Example
 >
 > ```python
@@ -384,6 +372,15 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
 | `exclude`   | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
 | **RETURNS** | `Pipe`          | The pipe.                                                                  |
 
+## Attributes {#attributes}
+
+| Name    | Type                                       | Description                                                                                            |
+| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------ |
+| `vocab` | [`Vocab`](/api/vocab)                      | The shared vocabulary that's passed in on initialization.                                              |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component.                                                                      |
+| `name`  | str                                        | The name of the component instance in the pipeline. Can be used in the losses.                         |
+| `cfg`   | dict                                       | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component.  |
+
 ## Serialization fields {#serialization-fields}
 
 During serialization, spaCy will export several data fields used to restore
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index ab5806764..00348065c 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -5,7 +5,6 @@ menu:
   - ['Processing Text', 'processing']
   - ['How Pipelines Work', 'pipelines']
   - ['Custom Components', 'custom-components']
-  # - ['Trainable Components', 'trainable-components']
   - ['Extension Attributes', 'custom-components-attributes']
   - ['Plugins & Wrappers', 'plugins']
 ---
@@ -885,15 +884,117 @@ available, falls back to looking up the regular factory name.
 
-
+1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This
+   can be a model using [layers](https://thinc.ai/docs/api-layers) implemented
+   in Thinc, or a [wrapped model](https://thinc.ai/docs/usage-frameworks)
+   implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The
+   model must take a list of [`Doc`](/api/doc) objects as input and can have any
+   type of output.
+2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least
+   two methods: [`Pipe.predict`](/api/pipe#predict) and
+   [`Pipe.set_annotations`](/api/pipe#set_annotations).
+3. **Component factory:** A component factory registered with
+   [`@Language.factory`](/api/language#factory) that takes the `nlp` object and
+   component `name` and optional settings provided by the config and returns an
+   instance of your trainable component.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline import Pipe
+> from spacy.language import Language
+>
+> class TrainableComponent(Pipe):
+>     def predict(self, docs):
+>         ...
+>
+>     def set_annotations(self, docs, scores):
+>         ...
+>
+> @Language.factory("my_trainable_component")
+> def make_component(nlp, name, model):
+>     return TrainableComponent(nlp.vocab, model, name=name)
+> ```
+
+| Name                                           | Description                                                                                                         |
+| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| [`predict`](/api/pipe#predict)                 | Apply the component's model to a batch of [`Doc`](/api/doc) objects (without modifying them) and return the scores. |
+| [`set_annotations`](/api/pipe#set_annotations) | Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores generated by `predict`.                      |
+
+By default, [`Pipe.__init__`](/api/pipe#init) takes the shared vocab, the
+[`Model`](https://thinc.ai/docs/api-model) and the name of the component
+instance in the pipeline, which you can use as a key in the losses. All other
+keyword arguments will become available as [`Pipe.cfg`](/api/pipe#cfg) and will
+also be serialized with the component.
+
+
+
+spaCy's [config system](/usage/training#config) resolves the config describing
+the pipeline components and models **bottom-up**. This means that it will
+_first_ create a `Model` from a [registered architecture](/api/architectures),
+validate its arguments and _then_ pass the object forward to the component. As
+a result, the config can express very complex, nested trees of objects – but
+the objects don't have to pass the model settings all the way down to the
+components. It also makes the components more **modular** and lets you swap
+different architectures in your config and re-use model definitions.
+
+```ini
+### config.cfg (excerpt)
+[components]
+
+[components.textcat]
+factory = "textcat"
+labels = []
+
+# This function is created and then passed to the "textcat" component as
+# the argument "model"
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+pretrained_vectors = null
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+dropout = null
+
+[components.other_textcat]
+factory = "textcat"
+# This references the [components.textcat.model] block above
+model = ${components.textcat.model}
+labels = []
+```
+
+Your trainable pipeline component factories should therefore always take a
+`model` argument instead of instantiating the
+[`Model`](https://thinc.ai/docs/api-model) inside the component. To register
+custom architectures, you can use the
+[`@spacy.registry.architectures`](/api/top-level#registry) decorator. Also see
+the [training guide](/usage/training#config) for details.
+
+
+
+For some use cases, it makes sense to also override additional methods to
+customize how the model is updated from examples, how it's initialized, how the
+loss is calculated and to add evaluation scores to the training output.
+
+| Name                                         | Description                                                                                                                                                                                                                                                                                                         |
+| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`update`](/api/pipe#update)                 | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                |
+| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided.                                                                                                                 |
+| [`get_loss`](/api/pipe#get_loss)             | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                      |
+| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
+
+
 ## Extension attributes {#custom-components-attributes new="2"}
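
The two conventions this diff converges on (`validate_examples` for batch input, and a required `get_examples` callable for `begin_training`) can be exercised directly. The sketch below assumes the spaCy v3 nightly API as of this change; `MyComponent.update` is a hypothetical method name used only for the error message:

```python
from spacy.gold import Example, validate_examples
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I like London.")
example = Example.from_dict(doc, {"words": [t.text for t in doc]})

# A well-formed batch passes through silently.
validate_examples([example], "MyComponent.update")

# Anything that isn't an iterable of Example objects raises E978,
# which now names the calling method and the offending types.
try:
    validate_examples(["not an Example"], "MyComponent.update")
except TypeError as err:
    print(err)  # [E978] ... takes a list of Example objects, but got: {<class 'str'>}

# begin_training now requires a callable returning examples (E930 otherwise),
# which is why the tests in this diff pass `lambda: []` instead of `[]`.
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
tagger.begin_training(lambda: [])
```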