begin_training -> initialize

Ines Montani 2020-09-28 21:35:09 +02:00
parent 046f655d86
commit ff9a63bfbd
57 changed files with 301 additions and 253 deletions


@ -103,12 +103,12 @@ def debug_model(
with data_validation(False):
try:
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
nlp.begin_training(lambda: train_corpus(nlp))
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
except ValueError:
try:
_set_output_dim(nO=7, model=model)
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(


@ -85,6 +85,7 @@ class Warnings:
"attribute or operator.")
# TODO: fix numbering after merging develop into master
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -306,7 +307,7 @@ class Errors:
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?")
"call initialize()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
@ -376,7 +377,7 @@ class Errors:
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's begin_training method.")
"examples to the component's initialize method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access `{path}`.")
E147 = ("Unexpected error in the {method} functionality of the "
@ -517,7 +518,7 @@ class Errors:
"but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
E930 = ("Received invalid get_examples callback in {name}.initialize. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "


@ -1154,6 +1154,16 @@ class Language:
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd, device=device)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
@ -1163,11 +1173,11 @@ class Language:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/language#begin_training
DOCS: https://nightly.spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
@ -1179,7 +1189,7 @@ class Language:
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
name="Language.initialize", types=type(example)
)
raise ValueError(err)
else:
@ -1198,8 +1208,8 @@ class Language:
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "begin_training"):
proc.begin_training(
if hasattr(proc, "initialize"):
proc.initialize(
get_examples, pipeline=self.pipeline, sgd=self._optimizer
)
self._link_components()
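
The hunk above shows the migration pattern: the old `begin_training` method becomes a thin shim that emits `W089` as a `DeprecationWarning` and delegates to the new `initialize`. A minimal usage sketch (assuming the v3 nightly API in this diff; the tagger label is illustrative):

```python
import warnings
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("tagger").add_label("A")  # output dim must be known without data

# The old name still works, but emits W089 as a DeprecationWarning ...
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nlp.begin_training()
assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# ... while the new name is the supported entry point.
optimizer = nlp.initialize()
```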


@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)
@property
def labels(self):


@ -140,7 +140,7 @@ class EntityLinker(Pipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -159,7 +159,7 @@ class EntityLinker(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
self._ensure_examples(get_examples)
self._require_kb()


@ -129,7 +129,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -142,7 +142,7 @@ class Morphologizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""
self._ensure_examples(get_examples)
# First, fetch all labels from the data


@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
self.model.output_layer.initialize(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd
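
The rename reaches down to the Thinc level as well: `output_layer.begin_training(X)` becomes `output_layer.initialize(X)`, matching Thinc v8's `Model.initialize`, which infers missing dimensions from sample data. A sketch with a stand-in `Linear` layer (not code from this commit):

```python
import numpy
from thinc.api import Linear

model = Linear()                     # nI/nO left unset at construction
X = numpy.zeros((5, 16), dtype="f")  # sample input: 5 rows, 16 features
Y = numpy.zeros((5, 3), dtype="f")   # sample output: 3 classes
model.initialize(X=X, Y=Y)           # dimensions are inferred from the samples
assert model.get_dim("nI") == 16
assert model.get_dim("nO") == 3
```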


@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline)
labeller.initialize(get_examples, pipeline=pipeline)
@property
def labels(self):


@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
@ -198,7 +198,7 @@ cdef class Pipe:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/pipe#begin_training
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
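
Since the base class only raises, every trainable component overrides this method. A hypothetical minimal override, modeled on the pattern the concrete components in this commit follow (`_ensure_examples`, a `doc_sample`, `create_optimizer`); `MyPipe` and its model are assumptions, not part of the commit:

```python
from spacy.pipeline import Pipe

class MyPipe(Pipe):
    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        self._ensure_examples(get_examples)
        # Initialize the model with a representative batch of docs.
        doc_sample = [eg.predicted for eg in get_examples()]
        self.model.initialize(X=doc_sample)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
```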


@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
pass
def __call__(self, doc):


@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []


@ -256,7 +256,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -269,7 +269,7 @@ class Tagger(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tagger#begin_training
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []


@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label])
return 1
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""
self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model


@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
def get_loss(self, examples, scores) -> None:
pass
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []


@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
self._ensure_examples(get_examples)
self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})


@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
doc.ents = [("ANIMAL", 3, 4)]
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents)


@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(5):
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.begin_training(lambda: [_ner_example(ner1)])
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes


@ -202,7 +202,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.begin_training()
nlp.initialize()
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
@ -213,7 +213,7 @@ def test_train_empty():
def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
# The untrained NER will predict O for each token
doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@ -235,7 +235,7 @@ def test_empty_ner():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@ -254,7 +254,7 @@ def test_ruler_before_ner():
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
@ -269,7 +269,7 @@ def test_ner_before_ruler():
# 1: untrained NER - should set everything to O
untrained_ner = nlp.add_pipe("ner", name="uner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
# 2 : Entity Ruler - should set "this" to B and keep everything else O
patterns = [{"label": "THING", "pattern": "This"}]
@ -290,7 +290,7 @@ def test_block_ner():
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""]
@ -307,7 +307,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(50):
losses = {}
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" in caplog.text
caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" not in caplog.text
@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name
def __call__(self, doc):
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
return doc


@ -191,7 +191,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)


@ -34,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(10):


@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
"""Test that the EL can't train without defining a KB"""
entity_linker = nlp.add_pipe("entity_linker", config={})
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_empty(nlp):
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_serialize(nlp):
@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
nlp.begin_training()
nlp.initialize()
assert entity_linker.model.get_dim("nO") == vector_length
# test whether the entity links are preserved by the `as_doc()` function
@ -463,7 +463,7 @@ def test_overfitting_IO():
)
# train the NEL pipe
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length


@ -33,7 +33,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -42,7 +42,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -50,13 +50,13 @@ def test_no_resize():
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
nlp.initialize()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@ -64,12 +64,12 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -79,7 +79,7 @@ def test_overfitting_IO():
train_examples = []
for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}


@ -31,19 +31,19 @@ TRAIN_DATA = [
]
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -58,7 +58,7 @@ def test_overfitting_IO():
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe("senter")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(200):
losses = {}


@ -15,14 +15,14 @@ def test_label_types():
tagger.add_label(9)
def test_tagger_begin_training_tag_map():
"""Test that Tagger.begin_training() without gold tuples does not clobber
def test_tagger_initialize_tag_map():
"""Test that Tagger.initialize() without gold tuples does not clobber
the tag map."""
nlp = Language()
tagger = nlp.add_pipe("tagger")
orig_tag_count = len(tagger.labels)
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@ -38,7 +38,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_no_resize():
@ -47,7 +47,7 @@ def test_no_resize():
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
nlp.initialize()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):
@ -60,10 +60,10 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []
@ -72,16 +72,16 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
nlp.initialize(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
nlp.initialize(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -91,7 +91,7 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert tagger.model.get_dim("nO") == len(TAGS)
for i in range(50):
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
nlp = English()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
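
The assertions above pin down the `get_examples` contract. As a compact summary (a sketch, not code from this commit):

```python
# get_examples must be a zero-argument callable returning a non-empty
# iterable of Example objects.
nlp.initialize(get_examples=lambda: train_examples)       # OK
# nlp.initialize(get_examples=train_examples)             # ValueError: not a callable
# nlp.initialize(get_examples=lambda: train_examples[0])  # TypeError: not an iterable of Examples
# nlp.initialize(get_examples=lambda: [])                 # ValueError: empty data sample
```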


@ -26,7 +26,7 @@ def test_simple_train():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("answer")
nlp.begin_training()
nlp.initialize()
for i in range(5):
for text, answer in [
("aaaa", 1.0),
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
textcat = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
textcat.add_label(letter)
optimizer = textcat.begin_training(lambda: [])
optimizer = textcat.initialize(lambda: [])
for i in range(30):
losses = {}
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@ -86,7 +86,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -95,7 +95,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -103,14 +103,14 @@ def test_no_resize():
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
nlp.initialize()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
@ -119,12 +119,12 @@ def test_begin_training_examples():
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -139,7 +139,7 @@ def test_overfitting_IO():
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2
for i in range(50):
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)


@ -88,7 +88,7 @@ def test_init_tok2vec():
nlp = English()
tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == []
nlp.begin_training()
nlp.initialize()
assert tok2vec.model.get_dim("nO")
@ -154,7 +154,7 @@ def test_tok2vec_listener():
# Check that the Tok2Vec component finds it listeners
assert tok2vec.listeners == []
optimizer = nlp.begin_training(lambda: train_examples)
optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec]
for i in range(5):


@ -428,7 +428,7 @@ def test_issue999():
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
ner.add_label(label)
nlp.begin_training()
nlp.initialize()
for itn in range(20):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:


@ -250,7 +250,7 @@ def test_issue1915():
ner = nlp.add_pipe("ner")
ner.add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)
nlp.initialize(**cfg)
def test_issue1945():


@ -30,7 +30,7 @@ def test_issue2179():
nlp = Italian()
ner = nlp.add_pipe("ner")
ner.add_label("CITIZENSHIP")
nlp.begin_training()
nlp.initialize()
nlp2 = Italian()
nlp2.add_pipe("ner")
assert len(nlp2.get_pipe("ner").labels) == 0


@ -18,7 +18,7 @@ def test_issue2564():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
doc = nlp("hello world")
assert doc.has_annotation("TAG")
docs = nlp.pipe(["hello", "world"])
@ -149,7 +149,7 @@ def test_issue2800():
ner = nlp.add_pipe("ner")
for entity_type in list(entity_types):
ner.add_label(entity_type)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(20):
losses = {}
random.shuffle(train_data)


@ -92,7 +92,7 @@ def test_issue3209():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("ANIMAL")
nlp.begin_training()
nlp.initialize()
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
assert ner.move_names == move_names
nlp2 = English()
@ -239,7 +239,7 @@ def test_issue3456():
nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
list(nlp.pipe(["hi", ""]))


@ -223,7 +223,7 @@ def test_issue3611():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels
@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels
@ -342,7 +342,7 @@ def test_issue3880():
nlp.add_pipe("parser").add_label("dep")
nlp.add_pipe("ner").add_label("PERSON")
nlp.add_pipe("tagger").add_label("NN")
nlp.begin_training()
nlp.initialize()
for doc in nlp.pipe(texts):
pass


@ -66,7 +66,7 @@ def test_issue4030():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -87,7 +87,7 @@ def test_issue4042():
# add ner pipe
ner = nlp.add_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.begin_training()
nlp.initialize()
# Add entity ruler
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
@ -118,7 +118,7 @@ def test_issue4042_bug2():
# add ner pipe
ner1 = nlp1.add_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.begin_training()
nlp1.initialize()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
@ -244,7 +244,7 @@ def test_issue4267():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PEOPLE")
nlp.begin_training()
nlp.initialize()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
@ -299,7 +299,7 @@ def test_issue4313():
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training(lambda: [])
ner.initialize(lambda: [])
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
@ -327,7 +327,7 @@ def test_issue4348():
TRAIN_DATA = [example, example]
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))


@ -180,7 +180,7 @@ def test_issue4725_2():
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass


@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
return tagger
@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
nlp.begin_training()
nlp.initialize()
return entity_linker


@ -25,7 +25,7 @@ def test_issue5551():
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training()
nlp.initialize()
# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])


@ -152,7 +152,7 @@ def test_serialize_nlp():
nlp_config = Config().from_str(nlp_config_string)
nlp = load_model_from_config(nlp_config, auto_fill=True)
nlp.get_pipe("tagger").add_label("A")
nlp.begin_training()
nlp.initialize()
assert "tok2vec" in nlp.pipe_names
assert "tagger" in nlp.pipe_names
assert "parser" not in nlp.pipe_names
@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
parser_cfg = dict()
parser_cfg["model"] = {"@architectures": "my_test_parser"}
nlp.add_pipe("parser", config=parser_cfg)
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)
@ -191,7 +191,7 @@ def test_serialize_parser():
model_config = Config().from_str(parser_config_string)
parser = nlp.add_pipe("parser", config=model_config)
parser.add_label("nsubj")
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)


@ -18,7 +18,7 @@ def nlp():
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
nlp.begin_training()
nlp.initialize()
return nlp


@ -47,7 +47,7 @@ def test_readers():
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(dev_corpus(nlp)))
@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
assert example.y.cats
# this shouldn't fail if each training example has at least one positive label


@ -600,7 +600,7 @@ def _train_tuples(train_data):
train_examples = []
for t in train_data:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))


@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# Verify the config after calling 'initialize' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:


@ -517,18 +517,18 @@ specific data and challenge.
Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
| Name | Description |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size`        | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer`   | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:
The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {#EmptyKB}


@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.begin_training {#begin_training tag="method"}
## DependencyParser.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```
@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>


@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.begin_training {#begin_training tag="method"}
## EntityLinker.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```


@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.begin_training {#begin_training tag="method"}
## EntityRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```
@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>


@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.begin_training {#begin_training tag="method"}
## Language.initialize {#initialize tag="method"}
Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
passed each component's [`begin_training`](/api/pipe#begin_training) method, if
passed each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.
If no `get_examples` function is provided when calling `nlp.begin_training`, the
If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).
<Infobox variant="warning" title="Changed in v3.0">
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
The `Language.update` method now takes a **function** that is called with no
arguments and returns a sequence of [`Example`](/api/example) objects instead of
tuples of `Doc` and `GoldParse` objects.
This method was previously called `begin_training`. It now also takes a
**function** that is called with no arguments and returns a sequence of
[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
objects.
</Infobox>
@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
> optimizer = nlp.begin_training(get_examples)
> optimizer = nlp.initialize(get_examples)
> ```
| Name | Description |
@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
> nlp.begin_training()
> nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
> nlp.begin_training()
> nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
> nlp.begin_training()
> nlp.initialize()
> disabled.restore()
> ```
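
To make the two paths concrete (initializing with a representative sample versus with generic data), a short sketch; the tag values are illustrative:

```python
from spacy.lang.en import English
from spacy.training import Example

# 1) With a data sample: labels and output dimensions are inferred.
nlp = English()
nlp.add_pipe("tagger")
doc = nlp.make_doc("I like green eggs")
examples = [Example.from_dict(doc, {"tags": ["N", "V", "J", "N"]})]
optimizer = nlp.initialize(get_examples=lambda: examples)

# 2) Without get_examples: generic dummy docs are used, so the output
#    dimension must already be set, e.g. via add_label or in the config.
nlp2 = English()
nlp2.add_pipe("tagger").add_label("N")
optimizer2 = nlp2.initialize()
```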


@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Morphologizer.begin_training {#begin_training tag="method"}
## Morphologizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```
@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). Note that you don't have to
call this method if you provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
already been fully [initialized](#initialize). Note that you don't have to call
this method if you provide a **representative data sample** to the
[`initialize`](#initialize) method. In this case, all labels found in the sample
will be automatically added to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
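
For reference, the label format this paragraph describes (the UPOS included as the `POS` field) looks like the following sketch, mirroring the tests earlier in this commit:

```python
from spacy.lang.en import English
from spacy.morphology import Morphology

nlp = English()
morphologizer = nlp.add_pipe("morphologizer")
# Field-separated label: sets token.pos_ to NOUN alongside token.morph.
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
nlp.initialize()
```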
> #### Example
>


@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.begin_training {#begin_training tag="method"}
## Pipe.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```
@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). If these conditions are
violated, the function will raise an Error. The exception to this rule is when
the component is [resizable](#is_resizable), in which case
already been fully [initialized](#initialize). If these conditions are violated,
the function will raise an Error. The exception to this rule is when the
component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.
@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |
Note that in general, you don't have to call `pipe.add_label` if you provide a
representative data sample to the [`begin_training`](#begin_training) method. In
this case, all labels found in the sample will be automatically added to the
model, and the output dimension will be
representative data sample to the [`initialize`](#initialize) method. In this
case, all labels found in the sample will be automatically added to the model,
and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
## Pipe.is_resizable {#is_resizable tag="method"}


@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## SentenceRecognizer.begin_training {#begin_training tag="method"}
## SentenceRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```

View File

@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.begin_training {#begin_training tag="method"}
## Tagger.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```
@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
> #### Example
>

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.begin_training {#begin_training tag="method"}
## TextCategorizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -196,7 +202,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```
@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
> #### Example
>

View File

@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tok2Vec.begin_training {#begin_training tag="method"}
## Tok2Vec.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```

View File

@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Transformer.begin_training {#begin_training tag="method"}
## Transformer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```

View File

@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
functionality is triggered when
[`nlp.begin_training`](/api/language#begin_training) is called.
functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
called.
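The same shape-inference mechanism can be seen in isolation at the Thinc level. A minimal sketch, assuming a bare `Linear` layer and arbitrary sample shapes:

```python
# A Linear layer created without dimensions: initialize() infers nI and nO
# from the sample arrays. The shapes here are arbitrary.
import numpy
from thinc.api import Linear

model = Linear()  # no input/output dims set yet
X = numpy.zeros((10, 4), dtype="f")  # sample inputs
Y = numpy.zeros((10, 2), dtype="f")  # sample outputs
model.initialize(X=X, Y=Y)
print(model.get_dim("nI"), model.get_dim("nO"))  # 4 2
```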
### Dropout and normalization in Thinc {#thinc-dropout-norm}
@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `begin_training`, correlation with add_label
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->

View File

@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated, and how evaluation scores are added to the training output.
| Name | Description |
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
| Name | Description |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
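As a rough skeleton (method bodies elided and signatures abridged from the Pipe API above; the component name is hypothetical), a custom trainable component overriding these methods might be laid out like this:

```python
# Skeleton only – bodies elided and signatures abridged; see the Pipe API
# reference for the full parameters.
from spacy.pipeline import Pipe

class CustomComponent(Pipe):
    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        # Validate the model and infer shapes from sample Examples,
        # falling back to create_optimizer() if no optimizer is given.
        ...

    def update(self, examples, *, drop=0.0, sgd=None, losses=None):
        # Predict on the examples, backprop the gradient returned by
        # get_loss and update the losses dict.
        ...

    def get_loss(self, examples, scores):
        # Return a (loss, d_scores) tuple for the batch.
        ...

    def score(self, examples, **kwargs):
        # Return a dictionary of scores, weighted according to the
        # default_score_weights defined on @Language.factory.
        ...
```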
<Infobox title="Custom trainable components and models" emoji="📖">

View File

@ -1045,8 +1045,8 @@ of being dropped.
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
> their models.
> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
> return an optimizer to update the component model weights.
> - [`nlp.initialize`](/api/language#initialize): Start the training and return
> an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
> state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.
@ -1057,7 +1057,7 @@ of being dropped.
```python
### Example training loop
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for itn in range(100):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:

View File

@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
[`Pipe.update`](/api/pipe#update) methods now all take batches of
[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
raw text and a dictionary of annotations.
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples.
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
sequence of `Example` objects to initialize the model instead of a list of
tuples.
- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
[`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
patterns as the second argument (instead of a variable number of arguments).
@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
@ -936,7 +938,7 @@ TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London.", {"entities": [(7, 13, "LOC")]}),
]
nlp.begin_training()
nlp.initialize()
for i in range(20):
random.shuffle(TRAIN_DATA)
for batch in minibatch(TRAIN_DATA):
@ -946,17 +948,18 @@ for i in range(20):
nlp.update(examples)
```
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples. The data examples are used to **initialize the models** of
`Language.begin_training` and `Pipe.begin_training` have been renamed to
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
that returns a sequence of `Example` objects to initialize the model instead of
a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.
```diff
- nlp.begin_training(examples)
+ nlp.begin_training(lambda: examples)
- nlp.begin_training(examples)
+ nlp.initialize(lambda: examples)
```
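For example, data kept in the old `(text, annotations)` tuple format can be wrapped on the fly. A hedged sketch, reusing the toy NER data from above:

```python
# Migration sketch: wrap v2-style (text, annotations) tuples in Example
# objects and hand initialize a function that returns them.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("ner")
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
]
optimizer = nlp.initialize(lambda: examples)
```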
#### Packaging trained pipelines {#migrating-training-packaging}