begin_training -> initialize

Ines Montani 2020-09-28 21:35:09 +02:00
parent 046f655d86
commit ff9a63bfbd
57 changed files with 301 additions and 253 deletions

View File

@ -103,12 +103,12 @@ def debug_model(
with data_validation(False):
try:
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
nlp.begin_training(lambda: train_corpus(nlp))
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
except ValueError:
try:
_set_output_dim(nO=7, model=model)
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(
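For context, the fallback above builds throwaway `Example` objects just to get the model initialized. A minimal sketch of that pattern, assuming spaCy v3's `spacy.training.Example` and a plain English pipeline (the snippet is illustrative, not part of this commit):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")  # define the label scheme up front, since the dummy data carries no tags

# Dummy-data fallback: wrap plain Docs in empty Examples and pass a zero-argument callback
docs = [nlp.make_doc("hello world")]
nlp.initialize(lambda: [Example.from_dict(doc, {}) for doc in docs])
```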

View File

@ -85,6 +85,7 @@ class Warnings:
"attribute or operator.")
# TODO: fix numbering after merging develop into master
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
@ -306,7 +307,7 @@ class Errors:
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?")
"call initialize()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "
@ -376,7 +377,7 @@ class Errors:
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's begin_training method.")
"examples to the component's initialize method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access `{path}`.")
E147 = ("Unexpected error in the {method} functionality of the "
@ -517,7 +518,7 @@ class Errors:
"but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
E930 = ("Received invalid get_examples callback in {name}.initialize. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "

View File

@ -1154,6 +1154,16 @@ class Language:
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd, device=device)
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
@ -1163,11 +1173,11 @@ class Language:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/language#begin_training
DOCS: https://nightly.spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]
@ -1179,7 +1189,7 @@ class Language:
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
name="Language.initialize", types=type(example)
)
raise ValueError(err)
else:
@ -1198,8 +1208,8 @@ class Language:
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "begin_training"):
proc.begin_training(
if hasattr(proc, "initialize"):
proc.initialize(
get_examples, pipeline=self.pipeline, sgd=self._optimizer
)
self._link_components()
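In practice the rename is backwards compatible: the old method is kept as a thin shim that warns and delegates, as shown above. A minimal before/after sketch, assuming an English pipeline with an `ner` component (illustrative only):

```python
from spacy.lang.en import English

nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PERSON")

# New name: initialize the pipeline (optionally passing a get_examples callback)
optimizer = nlp.initialize()

# Old name: still callable, but emits DeprecationWarning W089 and
# delegates straight to nlp.initialize(), per the shim above.
# optimizer = nlp.begin_training()
```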

View File

@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)
@property
def labels(self):

View File

@ -140,7 +140,7 @@ class EntityLinker(Pipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -159,7 +159,7 @@ class EntityLinker(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
self._ensure_examples(get_examples)
self._require_kb()

View File

@ -129,7 +129,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -142,7 +142,7 @@ class Morphologizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""
self._ensure_examples(get_examples)
# First, fetch all labels from the data

View File

@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
self.model.output_layer.initialize(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd

View File

@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline)
labeller.initialize(get_examples, pipeline=pipeline)
@property
def labels(self):

View File

@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
@ -198,7 +198,7 @@ cdef class Pipe:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/pipe#begin_training
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
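Since the base class only raises `NotImplementedError`, each trainable component supplies its own `initialize`. A rough sketch of what a custom subclass might do, assuming the base-class helpers `_ensure_examples` and `create_optimizer` shown elsewhere in this diff; the component and all of its names are hypothetical:

```python
from spacy.pipeline import Pipe

class CustomPipe(Pipe):
    """Hypothetical trainable component, not part of this commit."""

    def __init__(self, vocab, model, name="custom"):
        self.vocab = vocab
        self.model = model
        self.name = name

    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        # Validate that get_examples is a callable yielding Example objects
        self._ensure_examples(get_examples)
        # Use the predicted docs as a sample batch to infer missing shapes
        doc_sample = [eg.predicted for eg in get_examples()]
        self.model.initialize(X=doc_sample)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
```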

View File

@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
pass
def __call__(self, doc):

View File

@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -256,7 +256,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.
@ -269,7 +269,7 @@ class Tagger(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tagger#begin_training
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label])
return 1
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""
self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model

View File

@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
def get_loss(self, examples, scores) -> None:
pass
def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

View File

@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
self._ensure_examples(get_examples)
self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})

View File

@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
doc.ents = [("ANIMAL", 3, 4)]
@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents)

View File

@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(5):
@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.begin_training(lambda: [_ner_example(ner1)])
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes

View File

@ -202,7 +202,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.begin_training()
nlp.initialize()
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
@ -213,7 +213,7 @@ def test_train_empty():
def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
# The untrained NER will predict O for each token
doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
@ -235,7 +235,7 @@ def test_empty_ner():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]
@ -254,7 +254,7 @@ def test_ruler_before_ner():
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]
@ -269,7 +269,7 @@ def test_ner_before_ruler():
# 1: untrained NER - should set everything to O
untrained_ner = nlp.add_pipe("ner", name="uner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
# 2 : Entity Ruler - should set "this" to B and keep everything else O
patterns = [{"label": "THING", "pattern": "This"}]
@ -290,7 +290,7 @@ def test_block_ner():
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""]
@ -307,7 +307,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(50):
losses = {}
@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" in caplog.text
caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" not in caplog.text

View File

@ -191,7 +191,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -34,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)
for i in range(10):

View File

@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
"""Test that the EL can't train without defining a KB"""
entity_linker = nlp.add_pipe("entity_linker", config={})
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_empty(nlp):
@ -143,7 +143,7 @@ def test_kb_empty(nlp):
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])
def test_kb_serialize(nlp):
@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
nlp.begin_training()
nlp.initialize()
assert entity_linker.model.get_dim("nO") == vector_length
# test whether the entity links are preserved by the `as_doc()` function
@ -463,7 +463,7 @@ def test_overfitting_IO():
)
# train the NEL pipe
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

View File

@ -33,7 +33,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -42,7 +42,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -50,13 +50,13 @@ def test_no_resize():
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
nlp.initialize()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
@ -64,12 +64,12 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -79,7 +79,7 @@ def test_overfitting_IO():
train_examples = []
for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}

View File

@ -31,19 +31,19 @@ TRAIN_DATA = [
]
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -58,7 +58,7 @@ def test_overfitting_IO():
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe("senter")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(200):
losses = {}

View File

@ -15,14 +15,14 @@ def test_label_types():
tagger.add_label(9)
def test_tagger_begin_training_tag_map():
"""Test that Tagger.begin_training() without gold tuples does not clobber
def test_tagger_initialize_tag_map():
"""Test that Tagger.initialize() without gold tuples does not clobber
the tag map."""
nlp = Language()
tagger = nlp.add_pipe("tagger")
orig_tag_count = len(tagger.labels)
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)
@ -38,7 +38,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_no_resize():
@ -47,7 +47,7 @@ def test_no_resize():
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
nlp.initialize()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):
@ -60,10 +60,10 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []
@ -72,16 +72,16 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
nlp.initialize(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
nlp.initialize(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -91,7 +91,7 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert tagger.model.get_dim("nO") == len(TAGS)
for i in range(50):
@ -122,4 +122,4 @@ def test_tagger_requires_labels():
nlp = English()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()

View File

@ -26,7 +26,7 @@ def test_simple_train():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("answer")
nlp.begin_training()
nlp.initialize()
for i in range(5):
for text, answer in [
("aaaa", 1.0),
@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
textcat = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
textcat.add_label(letter)
optimizer = textcat.begin_training(lambda: [])
optimizer = textcat.initialize(lambda: [])
for i in range(30):
losses = {}
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
@ -86,7 +86,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
def test_implicit_label():
@ -95,7 +95,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)
def test_no_resize():
@ -103,14 +103,14 @@ def test_no_resize():
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
nlp.initialize()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")
def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
@ -119,12 +119,12 @@ def test_begin_training_examples():
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)
def test_overfitting_IO():
@ -139,7 +139,7 @@ def test_overfitting_IO():
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2
for i in range(50):
@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -88,7 +88,7 @@ def test_init_tok2vec():
nlp = English()
tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == []
nlp.begin_training()
nlp.initialize()
assert tok2vec.model.get_dim("nO")
@ -154,7 +154,7 @@ def test_tok2vec_listener():
# Check that the Tok2Vec component finds its listeners
assert tok2vec.listeners == []
optimizer = nlp.begin_training(lambda: train_examples)
optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec]
for i in range(5):

View File

@ -428,7 +428,7 @@ def test_issue999():
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
ner.add_label(label)
nlp.begin_training()
nlp.initialize()
for itn in range(20):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:

View File

@ -250,7 +250,7 @@ def test_issue1915():
ner = nlp.add_pipe("ner")
ner.add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)
nlp.initialize(**cfg)
def test_issue1945():

View File

@ -30,7 +30,7 @@ def test_issue2179():
nlp = Italian()
ner = nlp.add_pipe("ner")
ner.add_label("CITIZENSHIP")
nlp.begin_training()
nlp.initialize()
nlp2 = Italian()
nlp2.add_pipe("ner")
assert len(nlp2.get_pipe("ner").labels) == 0

View File

@ -18,7 +18,7 @@ def test_issue2564():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
doc = nlp("hello world")
assert doc.has_annotation("TAG")
docs = nlp.pipe(["hello", "world"])
@ -149,7 +149,7 @@ def test_issue2800():
ner = nlp.add_pipe("ner")
for entity_type in list(entity_types):
ner.add_label(entity_type)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(20):
losses = {}
random.shuffle(train_data)

View File

@ -92,7 +92,7 @@ def test_issue3209():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("ANIMAL")
nlp.begin_training()
nlp.initialize()
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
assert ner.move_names == move_names
nlp2 = English()
@ -239,7 +239,7 @@ def test_issue3456():
nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
list(nlp.pipe(["hi", ""]))

View File

@ -223,7 +223,7 @@ def test_issue3611():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels
@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels
@ -342,7 +342,7 @@ def test_issue3880():
nlp.add_pipe("parser").add_label("dep")
nlp.add_pipe("ner").add_label("PERSON")
nlp.add_pipe("tagger").add_label("NN")
nlp.begin_training()
nlp.initialize()
for doc in nlp.pipe(texts):
pass

View File

@ -66,7 +66,7 @@ def test_issue4030():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
@ -87,7 +87,7 @@ def test_issue4042():
# add ner pipe
ner = nlp.add_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.begin_training()
nlp.initialize()
# Add entity ruler
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},
@ -118,7 +118,7 @@ def test_issue4042_bug2():
# add ner pipe
ner1 = nlp1.add_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.begin_training()
nlp1.initialize()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1
@ -244,7 +244,7 @@ def test_issue4267():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PEOPLE")
nlp.begin_training()
nlp.initialize()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")
@ -299,7 +299,7 @@ def test_issue4313():
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training(lambda: [])
ner.initialize(lambda: [])
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1
@ -327,7 +327,7 @@ def test_issue4348():
TRAIN_DATA = [example, example]
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

View File

@ -180,7 +180,7 @@ def test_issue4725_2():
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
return tagger
@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
nlp.begin_training()
nlp.initialize()
return entity_linker

View File

@ -25,7 +25,7 @@ def test_issue5551():
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training()
nlp.initialize()
# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])

View File

@ -152,7 +152,7 @@ def test_serialize_nlp():
nlp_config = Config().from_str(nlp_config_string)
nlp = load_model_from_config(nlp_config, auto_fill=True)
nlp.get_pipe("tagger").add_label("A")
nlp.begin_training()
nlp.initialize()
assert "tok2vec" in nlp.pipe_names
assert "tagger" in nlp.pipe_names
assert "parser" not in nlp.pipe_names
@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
parser_cfg = dict()
parser_cfg["model"] = {"@architectures": "my_test_parser"}
nlp.add_pipe("parser", config=parser_cfg)
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)
@ -191,7 +191,7 @@ def test_serialize_parser():
model_config = Config().from_str(parser_config_string)
parser = nlp.add_pipe("parser", config=model_config)
parser.add_label("nsubj")
nlp.begin_training()
nlp.initialize()
with make_tempdir() as d:
nlp.to_disk(d)

View File

@ -18,7 +18,7 @@ def nlp():
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
nlp.begin_training()
nlp.initialize()
return nlp

View File

@ -47,7 +47,7 @@ def test_readers():
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(dev_corpus(nlp)))
@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
assert example.y.cats
# this shouldn't fail if each training example has at least one positive label

View File

@ -600,7 +600,7 @@ def _train_tuples(train_data):
train_examples = []
for t in train_data:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))

View File

@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# Verify the config after calling 'initialize' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:

View File

@ -518,7 +518,7 @@ Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.
| Name | Description |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
@ -527,7 +527,7 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@ -556,10 +556,10 @@ vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@ -579,11 +579,11 @@ An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@ -630,9 +630,9 @@ The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
### spacy.EmptyKB.v1 {#EmptyKB}

View File

@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.begin_training {#begin_training tag="method"}
## DependencyParser.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```
@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.begin_training {#begin_training tag="method"}
## EntityLinker.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```

View File

@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.begin_training {#begin_training tag="method"}
## EntityRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```
@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.begin_training {#begin_training tag="method"}
## Language.initialize {#initialize tag="method"}
Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
passed each component's [`begin_training`](/api/pipe#begin_training) method, if
passed each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.
If no `get_examples` function is provided when calling `nlp.begin_training`, the
If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).
<Infobox variant="warning" title="Changed in v3.0">
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
The `Language.update` method now takes a **function** that is called with no
arguments and returns a sequence of [`Example`](/api/example) objects instead of
tuples of `Doc` and `GoldParse` objects.
This method was previously called `begin_training`. It now also takes a
**function** that is called with no arguments and returns a sequence of
[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
objects.
</Infobox>
@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
> optimizer = nlp.begin_training(get_examples)
> optimizer = nlp.initialize(get_examples)
> ```
| Name | Description |
@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
> nlp.begin_training()
> nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
> nlp.begin_training()
> nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
> nlp.begin_training()
> nlp.initialize()
> disabled.restore()
> ```
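The `get_examples` argument has a strict contract: it must be a function that takes no arguments and returns an iterable of `Example` objects. A short sketch of valid and invalid calls, mirroring the tests in this commit (the training example itself is made up):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
nlp.add_pipe("tagger")
train_examples = [
    Example.from_dict(nlp.make_doc("I like eggs"), {"tags": ["N", "V", "N"]})
]

# Valid: a zero-argument callable returning Example objects
optimizer = nlp.initialize(get_examples=lambda: train_examples)

# Invalid: passing the list itself raises a ValueError,
# and a callback that returns None raises a TypeError.
# nlp.initialize(get_examples=train_examples)
# nlp.initialize(get_examples=lambda: None)
```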

View File

@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Morphologizer.begin_training {#begin_training tag="method"}
## Morphologizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```
@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). Note that you don't have to
call this method if you provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
already been fully [initialized](#initialize). Note that you don't have to call
this method if you provide a **representative data sample** to the
[`initialize`](#initialize) method. In this case, all labels found in the sample
will be automatically added to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
> #### Example
>

View File

@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.begin_training {#begin_training tag="method"}
## Pipe.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```
@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). If these conditions are
violated, the function will raise an Error. The exception to this rule is when
the component is [resizable](#is_resizable), in which case
already been fully [initialized](#initialize). If these conditions are violated,
the function will raise an Error. The exception to this rule is when the
component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.
@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |
Note that in general, you don't have to call `pipe.add_label` if you provide a
representative data sample to the [`begin_training`](#begin_training) method. In
this case, all labels found in the sample will be automatically added to the
model, and the output dimension will be
representative data sample to the [`initialize`](#initialize) method. In this
case, all labels found in the sample will be automatically added to the model,
and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
## Pipe.is_resizable {#is_resizable tag="method"}

View File

@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## SentenceRecognizer.begin_training {#begin_training tag="method"}
## SentenceRecognizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```

View File

@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.begin_training {#begin_training tag="method"}
## Tagger.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```
@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
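
For example, a minimal sketch of the representative-sample path (assuming the v3 `spacy.training.Example` import path and a made-up one-sentence sample):

```python
import spacy
from spacy.training import Example  # assumed v3 import path

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")

# No add_label calls needed: the tags in the sample are added automatically
examples = [Example.from_dict(nlp.make_doc("I ran."), {"tags": ["PRON", "VERB", "PUNCT"]})]
optimizer = nlp.initialize(lambda: examples)
print(tagger.labels)  # tag set inferred from the sample
```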
> #### Example
>

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.begin_training {#begin_training tag="method"}
## TextCategorizer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
This method was previously called `begin_training`.
</Infobox>
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -196,7 +202,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```
@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}
Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
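
A minimal sketch of the explicit `add_label` path, with made-up labels, might look like this:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")

# Labels must be added before the model is fully initialized ...
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
optimizer = nlp.initialize()
# ... adding another label at this point would raise an error (see above)
```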
> #### Example
>

View File

@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tok2Vec.begin_training {#begin_training tag="method"}
## Tok2Vec.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```

View File

@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Transformer.begin_training {#begin_training tag="method"}
## Transformer.initialize {#initialize tag="method"}
Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```
| Name | Description |
@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```

View File

@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
functionality is triggered when
[`nlp.begin_training`](/api/language#begin_training) is called.
functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
called.
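
For a sense of what that shape inference looks like at the Thinc level, here's a minimal standalone sketch with made-up dimensions (not spaCy-specific):

```python
import numpy
from thinc.api import Linear

model = Linear()  # no input or output dimension set yet
X = numpy.zeros((8, 300), dtype="f")  # sample inputs
Y = numpy.zeros((8, 5), dtype="f")    # sample outputs
model.initialize(X=X, Y=Y)            # dimensions inferred from the sample data
print(model.get_dim("nI"), model.get_dim("nO"))  # 300 5
```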
### Dropout and normalization in Thinc {#thinc-dropout-norm}
@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `begin_training`, correlation with add_label
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->

View File

@ -1127,9 +1127,9 @@ customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.
| Name | Description |
| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score)                   | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
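
Roughly, and assuming `examples` is a list of [`Example`](/api/example) objects, the methods above are triggered like this during a training run:

```python
optimizer = nlp.initialize(lambda: examples)  # calls each component's initialize()
losses = nlp.update(examples, sgd=optimizer)  # calls update(), which uses get_loss()
scores = nlp.evaluate(examples)               # calls score()
```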

View File

@ -1045,8 +1045,8 @@ of being dropped.
> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
> their models.
> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
> return an optimizer to update the component model weights.
> - [`nlp.initialize`](/api/language#initialize): Start the training and return
> an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
> state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.
@ -1057,7 +1057,7 @@ of being dropped.
```python
### Example training loop
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for itn in range(100):
random.shuffle(train_data)
for raw_text, entity_offsets in train_data:

View File

@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
[`Pipe.update`](/api/pipe#update) methods now all take batches of
[`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
raw text and a dictionary of annotations.
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples.
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
sequence of `Example` objects to initialize the model instead of a list of
tuples.
- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
[`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
patterns as the second argument (instead of a variable number of arguments).
@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
@ -936,7 +938,7 @@ TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London.", {"entities": [(7, 13, "LOC")]}),
]
nlp.begin_training()
nlp.initialize()
for i in range(20):
random.shuffle(TRAIN_DATA)
for batch in minibatch(TRAIN_DATA):
@ -946,17 +948,18 @@ for i in range(20):
nlp.update(examples)
```
[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples. The data examples are used to **initialize the models** of
`Language.begin_training` and `Pipe.begin_training` have been renamed to
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
that returns a sequence of `Example` objects to initialize the model instead of
a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.
```diff
- nlp.begin_training(examples)
+ nlp.begin_training(lambda: examples)
- nlp.initialize(examples)
+ nlp.initialize(lambda: examples)
```
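
Assuming the v3 `spacy.training.Example` import path and the `TRAIN_DATA` and `nlp` from the loop above, the `examples` passed to the callback might be built like this:

```python
from spacy.training import Example  # assumed v3 import path

examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))

optimizer = nlp.initialize(lambda: examples)
```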
#### Packaging trained pipelines {#migrating-training-packaging}