Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)

Commit ff9a63bfbd: begin_training -> initialize
Parent: 046f655d86
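In short, this commit renames the user-facing training setup method across the library, tests and docs. A minimal before/after sketch (illustrative only; the pipeline component and label are placeholders, not taken from the diff):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("ner").add_label("PERSON")

# Old spelling: still accepted after this commit, but it now emits
# DeprecationWarning W089 and simply delegates to the new method.
# optimizer = nlp.begin_training()

# New spelling:
optimizer = nlp.initialize()
```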
@@ -103,12 +103,12 @@ def debug_model(
with data_validation(False):
try:
train_corpus = dot_to_object(config, config["training"]["train_corpus"])
nlp.begin_training(lambda: train_corpus(nlp))
nlp.initialize(lambda: train_corpus(nlp))
msg.info("Initialized the model with the training corpus.")
except ValueError:
try:
_set_output_dim(nO=7, model=model)
nlp.begin_training(lambda: [Example.from_dict(x, {}) for x in X])
nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
msg.info("Initialized the model with dummy data.")
except Exception:
msg.fail(
@@ -85,6 +85,7 @@ class Warnings:
"attribute or operator.")

# TODO: fix numbering after merging develop into master
W089 = ("The nlp.begin_training method has been renamed to nlp.initialize.")
W090 = ("Could not locate any {format} files in path '{path}'.")
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")

@@ -306,7 +307,7 @@ class Errors:
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
E109 = ("Component '{name}' could not be run. Did you forget to "
"call begin_training()?")
"call initialize()?")
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
E111 = ("Pickling a token is not supported, because tokens are only views "
"of the parent Doc and can't exist on their own. A pickled token "

@@ -376,7 +377,7 @@ class Errors:
"provided {found}.")
E143 = ("Labels for component '{name}' not initialized. This can be fixed "
"by calling add_label, or by providing a representative batch of "
"examples to the component's begin_training method.")
"examples to the component's initialize method.")
E145 = ("Error reading `{param}` from input file.")
E146 = ("Could not access `{path}`.")
E147 = ("Unexpected error in the {method} functionality of the "

@@ -517,7 +518,7 @@ class Errors:
"but the provided argument {loc} points to a file.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
"not seem to exist.")
E930 = ("Received invalid get_examples callback in {name}.begin_training. "
E930 = ("Received invalid get_examples callback in {name}.initialize. "
"Expected function that returns an iterable of Example objects but "
"got: {obj}")
E931 = ("Encountered Pipe subclass without Pipe.{method} method in component "
@@ -1154,6 +1154,16 @@ class Language:
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
warnings.warn(Warnings.W089, DeprecationWarning)
return self.initialize(get_examples, sgd=sgd, device=device)

def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
sgd: Optional[Optimizer] = None,
device: int = -1,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.

@@ -1163,11 +1173,11 @@ class Language:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/language#begin_training
DOCS: https://nightly.spacy.io/api/language#initialize
"""
if get_examples is None:
util.logger.debug(
"No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
)
doc = Doc(self.vocab, words=["x", "y", "z"])
get_examples = lambda: [Example.from_dict(doc, {})]

@@ -1179,7 +1189,7 @@ class Language:
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
name="Language.initialize", types=type(example)
)
raise ValueError(err)
else:

@@ -1198,8 +1208,8 @@ class Language:
sgd = create_default_optimizer()
self._optimizer = sgd
for name, proc in self.pipeline:
if hasattr(proc, "begin_training"):
proc.begin_training(
if hasattr(proc, "initialize"):
proc.initialize(
get_examples, pipeline=self.pipeline, sgd=self._optimizer
)
self._link_components()
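`Language.initialize` keeps the same signature as the old `begin_training`: an optional `get_examples` callback plus `sgd` and `device` keyword arguments. A minimal usage sketch based on the code above and the tests further down (the tagger label and example text are placeholders, and the import path assumes the v3 nightly `spacy.training.Example`):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")

# With a callback that returns a representative sample of Example objects:
examples = [Example.from_dict(nlp.make_doc("hello world"), {"tags": ["A", "A"]})]
optimizer = nlp.initialize(get_examples=lambda: examples)

# Without a callback, dummy examples are created internally, so output
# dimensions must already be known (e.g. from add_label or the config):
# optimizer = nlp.initialize()
```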
@@ -132,7 +132,7 @@ cdef class DependencyParser(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd)
labeller.initialize(get_examples, pipeline=pipeline, sgd=sgd)

@property
def labels(self):

@@ -140,7 +140,7 @@ class EntityLinker(Pipe):
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))

def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,

@@ -159,7 +159,7 @@ class EntityLinker(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
self._ensure_examples(get_examples)
self._require_kb()

@@ -129,7 +129,7 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1

def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.

@@ -142,7 +142,7 @@ class Morphologizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""
self._ensure_examples(get_examples)
# First, fetch all labels from the data

@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass

def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)

@@ -177,10 +177,10 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass

def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X)
self.model.output_layer.initialize(X)
if sgd is None:
sgd = self.create_optimizer()
return sgd

@@ -103,7 +103,7 @@ cdef class EntityRecognizer(Parser):
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.begin_training(get_examples, pipeline=pipeline)
labeller.initialize(get_examples, pipeline=pipeline)

@property
def labels(self):

@@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()

def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly

@@ -198,7 +198,7 @@ cdef class Pipe:
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/pipe#begin_training
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)

def begin_training(self, get_examples, pipeline=None, sgd=None):
def initialize(self, get_examples, pipeline=None, sgd=None):
pass

def __call__(self, doc):

@@ -124,7 +124,7 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores

def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.

@@ -137,7 +137,7 @@ class SentenceRecognizer(Tagger):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

@@ -256,7 +256,7 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores

def begin_training(self, get_examples, *, pipeline=None, sgd=None):
def initialize(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using a representative set
of data examples.

@@ -269,7 +269,7 @@ class Tagger(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/tagger#begin_training
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

@@ -334,7 +334,7 @@ class TextCategorizer(Pipe):
self.labels = tuple(list(self.labels) + [label])
return 1

def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,

@@ -353,7 +353,7 @@ class TextCategorizer(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""
self._ensure_examples(get_examples)
subbatch = [] # Select a subbatch of examples to initialize the model

@@ -203,7 +203,7 @@ class Tok2Vec(Pipe):
def get_loss(self, examples, scores) -> None:
pass

def begin_training(
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,

@@ -222,7 +222,7 @@ class Tok2Vec(Pipe):
create_optimizer if it doesn't exist.
RETURNS (thinc.api.Optimizer): The optimizer.

DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""
self._ensure_examples(get_examples)
doc_sample = []

@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)

def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
def initialize(self, get_examples, pipeline=None, sgd=None, **kwargs):
self._ensure_examples(get_examples)
self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
@@ -26,7 +26,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)

doc.ents = [("ANIMAL", 3, 4)]

@@ -48,7 +48,7 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: [_ner_example(ner)])
ner.initialize(lambda: [_ner_example(ner)])
ner(doc)
orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents)

@@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser):
fix_random_seed(1)
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)

for i in range(5):

@@ -87,7 +87,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C")
ner1.add_label("B")
ner1.add_label("A")
ner1.begin_training(lambda: [_ner_example(ner1)])
ner1.initialize(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config)

# the second model needs to be resized before we can call from_bytes
@@ -202,7 +202,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.begin_training()
nlp.initialize()
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)

@@ -213,7 +213,7 @@ def test_train_empty():
def test_overwrite_token():
nlp = English()
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
# The untrained NER will predict O for each token
doc = nlp("I live in New York")
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]

@@ -235,7 +235,7 @@ def test_empty_ner():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("John is watching the news about Croatia's elections")
# if this goes wrong, the initialization of the parser's upper layer is probably broken
result = ["O", "O", "O", "O", "O", "O", "O", "O", "O"]

@@ -254,7 +254,7 @@ def test_ruler_before_ner():
# 2: untrained NER - should set everything else to O
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti Korhonen speaking in Finland")
expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
expected_types = ["THING", "", "", "", "", "", ""]

@@ -269,7 +269,7 @@ def test_ner_before_ruler():
# 1: untrained NER - should set everything to O
untrained_ner = nlp.add_pipe("ner", name="uner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()

# 2 : Entity Ruler - should set "this" to B and keep everything else O
patterns = [{"label": "THING", "pattern": "This"}]

@@ -290,7 +290,7 @@ def test_block_ner():
nlp.add_pipe("blocker", config={"start": 2, "end": 5})
untrained_ner = nlp.add_pipe("ner")
untrained_ner.add_label("MY_LABEL")
nlp.begin_training()
nlp.initialize()
doc = nlp("This is Antti L Korhonen speaking in Finland")
expected_iobs = ["O", "O", "B", "B", "B", "O", "O", "O"]
expected_types = ["", "", "", "", "", "", "", ""]

@@ -307,7 +307,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
optimizer = nlp.initialize()

for i in range(50):
losses = {}

@@ -340,13 +340,13 @@ def test_ner_warns_no_lookups(caplog):
assert not len(nlp.vocab.lookups)
nlp.add_pipe("ner")
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" in caplog.text
caplog.clear()
nlp.vocab.lookups.add_table("lexeme_norm")
nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
with caplog.at_level(logging.DEBUG):
nlp.begin_training()
nlp.initialize()
assert "W033" not in caplog.text

@@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name

def __call__(self, doc):
doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
doc.set_ents([], blocked=[doc[self.start : self.end]], default="unmodified")
return doc
@@ -191,7 +191,7 @@ def test_overfitting_IO():
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(100):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

@@ -34,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
parser.initialize(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001)

for i in range(10):

@@ -134,7 +134,7 @@ def test_kb_undefined(nlp):
"""Test that the EL can't train without defining a KB"""
entity_linker = nlp.add_pipe("entity_linker", config={})
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])


def test_kb_empty(nlp):

@@ -143,7 +143,7 @@ def test_kb_empty(nlp):
entity_linker = nlp.add_pipe("entity_linker", config=config)
assert len(entity_linker.kb) == 0
with pytest.raises(ValueError):
entity_linker.begin_training(lambda: [])
entity_linker.initialize(lambda: [])


def test_kb_serialize(nlp):

@@ -360,7 +360,7 @@ def test_preserving_links_asdoc(nlp):
ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
nlp.begin_training()
nlp.initialize()
assert entity_linker.model.get_dim("nO") == vector_length

# test whether the entity links are preserved by the `as_doc()` function

@@ -463,7 +463,7 @@ def test_overfitting_IO():
)

# train the NEL pipe
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
@@ -33,7 +33,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()


def test_implicit_label():

@@ -42,7 +42,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)


def test_no_resize():

@@ -50,13 +50,13 @@ def test_no_resize():
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
nlp.initialize()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")


def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")

@@ -64,12 +64,12 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)


def test_overfitting_IO():

@@ -79,7 +79,7 @@ def test_overfitting_IO():
train_examples = []
for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)

for i in range(50):
losses = {}
@@ -31,19 +31,19 @@ TRAIN_DATA = [
]


def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)


def test_overfitting_IO():

@@ -58,7 +58,7 @@ def test_overfitting_IO():
train_examples[1].reference[11].is_sent_start = False

nlp.add_pipe("senter")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()

for i in range(200):
losses = {}

@@ -15,14 +15,14 @@ def test_label_types():
tagger.add_label(9)


def test_tagger_begin_training_tag_map():
"""Test that Tagger.begin_training() without gold tuples does not clobber
def test_tagger_initialize_tag_map():
"""Test that Tagger.initialize() without gold tuples does not clobber
the tag map."""
nlp = Language()
tagger = nlp.add_pipe("tagger")
orig_tag_count = len(tagger.labels)
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels)


@@ -38,7 +38,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()


def test_no_resize():

@@ -47,7 +47,7 @@ def test_no_resize():
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
nlp.initialize()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):

@@ -60,10 +60,10 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)


def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []

@@ -72,16 +72,16 @@ def test_begin_training_examples():
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
nlp.initialize(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
nlp.initialize(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)


def test_overfitting_IO():

@@ -91,7 +91,7 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert tagger.model.get_dim("nO") == len(TAGS)

for i in range(50):

@@ -122,4 +122,4 @@ def test_tagger_requires_labels():
nlp = English()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()
@@ -26,7 +26,7 @@ def test_simple_train():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("answer")
nlp.begin_training()
nlp.initialize()
for i in range(5):
for text, answer in [
("aaaa", 1.0),

@@ -56,7 +56,7 @@ def test_textcat_learns_multilabel():
textcat = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
textcat.add_label(letter)
optimizer = textcat.begin_training(lambda: [])
optimizer = textcat.initialize(lambda: [])
for i in range(30):
losses = {}
examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]

@@ -86,7 +86,7 @@ def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
nlp.initialize()


def test_implicit_label():

@@ -95,7 +95,7 @@ def test_implicit_label():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize(get_examples=lambda: train_examples)


def test_no_resize():

@@ -103,14 +103,14 @@ def test_no_resize():
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
nlp.initialize()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")


def test_begin_training_examples():
def test_initialize_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []

@@ -119,12 +119,12 @@ def test_begin_training_examples():
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
nlp.initialize(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
nlp.initialize(get_examples=train_examples)


def test_overfitting_IO():

@@ -139,7 +139,7 @@ def test_overfitting_IO():
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2

for i in range(50):

@@ -195,7 +195,7 @@ def test_textcat_configs(textcat_config):
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)

@@ -88,7 +88,7 @@ def test_init_tok2vec():
nlp = English()
tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == []
nlp.begin_training()
nlp.initialize()
assert tok2vec.model.get_dim("nO")


@@ -154,7 +154,7 @@ def test_tok2vec_listener():

# Check that the Tok2Vec component finds it listeners
assert tok2vec.listeners == []
optimizer = nlp.begin_training(lambda: train_examples)
optimizer = nlp.initialize(lambda: train_examples)
assert tok2vec.listeners == [tagger_tok2vec]

for i in range(5):
@@ -428,7 +428,7 @@ def test_issue999():
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
ner.add_label(label)
nlp.begin_training()
nlp.initialize()
for itn in range(20):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:

@@ -250,7 +250,7 @@ def test_issue1915():
ner = nlp.add_pipe("ner")
ner.add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)
nlp.initialize(**cfg)


def test_issue1945():

@@ -30,7 +30,7 @@ def test_issue2179():
nlp = Italian()
ner = nlp.add_pipe("ner")
ner.add_label("CITIZENSHIP")
nlp.begin_training()
nlp.initialize()
nlp2 = Italian()
nlp2.add_pipe("ner")
assert len(nlp2.get_pipe("ner").labels) == 0

@@ -18,7 +18,7 @@ def test_issue2564():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
doc = nlp("hello world")
assert doc.has_annotation("TAG")
docs = nlp.pipe(["hello", "world"])

@@ -149,7 +149,7 @@ def test_issue2800():
ner = nlp.add_pipe("ner")
for entity_type in list(entity_types):
ner.add_label(entity_type)
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(20):
losses = {}
random.shuffle(train_data)

@@ -92,7 +92,7 @@ def test_issue3209():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("ANIMAL")
nlp.begin_training()
nlp.initialize()
move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
assert ner.move_names == move_names
nlp2 = English()

@@ -239,7 +239,7 @@ def test_issue3456():
nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
list(nlp.pipe(["hi", ""]))


@@ -223,7 +223,7 @@ def test_issue3611():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

@@ -268,7 +268,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels


@@ -283,7 +283,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [_parser_example(parser)])
parser.initialize(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels


@@ -342,7 +342,7 @@ def test_issue3880():
nlp.add_pipe("parser").add_label("dep")
nlp.add_pipe("ner").add_label("PERSON")
nlp.add_pipe("tagger").add_label("NN")
nlp.begin_training()
nlp.initialize()
for doc in nlp.pipe(texts):
pass
@@ -66,7 +66,7 @@ def test_issue4030():
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

@@ -87,7 +87,7 @@ def test_issue4042():
# add ner pipe
ner = nlp.add_pipe("ner")
ner.add_label("SOME_LABEL")
nlp.begin_training()
nlp.initialize()
# Add entity ruler
patterns = [
{"label": "MY_ORG", "pattern": "Apple"},

@@ -118,7 +118,7 @@ def test_issue4042_bug2():
# add ner pipe
ner1 = nlp1.add_pipe("ner")
ner1.add_label("SOME_LABEL")
nlp1.begin_training()
nlp1.initialize()
# add a new label to the doc
doc1 = nlp1("What do you think about Apple ?")
assert len(ner1.labels) == 1

@@ -244,7 +244,7 @@ def test_issue4267():
nlp = English()
ner = nlp.add_pipe("ner")
ner.add_label("PEOPLE")
nlp.begin_training()
nlp.initialize()
assert "ner" in nlp.pipe_names
# assert that we have correct IOB annotations
doc1 = nlp("hi")

@@ -299,7 +299,7 @@ def test_issue4313():
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training(lambda: [])
ner.initialize(lambda: [])
# add a new label to the doc
doc = nlp("What do you think about Apple ?")
assert len(ner.labels) == 1

@@ -327,7 +327,7 @@ def test_issue4348():
TRAIN_DATA = [example, example]
tagger = nlp.add_pipe("tagger")
tagger.add_label("A")
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))

@@ -180,7 +180,7 @@ def test_issue4725_2():
vocab.set_vector("dog", data[1])
nlp = English(vocab=vocab)
nlp.add_pipe("ner")
nlp.begin_training()
nlp.initialize()
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

@@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
tagger.add_label("A")
nlp.begin_training()
nlp.initialize()
return tagger


@@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons:
# 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization
nlp.begin_training()
nlp.initialize()
return entity_linker


@@ -25,7 +25,7 @@ def test_issue5551():
pipe = nlp.add_pipe(component, config=pipe_cfg, last=True)
for label in set(example[1]["cats"]):
pipe.add_label(label)
nlp.begin_training()
nlp.initialize()

# Store the result of each iteration
result = pipe.model.predict([nlp.make_doc(example[0])])
@@ -152,7 +152,7 @@ def test_serialize_nlp():
nlp_config = Config().from_str(nlp_config_string)
nlp = load_model_from_config(nlp_config, auto_fill=True)
nlp.get_pipe("tagger").add_label("A")
nlp.begin_training()
nlp.initialize()
assert "tok2vec" in nlp.pipe_names
assert "tagger" in nlp.pipe_names
assert "parser" not in nlp.pipe_names

@@ -173,7 +173,7 @@ def test_serialize_custom_nlp():
parser_cfg = dict()
parser_cfg["model"] = {"@architectures": "my_test_parser"}
nlp.add_pipe("parser", config=parser_cfg)
nlp.begin_training()
nlp.initialize()

with make_tempdir() as d:
nlp.to_disk(d)

@@ -191,7 +191,7 @@ def test_serialize_parser():
model_config = Config().from_str(parser_config_string)
parser = nlp.add_pipe("parser", config=model_config)
parser.add_label("nsubj")
nlp.begin_training()
nlp.initialize()

with make_tempdir() as d:
nlp.to_disk(d)

@@ -18,7 +18,7 @@ def nlp():
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
nlp.begin_training()
nlp.initialize()
return nlp


@@ -47,7 +47,7 @@ def test_readers():
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
scores = nlp.evaluate(list(dev_corpus(nlp)))

@@ -99,7 +99,7 @@ def test_cat_readers(reader, additional_config):
)
optimizer = T["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
assert example.y.cats
# this shouldn't fail if each training example has at least one positive label

@@ -600,7 +600,7 @@ def _train_tuples(train_data):
train_examples = []
for t in train_data:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
@@ -49,9 +49,9 @@ def init_nlp(config: Config, *, use_gpu: int = -1, silent: bool = True) -> Langu
msg.info(f"Resuming training for: {resume_components}")
nlp.resume_training(sgd=optimizer)
with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
msg.good(f"Initialized pipeline components")
# Verify the config after calling 'begin_training' to ensure labels
# Verify the config after calling 'initialize' to ensure labels
# are properly initialized
verify_config(nlp)
if "pretraining" in config and config["pretraining"]:
@@ -517,18 +517,18 @@ specific data and challenge.
Stacked ensemble of a bag-of-words model and a neural network model. The neural
network has an internal CNN Tok2Vec layer and uses attention.

| Name | Description |
| -------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| -------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -555,12 +555,12 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.

| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -578,13 +578,13 @@ architecture is usually less accurate than the ensemble, but runs faster.
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.

| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
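As a rough illustration of how these parameters are typically supplied (a sketch, not part of the diff; the values are arbitrary examples), a component config can reference the architecture and leave `nO` unset so the component fills it in when `initialize` is called:

```python
from thinc.api import Config
from spacy.lang.en import English

textcat_config = Config().from_str("""
[model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
""")
nlp = English()
textcat = nlp.add_pipe("textcat", config=textcat_config)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.initialize()  # nO is now inferred from the two added labels
```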
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

@@ -629,11 +629,11 @@ into the "real world". This requires 3 main components:
The `EntityLinker` model architecture is a Thinc `Model` with a
[`Linear`](https://thinc.ai/api-layers#linear) output layer.

| Name | Description |
| ----------- | ----------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
| Name | Description |
| ----------- | ----------- |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

### spacy.EmptyKB.v1 {#EmptyKB}
@@ -140,7 +140,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## DependencyParser.begin_training {#begin_training tag="method"}
## DependencyParser.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -151,11 +151,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = parser.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -210,7 +216,7 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
>
> ```python
> parser = nlp.add_pipe("parser")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = parser.update(examples, sgd=optimizer)
> ```

@@ -294,11 +300,10 @@ context, the original parameters are restored.
## DependencyParser.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>
@@ -139,7 +139,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## EntityLinker.begin_training {#begin_training tag="method"}
## EntityLinker.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -150,11 +150,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker", last=True)
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = entity_linker.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -211,7 +217,7 @@ pipe's entity linking model and context encoder. Delegates to
>
> ```python
> entity_linker = nlp.add_pipe("entity_linker")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = entity_linker.update(examples, sgd=optimizer)
> ```
@@ -129,7 +129,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## EntityRecognizer.begin_training {#begin_training tag="method"}
## EntityRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -140,11 +140,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = ner.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -199,7 +205,7 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
>
> ```python
> ner = nlp.add_pipe("ner")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = ner.update(examples, sgd=optimizer)
> ```

@@ -282,11 +288,10 @@ context, the original parameters are restored.
## EntityRecognizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Note that you don't have to call this method if you
provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
provide a **representative data sample** to the [`initialize`](#initialize)
method. In this case, all labels found in the sample will be automatically added
to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>
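The add_label note above also covers the data-driven path: if a representative sample is passed to `initialize`, entity labels are read from the examples and the output dimension is inferred. A small sketch (the text and label are placeholders, and the import path assumes the v3 nightly `spacy.training.Example`):

```python
from spacy.lang.en import English
from spacy.training import Example

nlp = English()
ner = nlp.add_pipe("ner")
# No explicit ner.add_label() call: the "LOC" label is picked up from the
# sample passed to initialize, and the output dimension is inferred.
doc = nlp.make_doc("I like London.")
examples = [Example.from_dict(doc, {"entities": [(7, 13, "LOC")]})]
nlp.initialize(get_examples=lambda: examples)
```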

@@ -201,30 +201,31 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |

## Language.begin_training {#begin_training tag="method"}
## Language.initialize {#initialize tag="method"}

Initialize the pipeline for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a
function that returns an iterable of [`Example`](/api/example) objects. The data
examples can either be the full training data or a representative sample. They
are used to **initialize the models** of trainable pipeline components and are
passed to each component's [`begin_training`](/api/pipe#begin_training) method, if
passed to each component's [`initialize`](/api/pipe#initialize) method, if
available. Initialization includes validating the network,
[inferring missing shapes](/usage/layers-architectures#thinc-shape-inference)
and setting up the label scheme based on the data.

If no `get_examples` function is provided when calling `nlp.begin_training`, the
If no `get_examples` function is provided when calling `nlp.initialize`, the
pipeline components will be initialized with generic data. In this case, it is
crucial that the output dimension of each component has already been defined
either in the [config](/usage/training#config), or by calling
[`pipe.add_label`](/api/pipe#add_label) for each possible output label (e.g. for
the tagger or textcat).
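
For example, a minimal sketch of initializing without a `get_examples` callback,
where the labels are added up front (the `textcat` component and the two labels
below are purely illustrative, not taken from the surrounding docs):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
# No get_examples callback will be passed, so the output dimension has to be
# known up front: add every possible label before initializing.
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)
optimizer = nlp.initialize()
```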

<Infobox variant="warning" title="Changed in v3.0">
<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

The `Language.update` method now takes a **function** that is called with no
arguments and returns a sequence of [`Example`](/api/example) objects instead of
tuples of `Doc` and `GoldParse` objects.
This method was previously called `begin_training`. It now also takes a
**function** that is called with no arguments and returns a sequence of
[`Example`](/api/example) objects instead of tuples of `Doc` and `GoldParse`
objects.

</Infobox>

@@ -232,7 +233,7 @@ tuples of `Doc` and `GoldParse` objects.
>
> ```python
> get_examples = lambda: examples
> optimizer = nlp.begin_training(get_examples)
> optimizer = nlp.initialize(get_examples)
> ```

| Name | Description |

@@ -636,13 +637,13 @@ list, will be disabled. Under the hood, this method calls into
>
> ```python
> with nlp.select_pipes(disable=["tagger", "parser"]):
>     nlp.begin_training()
>     nlp.initialize()
>
> with nlp.select_pipes(enable="ner"):
>     nlp.begin_training()
>     nlp.initialize()
>
> disabled = nlp.select_pipes(disable=["tagger", "parser"])
> nlp.begin_training()
> nlp.initialize()
> disabled.restore()
> ```

@@ -117,7 +117,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## Morphologizer.begin_training {#begin_training tag="method"}
## Morphologizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -133,7 +133,7 @@ setting up the label scheme based on the data.
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> nlp.pipeline.append(morphologizer)
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = morphologizer.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -189,7 +189,7 @@ Delegates to [`predict`](/api/morphologizer#predict) and
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = morphologizer.update(examples, sgd=optimizer)
> ```

@@ -259,12 +259,11 @@ context, the original parameters are restored.
Add a new label to the pipe. If the `Morphologizer` should set annotations for
both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
Raises an error if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). Note that you don't have to
call this method if you provide a **representative data sample** to the
[`begin_training`](#begin_training) method. In this case, all labels found in
the sample will be automatically added to the model, and the output dimension
will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.
already been fully [initialized](#initialize). Note that you don't have to call
this method if you provide a **representative data sample** to the
[`initialize`](#initialize) method. In this case, all labels found in the sample
will be automatically added to the model, and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.

> #### Example
>

@@ -98,7 +98,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## Pipe.begin_training {#begin_training tag="method"}
## Pipe.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -109,11 +109,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = pipe.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -180,7 +186,7 @@ predictions and gold-standard annotations, and update the component's model.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = pipe.update(examples, sgd=optimizer)
> ```

@@ -296,9 +302,9 @@ context, the original parameters are restored.
Add a new label to the pipe, to be predicted by the model. The actual
implementation depends on the specific component, but in general `add_label`
shouldn't be called if the output dimension is already set, or if the model has
already been fully [initialized](#begin_training). If these conditions are
violated, the function will raise an Error. The exception to this rule is when
the component is [resizable](#is_resizable), in which case
already been fully [initialized](#initialize). If these conditions are violated,
the function will raise an Error. The exception to this rule is when the
component is [resizable](#is_resizable), in which case
[`set_output`](#set_output) should be called to ensure that the model is
properly resized.

@@ -314,9 +320,9 @@ This method needs to be overwritten with your own custom `add_label` method.
| **RETURNS** | 0 if the label is already present, otherwise 1. ~~int~~ |

Note that in general, you don't have to call `pipe.add_label` if you provide a
representative data sample to the [`begin_training`](#begin_training) method. In
this case, all labels found in the sample will be automatically added to the
model, and the output dimension will be
representative data sample to the [`initialize`](#initialize) method. In this
case, all labels found in the sample will be automatically added to the model,
and the output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
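
Since the base `add_label` is meant to be overwritten, a rough sketch of a
custom implementation that follows the contract above might look like the
snippet below. The `cfg["labels"]` storage is an assumption made for the sake of
illustration, not a fixed spaCy convention:

```python
def add_label(self, label: str) -> int:
    # Return 0 if the label is already present, otherwise register it and return 1.
    if label in self.labels:
        return 0
    self.cfg["labels"].append(label)
    self.vocab.strings.add(label)
    return 1
```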

## Pipe.is_resizable {#is_resizable tag="method"}

@@ -114,7 +114,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## SentenceRecognizer.begin_training {#begin_training tag="method"}
## SentenceRecognizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -129,7 +129,7 @@ setting up the label scheme based on the data.
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = senter.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -185,7 +185,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = senter.update(examples, sgd=optimizer)
> ```

@@ -112,7 +112,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## Tagger.begin_training {#begin_training tag="method"}
## Tagger.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -123,11 +123,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tagger.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -183,7 +189,7 @@ Delegates to [`predict`](/api/tagger#predict) and
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tagger.update(examples, sgd=optimizer)
> ```

@@ -289,12 +295,12 @@ context, the original parameters are restored.
## Tagger.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.

> #### Example
>

@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## TextCategorizer.begin_training {#begin_training tag="method"}
## TextCategorizer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -136,11 +136,17 @@ validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme based on the data.

<Infobox variant="warning" title="Changed in v3.0" id="begin_training">

This method was previously called `begin_training`.

</Infobox>

> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = textcat.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -196,7 +202,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = textcat.update(examples, sgd=optimizer)
> ```

@@ -303,12 +309,12 @@ Modify the pipe's model, to use the given parameter values.
## TextCategorizer.add_label {#add_label tag="method"}

Add a new label to the pipe. Raises an error if the output dimension is already
set, or if the model has already been fully [initialized](#begin_training). Note
set, or if the model has already been fully [initialized](#initialize). Note
that you don't have to call this method if you provide a **representative data
sample** to the [`begin_training`](#begin_training) method. In this case, all
labels found in the sample will be automatically added to the model, and the
output dimension will be
[inferred](/usage/layers-architectures#thinc-shape-inference) automatically.
sample** to the [`initialize`](#initialize) method. In this case, all labels
found in the sample will be automatically added to the model, and the output
dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
automatically.

> #### Example
>

@@ -123,7 +123,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## Tok2Vec.begin_training {#begin_training tag="method"}
## Tok2Vec.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -138,7 +138,7 @@ setting up the label scheme based on the data.
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = tok2vec.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -193,7 +193,7 @@ Delegates to [`predict`](/api/tok2vec#predict).
>
> ```python
> tok2vec = nlp.add_pipe("tok2vec")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = tok2vec.update(examples, sgd=optimizer)
> ```

@@ -158,7 +158,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |

## Transformer.begin_training {#begin_training tag="method"}
## Transformer.initialize {#initialize tag="method"}

Initialize the component for training and return an
[`Optimizer`](https://thinc.ai/docs/api-optimizers). `get_examples` should be a

@@ -173,7 +173,7 @@ setting up the label scheme based on the data.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> optimizer = trf.initialize(lambda: [], pipeline=nlp.pipeline)
> ```

| Name | Description |

@@ -241,7 +241,7 @@ and call the optimizer, while the others simply increment the gradients.
>
> ```python
> trf = nlp.add_pipe("transformer")
> optimizer = nlp.begin_training()
> optimizer = nlp.initialize()
> losses = trf.update(examples, sgd=optimizer)
> ```

@@ -460,8 +460,8 @@ The built-in [pipeline components](/usage/processing-pipelines) in spaCy ensure
that their internal models are **always initialized** with appropriate sample
data. In this case, `X` is typically a ~~List[Doc]~~, while `Y` is typically a
~~List[Array1d]~~ or ~~List[Array2d]~~, depending on the specific task. This
functionality is triggered when
[`nlp.begin_training`](/api/language#begin_training) is called.
functionality is triggered when [`nlp.initialize`](/api/language#initialize) is
called.
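
To make the shape inference concrete, here is a small standalone sketch that
uses Thinc directly. The layer and the sample arrays are illustrative and not
taken from the surrounding docs:

```python
import numpy
from thinc.api import Linear

X = numpy.zeros((10, 4), dtype="f")  # sample input: 10 rows, 4 features
Y = numpy.zeros((10, 2), dtype="f")  # sample output: 2 classes
model = Linear()                     # no dimensions specified up front
model.initialize(X=X, Y=Y)           # nI=4 and nO=2 are inferred from the data
```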

### Dropout and normalization in Thinc {#thinc-dropout-norm}

@@ -491,7 +491,7 @@ with Model.define_operators({">>": chain}):

<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `begin_training`, correlation with add_label
- Initialization life-cycle with `initialize`, correlation with add_label
Example: relation extraction component (implemented as project template)
Avoid duplication with usage/processing-pipelines#trainable-components ?
-->

@@ -1126,12 +1126,12 @@ For some use cases, it makes sense to also overwrite additional methods to
customize how the model is updated from examples, how it's initialized, how the
loss is calculated and to add evaluation scores to the training output.

| Name | Description |
| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`begin_training`](/api/pipe#begin_training) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
| Name | Description |
| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and [`Pipe.create_optimizer`](/api/pipe#create_optimizer) if no optimizer is provided. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |
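
As a rough sketch of how these methods fit together, a custom component could
override `initialize` along the following lines. The component name, the exact
signature and the `create_optimizer` fallback are assumptions for illustration,
not a verbatim spaCy API reference:

```python
from spacy.pipeline import Pipe

class MyComponent(Pipe):  # hypothetical trainable component
    def initialize(self, get_examples, *, pipeline=None, sgd=None):
        # Let the model infer its shapes from a small sample of examples.
        sample_docs = [eg.predicted for eg in list(get_examples())[:10]]
        self.model.initialize(X=sample_docs)
        return sgd or self.create_optimizer()
```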

<Infobox title="Custom trainable components and models" emoji="📖">

@@ -1045,8 +1045,8 @@ of being dropped.

> - [`nlp`](/api/language): The `nlp` object with the pipeline components and
>   their models.
> - [`nlp.begin_training`](/api/language#begin_training): Start the training and
>   return an optimizer to update the component model weights.
> - [`nlp.initialize`](/api/language#initialize): Start the training and return
>   an optimizer to update the component model weights.
> - [`Optimizer`](https://thinc.ai/docs/api-optimizers): Function that holds
>   state between updates.
> - [`nlp.update`](/api/language#update): Update component models with examples.

@@ -1057,7 +1057,7 @@ of being dropped.

```python
### Example training loop
optimizer = nlp.begin_training()
optimizer = nlp.initialize()
for itn in range(100):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:

@@ -526,10 +526,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
  [`Pipe.update`](/api/pipe#update) methods now all take batches of
  [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
  raw text and a dictionary of annotations.
  [`Language.begin_training`](/api/language#begin_training) and
  [`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
  returns a sequence of `Example` objects to initialize the model instead of a
  list of tuples.
  [`Language.initialize`](/api/language#initialize) and
  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
  sequence of `Example` objects to initialize the model instead of a list of
  tuples.
- The `begin_training` methods have been renamed to `initialize`.
- [`Matcher.add`](/api/matcher#add) and
  [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
  patterns as the second argument (instead of a variable number of arguments).

@@ -555,6 +556,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| Removed | Replacement |
| ---------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |

@@ -936,7 +938,7 @@ TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London.", {"entities": [(7, 13, "LOC")]}),
]
nlp.begin_training()
nlp.initialize()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for batch in minibatch(TRAIN_DATA):

@@ -946,17 +948,18 @@ for i in range(20):
        nlp.update(examples)
```

[`Language.begin_training`](/api/language#begin_training) and
[`Pipe.begin_training`](/api/pipe#begin_training) now take a function that
returns a sequence of `Example` objects to initialize the model instead of a
list of tuples. The data examples are used to **initialize the models** of
`Language.begin_training` and `Pipe.begin_training` have been renamed to
[`Language.initialize`](/api/language#initialize) and
[`Pipe.initialize`](/api/pipe#initialize), and the methods now take a function
that returns a sequence of `Example` objects to initialize the model instead of
a list of tuples. The data examples are used to **initialize the models** of
trainable pipeline components, which includes validating the network,
[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
setting up the label scheme.

```diff
- nlp.begin_training(examples)
+ nlp.begin_training(lambda: examples)
- nlp.initialize(examples)
+ nlp.initialize(lambda: examples)
```
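
For instance, if the training data is still stored as `(text, annotations)`
tuples, one way to adapt it (a sketch assuming a blank English pipeline with an
entity recognizer; the exact setup will vary) is to build the `Example` objects
once and pass a function that returns them:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("ner")
train_data = [("I like London.", {"entities": [(7, 13, "LOC")]})]
examples = [Example.from_dict(nlp.make_doc(text), annots)
            for text, annots in train_data]
optimizer = nlp.initialize(lambda: examples)  # labels are set up from the sample
```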

#### Packaging trained pipelines {#migrating-training-packaging}