Clean up sgd and pipeline -> nlp

Ines Montani 2020-09-29 12:20:26 +02:00
parent 612bbf85ab
commit f171903139
11 changed files with 28 additions and 58 deletions
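In short: across the trainable components, initialize stops taking a pipeline list of (name, component) tuples and an optional sgd optimizer, and stops returning an optimizer; it now receives the nlp object instead. Schematically (paraphrased from the diffs below, not literal source):

    # before:
    #     def initialize(self, get_examples, *, pipeline=None, sgd=None) -> Optimizer: ...
    # after:
    #     def initialize(self, get_examples, *, nlp=None): ...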


@@ -1,5 +1,5 @@
from itertools import islice
-from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
+from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
from pathlib import Path
import srsly
import random
@@ -144,20 +144,14 @@ class EntityLinker(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-sgd: Optional[Optimizer] = None,
-) -> Optimizer:
+nlp: Optional[Language] = None,
+):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-create_optimizer if it doesn't exist.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
@@ -174,9 +174,6 @@ class EntityLinker(Pipe):
self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
-if sgd is None:
-sgd = self.create_optimizer()
-return sgd
def update(
self,

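With sgd removed from EntityLinker.initialize above, the component no longer creates or returns an optimizer; the caller does. A minimal sketch of the new flow, assuming the v3 nightly training API in which nlp.initialize passes the nlp object to each component and returns the optimizer. The textcat component and toy examples are purely illustrative (EntityLinker itself also needs a knowledge base, which makes it awkward as a toy example):

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    train_examples = [
        Example.from_dict(nlp.make_doc("very good"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
        Example.from_dict(nlp.make_doc("very bad"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ]
    # nlp.initialize calls each component's initialize(get_examples, nlp=nlp)
    # and returns the optimizer that the components used to create themselves.
    optimizer = nlp.initialize(lambda: train_examples)
    losses = nlp.update(train_examples, sgd=optimizer, losses={})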

@@ -129,16 +129,13 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""


@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
@@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.initialize(X)

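The TODO in the ClozeMultitask hunk notes that model.initialize() is still called without defining X and Y. For context, a tiny Thinc sketch of shape-giving initialization with sample data; the layer and dimensions are illustrative, not this component's actual model:

    import numpy
    from thinc.api import Linear

    model = Linear()                       # no dimensions set yet
    X = numpy.zeros((5, 300), dtype="f")   # sample inputs
    Y = numpy.zeros((5, 64), dtype="f")    # sample outputs
    model.initialize(X=X, Y=Y)             # infers nI=300 and nO=64 from the samples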

@@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
@@ -191,14 +191,11 @@ cdef class Pipe:
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"):

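The base Pipe.initialize is now a stub that raises E931, and the error message correctly names initialize rather than add_label. Components override it with the new keyword. A minimal sketch of a custom component following the new signature; the class, factory name and prints are made up for illustration:

    from spacy.language import Language

    class LengthLogger:
        def __init__(self, nlp, name):
            self.name = name

        def __call__(self, doc):
            print(self.name, len(doc))
            return doc

        def initialize(self, get_examples, *, nlp=None):
            # nlp replaces the old pipeline argument: the component can inspect
            # its neighbours via nlp.pipeline or nlp.pipe_names if needed.
            if nlp is not None:
                print("running alongside:", nlp.pipe_names)

    @Language.factory("length_logger")
    def make_length_logger(nlp, name):
        return LengthLogger(nlp, name)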

@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
pass
def __call__(self, doc):


@@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS: None
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""


@@ -256,16 +256,13 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""


@@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
-) -> Optimizer:
+nlp: Optional[Language] = None,
+):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""

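The TextCategorizer change follows the same pattern. Initializing a single component directly now also takes the nlp keyword, which is what nlp.initialize passes in for each component; a self-contained sketch with illustrative labels and examples:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POS")
    textcat.add_label("NEG")
    examples = [
        Example.from_dict(nlp.make_doc("great"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
        Example.from_dict(nlp.make_doc("awful"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
    ]
    # Per-component counterpart of the whole-pipeline sketch further up.
    textcat.initialize(lambda: examples, nlp=nlp)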

@@ -1,4 +1,4 @@
-from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
+from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
@@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-sgd: Optional[Optimizer] = None,
+nlp: Optional[Language] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-create_optimizer if it doesn't exist.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""


@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
-def initialize(self, get_examples, pipeline=None, settings=None):
+def initialize(self, get_examples, nlp=None):
self._ensure_examples(get_examples)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@@ -425,8 +425,8 @@ cdef class Parser(Pipe):
# make sure we resize so we have an appropriate upper layer
self._resize()
doc_sample = []
-if pipeline is not None:
-for name, component in pipeline:
+if nlp is not None:
+for name, component in nlp.pipeline:
if component is self:
break
if hasattr(component, "pipe"):
@@ -438,8 +438,8 @@ cdef class Parser(Pipe):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample)
-if pipeline is not None:
-self.init_multitask_objectives(get_examples, pipeline)
+if nlp is not None:
+self.init_multitask_objectives(get_examples, nlp.pipeline)
def to_disk(self, path, exclude=tuple()):
serializers = {

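In the parser hunks, the (name, component) tuples now come from nlp.pipeline, and the sample docs are run through every component that precedes the parser. The same pattern in isolation, as a rough standalone helper; the function name is made up and this restates the idea rather than the parser's literal code:

    def docs_through_preceding_components(nlp, component, examples):
        """Run each Example's predicted doc through every pipe before component."""
        docs = [eg.predicted for eg in examples]
        for name, proc in nlp.pipeline:
            if proc is component:
                break
            if hasattr(proc, "pipe"):
                docs = list(proc.pipe(docs))
            else:
                docs = [proc(doc) for doc in docs]
        return docs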

@@ -107,7 +107,7 @@ def validate_init_settings(
*,
section: Optional[str] = None,
name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string)
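Finally, the exclude default in validate_init_settings drops "pipeline" now that no initialize method accepts it; get_examples and nlp stay excluded because spaCy injects them itself rather than reading them from config. A hedged usage sketch, assuming the function lives in spacy.schemas and takes the target function plus a settings dict as its first two arguments (anything beyond what the diff shows is an assumption):

    from spacy.schemas import validate_init_settings

    def initialize(get_examples, *, nlp=None, labels=None):
        ...

    # Settings that correspond to real keyword arguments are validated and
    # coerced; "get_examples" and "nlp" are skipped per the exclude default.
    settings = {"labels": ["A", "B"]}
    validated = validate_init_settings(initialize, settings, name="my_component")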