Clean up sgd and pipeline -> nlp

Ines Montani 2020-09-29 12:20:26 +02:00
parent 612bbf85ab
commit f171903139
11 changed files with 28 additions and 58 deletions
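In short: across the trainable components, initialize stops taking a pipeline list of (name, component) tuples and an optional sgd optimizer, and stops returning an optimizer; it now receives the nlp object instead. Schematically (paraphrased from the diffs below, not literal source):

    # before:
    #     def initialize(self, get_examples, *, pipeline=None, sgd=None) -> Optimizer: ...
    # after:
    #     def initialize(self, get_examples, *, nlp=None): ...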


@@ -1,5 +1,5 @@
from itertools import islice
-from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
+from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
from pathlib import Path
import srsly
import random
@@ -144,20 +144,14 @@ class EntityLinker(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-sgd: Optional[Optimizer] = None,
-) -> Optimizer:
+nlp: Optional[Language] = None,
+):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-create_optimizer if it doesn't exist.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
"""
@@ -174,9 +174,6 @@ class EntityLinker(Pipe):
self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
-if sgd is None:
-sgd = self.create_optimizer()
-return sgd
def update(
self,

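With sgd removed from EntityLinker.initialize above, the component no longer creates or returns an optimizer; the caller does. A minimal sketch of the new flow, assuming the v3 nightly training API in which nlp.initialize passes the nlp object to each component and returns the optimizer. The textcat component and toy examples are purely illustrative (EntityLinker itself also needs a knowledge base, which makes it awkward as a toy example):

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    train_examples = [
        Example.from_dict(nlp.make_doc("very good"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
        Example.from_dict(nlp.make_doc("very bad"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ]
    # nlp.initialize calls each component's initialize(get_examples, nlp=nlp)
    # and returns the optimizer that the components used to create themselves.
    optimizer = nlp.initialize(lambda: train_examples)
    losses = nlp.update(train_examples, sgd=optimizer, losses={})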

@@ -129,16 +129,13 @@ class Morphologizer(Tagger):
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
"""


@@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
def set_annotations(self, docs, dep_ids):
pass
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
raise ValueError(err)
@@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
def set_annotations(self, docs, dep_ids):
pass
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.initialize(X)

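The TODO in the ClozeMultitask hunk notes that model.initialize() is still called without defining X and Y. For context, a tiny Thinc sketch of shape-giving initialization with sample data; the layer and dimensions are illustrative, not this component's actual model:

    import numpy
    from thinc.api import Linear

    model = Linear()                       # no dimensions set yet
    X = numpy.zeros((5, 300), dtype="f")   # sample inputs
    Y = numpy.zeros((5, 64), dtype="f")    # sample outputs
    model.initialize(X=X, Y=Y)             # infers nI=300 and nO=64 from the samples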

@@ -183,7 +183,7 @@ cdef class Pipe:
"""
return util.create_default_optimizer()
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
@@ -191,14 +191,11 @@ cdef class Pipe:
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/pipe#initialize
"""
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
def _ensure_examples(self, get_examples):
if get_examples is None or not hasattr(get_examples, "__call__"):

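The base Pipe.initialize is now a stub that raises E931, and the error message correctly names initialize rather than add_label. Components override it with the new keyword. A minimal sketch of a custom component following the new signature; the class, factory name and prints are made up for illustration:

    from spacy.language import Language

    class LengthLogger:
        def __init__(self, nlp, name):
            self.name = name

        def __call__(self, doc):
            print(self.name, len(doc))
            return doc

        def initialize(self, get_examples, *, nlp=None):
            # nlp replaces the old pipeline argument: the component can inspect
            # its neighbours via nlp.pipeline or nlp.pipe_names if needed.
            if nlp is not None:
                print("running alongside:", nlp.pipe_names)

    @Language.factory("length_logger")
    def make_length_logger(nlp, name):
        return LengthLogger(nlp, name)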

@@ -58,7 +58,7 @@ class Sentencizer(Pipe):
else:
self.punct_chars = set(self.default_punct_chars)
-def initialize(self, get_examples, pipeline=None):
+def initialize(self, get_examples, nlp=None):
pass
def __call__(self, doc):


@@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS: None
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
"""


@@ -256,16 +256,13 @@ class Tagger(Pipe):
raise ValueError("nan value when computing loss")
return float(loss), d_scores
-def initialize(self, get_examples, *, pipeline=None):
+def initialize(self, get_examples, *, nlp=None):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/tagger#initialize
"""


@@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
-) -> Optimizer:
+nlp: Optional[Language] = None,
+):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
"""

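The TextCategorizer change follows the same pattern. Initializing a single component directly now also takes the nlp keyword, which is what nlp.initialize passes in for each component; a self-contained sketch with illustrative labels and examples:

    import spacy
    from spacy.training import Example

    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POS")
    textcat.add_label("NEG")
    examples = [
        Example.from_dict(nlp.make_doc("great"), {"cats": {"POS": 1.0, "NEG": 0.0}}),
        Example.from_dict(nlp.make_doc("awful"), {"cats": {"POS": 0.0, "NEG": 1.0}}),
    ]
    # Per-component counterpart of the whole-pipeline sketch further up.
    textcat.initialize(lambda: examples, nlp=nlp)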

@@ -1,4 +1,4 @@
-from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
+from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
@@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
self,
get_examples: Callable[[], Iterable[Example]],
*,
-pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
-sgd: Optional[Optimizer] = None,
+nlp: Optional[Language] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
-pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
-components that this component is part of. Corresponds to
-nlp.pipeline.
-sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
-create_optimizer if it doesn't exist.
-RETURNS (thinc.api.Optimizer): The optimizer.
+nlp (Language): The current nlp object the component is part of.
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
"""


@@ -405,7 +405,7 @@ cdef class Parser(Pipe):
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
-def initialize(self, get_examples, pipeline=None, settings=None):
+def initialize(self, get_examples, nlp=None):
self._ensure_examples(get_examples)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@@ -425,8 +425,8 @@ cdef class Parser(Pipe):
# make sure we resize so we have an appropriate upper layer
self._resize()
doc_sample = []
-if pipeline is not None:
-for name, component in pipeline:
+if nlp is not None:
+for name, component in nlp.pipeline:
if component is self:
break
if hasattr(component, "pipe"):
@@ -438,8 +438,8 @@ cdef class Parser(Pipe):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample)
-if pipeline is not None:
-self.init_multitask_objectives(get_examples, pipeline)
+if nlp is not None:
+self.init_multitask_objectives(get_examples, nlp.pipeline)
def to_disk(self, path, exclude=tuple()):
serializers = {

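In the parser hunks, the (name, component) tuples now come from nlp.pipeline, and the sample docs are run through every component that precedes the parser. The same pattern in isolation, as a rough standalone helper; the function name is made up and this restates the idea rather than the parser's literal code:

    def docs_through_preceding_components(nlp, component, examples):
        """Run each Example's predicted doc through every pipe before component."""
        docs = [eg.predicted for eg in examples]
        for name, proc in nlp.pipeline:
            if proc is component:
                break
            if hasattr(proc, "pipe"):
                docs = list(proc.pipe(docs))
            else:
                docs = [proc(doc) for doc in docs]
        return docs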

@@ -107,7 +107,7 @@ def validate_init_settings(
*,
section: Optional[str] = None,
name: str = "",
exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
exclude: Iterable[str] = ("get_examples", "nlp"),
) -> Dict[str, Any]:
"""Validate initialization settings against the expected arguments in
the method signature. Will parse values if possible (e.g. int to string)
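Finally, the exclude default in validate_init_settings drops "pipeline" now that no initialize method accepts it; get_examples and nlp stay excluded because spaCy injects them itself rather than reading them from config. A hedged usage sketch, assuming the function lives in spacy.schemas and takes the target function plus a settings dict as its first two arguments (anything beyond what the diff shows is an assumption):

    from spacy.schemas import validate_init_settings

    def initialize(get_examples, *, nlp=None, labels=None):
        ...

    # Settings that correspond to real keyword arguments are validated and
    # coerced; "get_examples" and "nlp" are skipped per the exclude default.
    settings = {"labels": ["A", "B"]}
    validated = validate_init_settings(initialize, settings, name="my_component")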