mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
Clean up sgd and pipeline -> nlp
This commit is contained in:
parent
612bbf85ab
commit
f171903139
|
@ -1,5 +1,5 @@
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
|
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import srsly
|
import srsly
|
||||||
import random
|
import random
|
||||||
|
@ -144,20 +144,14 @@ class EntityLinker(Pipe):
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
nlp: Optional[Language] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
):
|
||||||
) -> Optimizer:
|
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
|
DOCS: https://nightly.spacy.io/api/entitylinker#initialize
|
||||||
"""
|
"""
|
||||||
|
@ -174,9 +168,6 @@ class EntityLinker(Pipe):
|
||||||
self.model.initialize(
|
self.model.initialize(
|
||||||
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
|
||||||
)
|
)
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
|
|
|
@ -129,16 +129,13 @@ class Morphologizer(Tagger):
|
||||||
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def initialize(self, get_examples, *, pipeline=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
|
DOCS: https://nightly.spacy.io/api/morphologizer#initialize
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -81,7 +81,7 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def initialize(self, get_examples, pipeline=None):
|
def initialize(self, get_examples, nlp=None):
|
||||||
if not hasattr(get_examples, "__call__"):
|
if not hasattr(get_examples, "__call__"):
|
||||||
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
err = Errors.E930.format(name="MultitaskObjective", obj=type(get_examples))
|
||||||
raise ValueError(err)
|
raise ValueError(err)
|
||||||
|
@ -174,7 +174,7 @@ class ClozeMultitask(Pipe):
|
||||||
def set_annotations(self, docs, dep_ids):
|
def set_annotations(self, docs, dep_ids):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def initialize(self, get_examples, pipeline=None):
|
def initialize(self, get_examples, nlp=None):
|
||||||
self.model.initialize() # TODO: fix initialization by defining X and Y
|
self.model.initialize() # TODO: fix initialization by defining X and Y
|
||||||
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
|
||||||
self.model.output_layer.initialize(X)
|
self.model.output_layer.initialize(X)
|
||||||
|
|
|
@ -183,7 +183,7 @@ cdef class Pipe:
|
||||||
"""
|
"""
|
||||||
return util.create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
def initialize(self, get_examples, *, pipeline=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
This method needs to be implemented by each Pipe component,
|
This method needs to be implemented by each Pipe component,
|
||||||
ensuring the internal model (if available) is initialized properly
|
ensuring the internal model (if available) is initialized properly
|
||||||
|
@ -191,14 +191,11 @@ cdef class Pipe:
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
DOCS: https://nightly.spacy.io/api/pipe#initialize
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
|
raise NotImplementedError(Errors.E931.format(method="initialize", name=self.name))
|
||||||
|
|
||||||
def _ensure_examples(self, get_examples):
|
def _ensure_examples(self, get_examples):
|
||||||
if get_examples is None or not hasattr(get_examples, "__call__"):
|
if get_examples is None or not hasattr(get_examples, "__call__"):
|
||||||
|
|
|
@ -58,7 +58,7 @@ class Sentencizer(Pipe):
|
||||||
else:
|
else:
|
||||||
self.punct_chars = set(self.default_punct_chars)
|
self.punct_chars = set(self.default_punct_chars)
|
||||||
|
|
||||||
def initialize(self, get_examples, pipeline=None):
|
def initialize(self, get_examples, nlp=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
|
|
|
@ -124,16 +124,13 @@ class SentenceRecognizer(Tagger):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def initialize(self, get_examples, *, pipeline=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
RETURNS: None
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
|
DOCS: https://nightly.spacy.io/api/sentencerecognizer#initialize
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -256,16 +256,13 @@ class Tagger(Pipe):
|
||||||
raise ValueError("nan value when computing loss")
|
raise ValueError("nan value when computing loss")
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def initialize(self, get_examples, *, pipeline=None):
|
def initialize(self, get_examples, *, nlp=None):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects..
|
returns a representative sample of gold-standard Example objects..
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
DOCS: https://nightly.spacy.io/api/tagger#initialize
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -338,17 +338,14 @@ class TextCategorizer(Pipe):
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None
|
nlp: Optional[Language] = None,
|
||||||
) -> Optimizer:
|
):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
DOCS: https://nightly.spacy.io/api/textcategorizer#initialize
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List
|
||||||
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
|
@ -207,20 +207,14 @@ class Tok2Vec(Pipe):
|
||||||
self,
|
self,
|
||||||
get_examples: Callable[[], Iterable[Example]],
|
get_examples: Callable[[], Iterable[Example]],
|
||||||
*,
|
*,
|
||||||
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
nlp: Optional[Language] = None,
|
||||||
sgd: Optional[Optimizer] = None,
|
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training, using a representative set
|
"""Initialize the pipe for training, using a representative set
|
||||||
of data examples.
|
of data examples.
|
||||||
|
|
||||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||||
returns a representative sample of gold-standard Example objects.
|
returns a representative sample of gold-standard Example objects.
|
||||||
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
|
nlp (Language): The current nlp object the component is part of.
|
||||||
components that this component is part of. Corresponds to
|
|
||||||
nlp.pipeline.
|
|
||||||
sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
|
|
||||||
create_optimizer if it doesn't exist.
|
|
||||||
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
|
DOCS: https://nightly.spacy.io/api/tok2vec#initialize
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -405,7 +405,7 @@ cdef class Parser(Pipe):
|
||||||
def set_output(self, nO):
|
def set_output(self, nO):
|
||||||
self.model.attrs["resize_output"](self.model, nO)
|
self.model.attrs["resize_output"](self.model, nO)
|
||||||
|
|
||||||
def initialize(self, get_examples, pipeline=None, settings=None):
|
def initialize(self, get_examples, nlp=None):
|
||||||
self._ensure_examples(get_examples)
|
self._ensure_examples(get_examples)
|
||||||
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
|
||||||
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
|
||||||
|
@ -425,8 +425,8 @@ cdef class Parser(Pipe):
|
||||||
# make sure we resize so we have an appropriate upper layer
|
# make sure we resize so we have an appropriate upper layer
|
||||||
self._resize()
|
self._resize()
|
||||||
doc_sample = []
|
doc_sample = []
|
||||||
if pipeline is not None:
|
if nlp is not None:
|
||||||
for name, component in pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if component is self:
|
if component is self:
|
||||||
break
|
break
|
||||||
if hasattr(component, "pipe"):
|
if hasattr(component, "pipe"):
|
||||||
|
@ -438,8 +438,8 @@ cdef class Parser(Pipe):
|
||||||
doc_sample.append(example.predicted)
|
doc_sample.append(example.predicted)
|
||||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||||
self.model.initialize(doc_sample)
|
self.model.initialize(doc_sample)
|
||||||
if pipeline is not None:
|
if nlp is not None:
|
||||||
self.init_multitask_objectives(get_examples, pipeline)
|
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
serializers = {
|
serializers = {
|
||||||
|
|
|
@ -107,7 +107,7 @@ def validate_init_settings(
|
||||||
*,
|
*,
|
||||||
section: Optional[str] = None,
|
section: Optional[str] = None,
|
||||||
name: str = "",
|
name: str = "",
|
||||||
exclude: Iterable[str] = ("get_examples", "nlp", "pipeline"),
|
exclude: Iterable[str] = ("get_examples", "nlp"),
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Validate initialization settings against the expected arguments in
|
"""Validate initialization settings against the expected arguments in
|
||||||
the method signature. Will parse values if possible (e.g. int to string)
|
the method signature. Will parse values if possible (e.g. int to string)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user