diff --git a/pyproject.toml b/pyproject.toml
index d23730b00..e610e603e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a30,<8.0.0a40",
+    "thinc>=8.0.0a31,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index 9b108de8d..db6eae2ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index fc33abedb..d94e60b27 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a30,<8.0.0a40
+    thinc>=8.0.0a31,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a30,<8.0.0a40
+    thinc>=8.0.0a31,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 5bd4e008f..f4d93071e 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -84,11 +84,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
         _print_model(model, print_settings)

     # STEP 1: Initializing the model and printing again
+    X = _get_docs()
     Y = _get_output(model.ops.xp)
-    _set_output_dim(nO=Y.shape[-1], model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=_get_docs(), Y=Y)
+        model.initialize(X=X, Y=Y)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -135,15 +135,6 @@ def _get_output(xp):
     return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")


-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)
-
-
 def _print_model(model, print_settings):
     layers = print_settings.get("layers", "")
     parameters = print_settings.get("parameters", False)
diff --git a/spacy/errors.py b/spacy/errors.py
index bad3e83e4..3c120598e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -247,8 +247,8 @@ class Errors:
             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
     E065 = ("Only one of the vector table's width and shape can be specified. "
             "Got width {width} and shape {shape}.")
-    E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
-            "an entity) without a preceding 'B' (beginning of an entity). "
+    E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
+            "without a preceding 'B' (beginning of an entity). "
             "Tag sequence:\n{tags}")
     E068 = ("Invalid BILUO tag: '{tag}'.")
     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
@@ -320,10 +320,6 @@ class Errors:
             "So instead of pickling the span, pickle the Doc it belongs to or "
             "use Span.as_doc to convert the span to a standalone Doc object.")
     E115 = ("All subtokens must have associated heads.")
-    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
Add " - "labels before training begins. This functionality was available " - "in previous versions, but had significant bugs that led to poor " - "performance.") E117 = ("The newly split tokens must match the text of the original token. " "New orths: {new}. Old text: {old}.") E118 = ("The custom extension attribute '{attr}' is not registered on the " @@ -378,8 +374,9 @@ class Errors: "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E143 = ("Labels for component '{name}' not initialized. Did you forget to " - "call add_label()?") + E143 = ("Labels for component '{name}' not initialized. This can be fixed " + "by calling add_label, or by providing a representative batch of " + "examples to the component's begin_training method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -483,6 +480,16 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E921 = ("The method 'set_output' can only be called on components that have " + "a Model with a 'resize_output' attribute. Otherwise, the output " + "layer can not be dynamically changed.") + E922 = ("Component '{name}' has been initialized with an output dimension of " + "{nO} - cannot add any more labels.") + E923 = ("It looks like there is no proper sample data to initialize the " + "Model of component '{name}'. " + "This is likely a bug in spaCy, so feel free to open an issue.") + E924 = ("The '{name}' component does not seem to be initialized properly. " + "This is likely a bug in spaCy, so feel free to open an issue.") E925 = ("Invalid color values for displaCy visualizer: expected dictionary " "mapping label names to colors but got: {obj}") E926 = ("It looks like you're trying to modify nlp.{attr} directly. This " diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py index 08751cfd4..ceb5e16b8 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/gold/iob_utils.py @@ -195,13 +195,15 @@ def tags_to_entities(tags): continue elif tag.startswith("I"): if start is None: - raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) continue if tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i elif tag.startswith("L"): + if start is None: + raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1])) entities.append((tag[2:], start, i)) start = None else: diff --git a/spacy/language.py b/spacy/language.py index cd84e30a4..6631250aa 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -656,7 +656,7 @@ class Language: return resolved[factory_name] def create_pipe_from_source( - self, source_name: str, source: "Language", *, name: str, + self, source_name: str, source: "Language", *, name: str ) -> Tuple[Callable[[Doc], Doc], str]: """Create a pipeline component by copying it from an existing model. 
@@ -1155,21 +1155,24 @@ class Language:

         DOCS: https://nightly.spacy.io/api/language#begin_training
         """
-        # TODO: throw warning when get_gold_tuples is provided instead of get_examples
         if get_examples is None:
-            get_examples = lambda: []
-        else:  # Populate vocab
-            if not hasattr(get_examples, "__call__"):
-                err = Errors.E930.format(name="Language", obj=type(get_examples))
+            util.logger.debug(
+                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+            )
+            doc = Doc(self.vocab, words=["x", "y", "z"])
+            get_examples = lambda: [Example.from_dict(doc, {})]
+        # Populate vocab
+        if not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name="Language", obj=type(get_examples))
+            raise ValueError(err)
+        for example in get_examples():
+            if not isinstance(example, Example):
+                err = Errors.E978.format(
+                    name="Language.begin_training", types=type(example)
+                )
                 raise ValueError(err)
-            for example in get_examples():
-                if not isinstance(example, Example):
-                    err = Errors.E978.format(
-                        name="Language.begin_training", types=type(example)
-                    )
-                    raise ValueError(err)
-                for word in [t.text for t in example.reference]:
-                    _ = self.vocab[word]  # noqa: F841
+            for word in [t.text for t in example.reference]:
+                _ = self.vocab[word]  # noqa: F841
         if device >= 0:  # TODO: do we need this here?
             require_gpu(device)
             if self.vocab.vectors.data.shape[1] >= 1:
@@ -1187,7 +1190,7 @@ class Language:
         return self._optimizer

     def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1,
+        self, *, sgd: Optional[Optimizer] = None, device: int = -1
     ) -> Optimizer:
         """Continue training a pretrained model.

diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py
index 4dbc79f52..2e6b2ffab 100644
--- a/spacy/ml/_iob.py
+++ b/spacy/ml/_iob.py
@@ -62,8 +62,6 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
 def get_num_actions(n_labels: int) -> int:
     # One BEGIN action per label
     # One IN action per label
-    # One LAST action per label
-    # One UNIT action per label
     # One OUT action
     return n_labels * 2 + 1
diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py
index aca58c937..b47e7f349 100644
--- a/spacy/ml/models/simple_ner.py
+++ b/spacy/ml/models/simple_ner.py
@@ -21,7 +21,7 @@ def BiluoTagger(
     A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
     into tags assigned to each token. The first token of a span is given the
     tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
-    within the span are given the tag U-LABEL. Single-token spans are given
+    within the span are given the tag I-LABEL. Single-token spans are given
     the tag U-LABEL. All other tokens are assigned the tag O.
     The BILUO tag scheme generally results in better linear separation between
@@ -86,7 +86,7 @@ def IOBTagger(


 def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
-    if model.get_dim("nO") is None and Y:
+    if model.has_dim("nO") is None and Y:
         model.set_dim("nO", Y[0].shape[1])
     nO = model.get_dim("nO")
     biluo = model.get_ref("biluo")
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index d4f1e6b56..e9564c05f 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -1,3 +1,4 @@
+from itertools import islice
 from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
 from pathlib import Path
 import srsly
@@ -128,7 +129,7 @@ class EntityLinker(Pipe):
         # how many neightbour sentences to take into account
         self.n_sents = cfg.get("n_sents", 0)

-    def require_kb(self) -> None:
+    def _require_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
@@ -140,10 +141,11 @@ class EntityLinker(Pipe):
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.

-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -153,10 +155,19 @@ class EntityLinker(Pipe):

         DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
         """
-        self.require_kb()
+        self._ensure_examples(get_examples)
+        self._require_kb()
         nO = self.kb.entity_vector_length
-        self.set_output(nO)
-        self.model.initialize()
+        doc_sample = []
+        vector_sample = []
+        for example in islice(get_examples(), 10):
+            doc_sample.append(example.x)
+            vector_sample.append(self.model.ops.alloc1f(nO))
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(
+            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
+        )
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -184,7 +195,7 @@ class EntityLinker(Pipe):

         DOCS: https://nightly.spacy.io/api/entitylinker#update
         """
-        self.require_kb()
+        self._require_kb()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -296,7 +307,7 @@ class EntityLinker(Pipe):

         DOCS: https://nightly.spacy.io/api/entitylinker#predict
         """
-        self.require_kb()
+        self._require_kb()
         entity_count = 0
         final_kb_ids = []
         if not docs:
@@ -405,7 +416,7 @@ class EntityLinker(Pipe):
                     token.ent_kb_id_ = kb_id

     def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
         """Serialize the pipe to disk.

@@ -422,7 +433,7 @@ class EntityLinker(Pipe):
         util.to_disk(path, serialize, exclude)

     def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityLinker":
         """Load the pipe from disk. Modifies the object in place and returns it.
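With the `EntityLinker` changes above, initialization needs a representative batch instead of an empty callback, and the output dimension is inferred from the knowledge base. A caller-side sketch, assuming an `nlp` pipeline whose `entity_linker` was configured with a populated KB (as in the tests further down); `TRAIN_DATA` is a hypothetical list of `(text, annotations)` pairs:

```python
from spacy.gold import Example

train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]
# The linker sizes its output layer from kb.entity_vector_length during init.
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
```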
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index bcb555b90..b54824ce9 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -2,6 +2,7 @@ from typing import Optional
 import srsly
 from thinc.api import SequenceCategoricalCrossentropy, Model, Config
+from itertools import islice

 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
@@ -112,6 +113,7 @@ class Morphologizer(Tagger):
             raise ValueError(Errors.E187)
         if label in self.labels:
             return 0
+        self._allow_extra_label()
         # normalize label
         norm_label = self.vocab.morphology.normalize_features(label)
         # extract separate POS and morph tags
@@ -128,10 +130,11 @@ class Morphologizer(Tagger):
         return 1

     def begin_training(self, get_examples, *, pipeline=None, sgd=None):
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.

-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -141,9 +144,8 @@ class Morphologizer(Tagger):

         DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
         """
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="Morphologizer", obj=type(get_examples))
-            raise ValueError(err)
+        self._ensure_examples(get_examples)
+        # First, fetch all labels from the data
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
@@ -157,8 +159,25 @@ class Morphologizer(Tagger):
                 if norm_label not in self.cfg["labels_morph"]:
                     self.cfg["labels_morph"][norm_label] = morph
                     self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
-        self.set_output(len(self.labels))
-        self.model.initialize()
+        if len(self.labels) <= 1:
+            raise ValueError(Errors.E143.format(name=self.name))
+        doc_sample = []
+        label_sample = []
+        for example in islice(get_examples(), 10):
+            gold_array = []
+            for i, token in enumerate(example.reference):
+                pos = token.pos_
+                morph = token.morph_
+                morph_dict = Morphology.feats_to_dict(morph)
+                if pos:
+                    morph_dict[self.POS_FEAT] = pos
+                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
+                gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
+            doc_sample.append(example.x)
+            label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(X=doc_sample, Y=label_sample)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 3ef85c821..f07d24efc 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -90,7 +90,7 @@ class MultitaskObjective(Tagger):
                 label = self.make_label(token)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
-        self.model.initialize()
+        self.model.initialize()  # TODO: fix initialization by defining X and Y
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -178,7 +178,7 @@ class ClozeMultitask(Pipe):
         pass

     def begin_training(self, get_examples, pipeline=None, sgd=None):
-        self.model.initialize()
+        self.model.initialize()  # TODO: fix initialization by defining X and Y
         X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
         self.model.output_layer.begin_training(X)
         if sgd is None:
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 2518ebad3..a6a2ff45c 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -160,6 +160,20 @@ cdef class Pipe:
         """
         raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))

+    def _require_labels(self) -> None:
+        """Raise an error if the component's model has no labels defined."""
+        if not self.labels or list(self.labels) == [""]:
+            raise ValueError(Errors.E143.format(name=self.name))
+
+    def _allow_extra_label(self) -> None:
+        """Raise an error if the component can not add any more labels."""
+        if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
+            if not self.is_resizable():
+                raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
+
     def create_optimizer(self):
         """Create an optimizer for the pipeline component.

@@ -171,9 +185,12 @@ cdef class Pipe:

     def begin_training(self, get_examples, *, pipeline=None, sgd=None):
         """Initialize the pipe for training, using data examples if available.
+        This method needs to be implemented by each Pipe component,
+        ensuring the internal model (if available) is initialized properly
+        using the provided sample of Example objects.

-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -183,16 +200,24 @@ cdef class Pipe:

         DOCS: https://nightly.spacy.io/api/pipe#begin_training
         """
-        self.model.initialize()
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
+        raise NotImplementedError(Errors.E931.format(method="begin_training", name=self.name))
+
+    def _ensure_examples(self, get_examples):
+        if get_examples is None or not hasattr(get_examples, "__call__"):
+            err = Errors.E930.format(name=self.name, obj=type(get_examples))
+            raise ValueError(err)
+        if not get_examples():
+            err = Errors.E930.format(name=self.name, obj=get_examples())
+            raise ValueError(err)
+
+    def is_resizable(self):
+        return hasattr(self, "model") and "resize_output" in self.model.attrs

     def set_output(self, nO):
-        if self.model.has_dim("nO") is not False:
-            self.model.set_dim("nO", nO)
-        if self.model.has_ref("output_layer"):
-            self.model.get_ref("output_layer").set_dim("nO", nO)
+        if self.is_resizable():
+            self.model.attrs["resize_output"](self.model, nO)
+        else:
+            raise NotImplementedError(Errors.E921)

     def use_params(self, params):
         """Modify the pipe's model, to use the given parameter values. At the
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index b78be44f8..64e01a071 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -1,4 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
+from itertools import islice
+
 import srsly
 from thinc.api import Model, SequenceCategoricalCrossentropy, Config

@@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger):
         return float(loss), d_scores

     def begin_training(self, get_examples, *, pipeline=None, sgd=None):
-        """Initialize the pipe for training, using data examples if available.
+ """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger): DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training """ - self.set_output(len(self.labels)) - self.model.initialize() + self._ensure_examples(get_examples) + doc_sample = [] + label_sample = [] + assert self.labels, Errors.E924.format(name=self.name) + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("SENT_START") + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c55edb067..a4a3248d2 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -3,6 +3,7 @@ from thinc.types import Floats2d from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model from thinc.api import Optimizer, Config from thinc.util import to_numpy +from itertools import islice from ..errors import Errors from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob @@ -168,18 +169,29 @@ class SimpleNER(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ): + self._ensure_examples(get_examples) all_labels = set() - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="SimpleNER", obj=type(get_examples)) - raise ValueError(err) for example in get_examples(): all_labels.update(_get_labels(example)) for label in sorted(all_labels): - self.add_label(label) - labels = self.labels - n_actions = self.model.attrs["get_num_actions"](len(labels)) - self.model.set_dim("nO", n_actions) - self.model.initialize() + if label != "": + self.add_label(label) + doc_sample = [] + label_sample = [] + self._require_labels() + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned_ner() + if not self.is_biluo: + gold_tags = biluo_to_iob(gold_tags) + gold_array = [ + [1.0 if tag == gold_tag else 0.0 for tag in self.get_tag_names()] + for gold_tag in gold_tags + ] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.loss_func = SequenceCategoricalCrossentropy( @@ -206,6 +218,6 @@ def _has_ner(example: Example) -> bool: def _get_labels(example: Example) -> Set[str]: labels = set() for ner_tag in example.get_aligned("ENT_TYPE", as_string=True): - if ner_tag != "O" and ner_tag != "-": + if ner_tag != "O" and ner_tag != "-" and 
ner_tag != "": labels.add(ner_tag) return labels diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 2b760c878..a0f06aa1c 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -5,6 +5,7 @@ import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.types import Floats2d import warnings +from itertools import islice from ..tokens.doc cimport Doc from ..morphology cimport Morphology @@ -258,10 +259,11 @@ class Tagger(Pipe): return float(loss), d_scores def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects.. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -271,32 +273,24 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Tagger", obj=type(get_examples)) - raise ValueError(err) - tags = set() + self._ensure_examples(get_examples) doc_sample = [] + label_sample = [] + tags = set() for example in get_examples(): for token in example.y: - tags.add(token.tag_) - if len(doc_sample) < 10: - doc_sample.append(example.x) - if not doc_sample: - doc_sample.append(Doc(self.vocab, words=["hello"])) + if token.tag_: + tags.add(token.tag_) for tag in sorted(tags): self.add_label(tag) - if len(self.labels) == 0: - err = Errors.E1006.format(name="Tagger") - raise ValueError(err) - self.set_output(len(self.labels)) - if doc_sample: - label_sample = [ - self.model.ops.alloc2f(len(doc), len(self.labels)) - for doc in doc_sample - ] - self.model.initialize(X=doc_sample, Y=label_sample) - else: - self.model.initialize() + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("TAG", as_string=True) + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd @@ -313,6 +307,7 @@ class Tagger(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 + self._allow_extra_label() self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index d6efb4348..e1edfb5b2 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,4 @@ +from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d @@ -128,11 +129,6 @@ class TextCategorizer(Pipe): """ return tuple(self.cfg.setdefault("labels", [])) - def require_labels(self) -> None: - """Raise an error if the component's model has no labels defined.""" - if not self.labels: - raise 
-
     @labels.setter
     def labels(self, value: Iterable[str]) -> None:
         self.cfg["labels"] = tuple(value)
@@ -311,17 +307,7 @@ class TextCategorizer(Pipe):
             raise ValueError(Errors.E187)
         if label in self.labels:
             return 0
-        if self.model.has_dim("nO"):
-            # This functionality was available previously, but was broken.
-            # The problem is that we resize the last layer, but the last layer
-            # is actually just an ensemble. We're not resizing the child layers
-            # - a huge problem.
-            raise ValueError(Errors.E116)
-            # smaller = self.model._layers[-1]
-            # larger = Linear(len(self.labels)+1, smaller.nI)
-            # copy_array(larger.W[:smaller.nO], smaller.W)
-            # copy_array(larger.b[:smaller.nO], smaller.b)
-            # self.model._layers[-1] = larger
+        self._allow_extra_label()
         self.labels = tuple(list(self.labels) + [label])
         return 1

@@ -332,10 +318,11 @@ class TextCategorizer(Pipe):
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.

-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -345,22 +332,19 @@ class TextCategorizer(Pipe):

         DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
         """
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
-            raise ValueError(err)
+        self._ensure_examples(get_examples)
         subbatch = []  # Select a subbatch of examples to initialize the model
-        for example in get_examples():
+        for example in islice(get_examples(), 10):
             if len(subbatch) < 2:
                 subbatch.append(example)
             for cat in example.y.cats:
                 self.add_label(cat)
-        self.require_labels()
-        docs = [eg.reference for eg in subbatch]
-        if not docs:  # need at least one doc
-            docs = [Doc(self.vocab, words=["hello"])]
-        truths, _ = self._examples_to_truth(subbatch)
-        self.set_output(len(self.labels))
-        self.model.initialize(X=docs, Y=truths)
+        doc_sample = [eg.reference for eg in subbatch]
+        label_sample, _ = self._examples_to_truth(subbatch)
+        self._require_labels()
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        assert len(label_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(X=doc_sample, Y=label_sample)
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 5657d687d..b5f84f324 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -1,5 +1,6 @@
 from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
 from thinc.api import Model, set_dropout_rate, Optimizer, Config
+from itertools import islice

 from .pipe import Pipe
 from ..gold import Example, validate_examples
@@ -209,10 +210,11 @@ class Tok2Vec(Pipe):
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ):
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.

-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -222,8 +224,12 @@ class Tok2Vec(Pipe):

         DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
         """
-        docs = [Doc(self.vocab, words=["hello"])]
-        self.model.initialize(X=docs)
+        self._ensure_examples(get_examples)
+        doc_sample = []
+        for example in islice(get_examples(), 10):
+            doc_sample.append(example.x)
+        assert doc_sample, Errors.E923.format(name=self.name)
+        self.model.initialize(X=doc_sample)

     def add_label(self, label):
         raise NotImplementedError
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 5a6b491e0..2361cfd7f 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -244,7 +244,7 @@ cdef class Parser(Pipe):
                                         int nr_class, int batch_size) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
-            assert self.moves.n_moves > 0
+            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
         is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
         cdef int i, guess
         cdef Transition action
@@ -378,7 +378,7 @@ cdef class Parser(Pipe):
         cdef int i

         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0
+        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))

@@ -406,9 +406,7 @@ cdef class Parser(Pipe):
         self.model.attrs["resize_output"](self.model, nO)

     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
-        if not hasattr(get_examples, "__call__"):
-            err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
-            raise ValueError(err)
+        self._ensure_examples(get_examples)
         self.cfg.update(kwargs)
         lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
         if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@@ -430,9 +428,6 @@ cdef class Parser(Pipe):
         if sgd is None:
             sgd = self.create_optimizer()
         doc_sample = []
-        for example in islice(get_examples(), 10):
-            doc_sample.append(example.predicted)
-
         if pipeline is not None:
             for name, component in pipeline:
                 if component is self:
@@ -441,10 +436,11 @@ cdef class Parser(Pipe):
                     doc_sample = list(component.pipe(doc_sample, batch_size=8))
                 else:
                     doc_sample = [component(doc) for doc in doc_sample]
-        if doc_sample:
-            self.model.initialize(doc_sample)
-        else:
-            self.model.initialize()
+        if not doc_sample:
+            for example in islice(get_examples(), 10):
+                doc_sample.append(example.predicted)
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(doc_sample)
         if pipeline is not None:
             self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
         return sgd
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index d6e345336..2a4e3e499 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -1,5 +1,6 @@
+from spacy.gold import Example
 from spacy.pipeline import EntityRecognizer
-from spacy.tokens import Span
+from spacy.tokens import Span, Doc
 from spacy import registry
 import pytest
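The shared `Pipe._ensure_examples` helper introduced earlier standardizes what every component's `begin_training` accepts. Roughly, in the shape the tests below exercise (a sketch; the `TypeError` for the `None` case comes from `Language.begin_training` iterating the callback's return value):

```python
nlp.begin_training(get_examples=lambda: train_examples)  # OK: callable, non-empty
nlp.begin_training(get_examples=train_examples)          # ValueError (E930): not a callable
nlp.begin_training(get_examples=lambda: [])              # ValueError (E930): empty sample
nlp.begin_training(get_examples=lambda: None)            # TypeError: return value not iterable
```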
@@ -7,6 +8,12 @@ from ..util import get_doc
 from spacy.pipeline.ner import DEFAULT_NER_MODEL


+def _ner_example(ner):
+    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
+    return Example.from_dict(doc, gold)
+
+
 def test_doc_add_entities_set_ents_iob(en_vocab):
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
@@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [])
+    ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)
-    assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))

     doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
     assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
@@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):


 def test_ents_reset(en_vocab):
+    """Ensure that resetting doc.ents does not change anything"""
     text = ["This", "is", "a", "lion"]
     doc = get_doc(en_vocab, text)
     config = {
@@ -41,11 +47,11 @@ def test_ents_reset(en_vocab):
     cfg = {"model": DEFAULT_NER_MODEL}
     model = registry.make_from_config(cfg, validate=True)["model"]
     ner = EntityRecognizer(en_vocab, model, **config)
-    ner.begin_training(lambda: [])
+    ner.begin_training(lambda: [_ner_example(ner)])
     ner(doc)
-    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+    orig_iobs = [t.ent_iob_ for t in doc]
     doc.ents = list(doc.ents)
-    assert [t.ent_iob_ for t in doc] == (["O"] * len(doc))
+    assert [t.ent_iob_ for t in doc] == orig_iobs


 def test_add_overlapping_entities(en_vocab):
diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py
index fce5f679f..b17080f15 100644
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@@ -35,7 +35,7 @@ def test_init_parser(parser):
 def _train_parser(parser):
     fix_random_seed(1)
     parser.add_label("left")
-    parser.begin_training(lambda: [], **parser.cfg)
+    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(5):
@@ -47,16 +47,25 @@ def _train_parser(parser):
     return parser


+def _parser_example(parser):
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+    return Example.from_dict(doc, gold)
+
+
+def _ner_example(ner):
+    doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
+    gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
+    return Example.from_dict(doc, gold)
+
+
 def test_add_label(parser):
     parser = _train_parser(parser)
     parser.add_label("right")
     sgd = Adam(0.001)
     for i in range(100):
         losses = {}
-        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
-        gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
-        example = Example.from_dict(doc, gold)
-        parser.update([example], sgd=sgd, losses=losses)
+        parser.update([_parser_example(parser)], sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)
     assert doc[0].dep_ == "right"
@@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly():
     ner1.add_label("C")
     ner1.add_label("B")
     ner1.add_label("A")
-    ner1.begin_training(lambda: [])
+    ner1.begin_training(lambda: [_ner_example(ner1)])
     ner2 = EntityRecognizer(Vocab(), model, **config)

     # the second model needs to be resized before we can call from_bytes
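The comment above alludes to the new resize-before-deserialize requirement. A sketch of how the rest of that test plausibly continues, using the public `set_output` (which now delegates to the model's `resize_output` attr); the exact assertions here are illustrative:

```python
# ner2 starts with an unsized output layer, so grow it to match ner1 first.
ner2.set_output(len(ner1.labels))
ner2.from_bytes(ner1.to_bytes())
assert ner1.labels == ner2.labels
```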
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 8265a8a45..fa6494eb6 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer):
     pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
     tokens = en_tokenizer(text)
     doc = get_doc(
-        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos,
+        tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
     )
     with doc.retokenize() as retokenizer:
         for np in doc.noun_chunks:
diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py
index 594498b0b..430440576 100644
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@@ -14,6 +14,12 @@ def vocab():
     return Vocab(lex_attr_getters={NORM: lambda s: s})


+def _parser_example(parser):
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+    return Example.from_dict(doc, gold)
+
+
 @pytest.fixture
 def parser(vocab):
     config = {
@@ -28,7 +34,7 @@ def parser(vocab):
     parser.cfg["hidden_width"] = 32
     # parser.add_label('right')
     parser.add_label("left")
-    parser.begin_training(lambda: [], **parser.cfg)
+    parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
     sgd = Adam(0.001)

     for i in range(10):
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 4eaa71272..776d4f451 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp):

 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
+    vector_length = 1

     @registry.misc.register("myLocationsKB.v1")
     def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
         def create_kb(vocab):
-            mykb = KnowledgeBase(vocab, entity_vector_length=1)
+            mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
             # adding entities
             mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
             mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
     el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
-    el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True)
-    el_pipe.begin_training(lambda: [])
-    el_pipe.incl_context = False
-    el_pipe.incl_prior = True
+    entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
+    nlp.begin_training()
+    assert entity_linker.model.get_dim("nO") == vector_length

     # test whether the entity links are preserved by the `as_doc()` function
     text = "She lives in Boston. He lives in Denver."
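As the hunk above shows, `nlp.begin_training()` now sizes the linker from the knowledge base, so tests assert the inferred dimension instead of setting it by hand. The pattern, in short (sketch):

```python
entity_linker = nlp.get_pipe("entity_linker")
# The output dimension is derived from the KB during initialization.
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
```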
@@ -373,6 +373,7 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     nlp.add_pipe("sentencizer")
+    vector_length = 3

     # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
     patterns = [
@@ -393,7 +394,7 @@ def test_overfitting_IO():
         # create artificial KB - assign same prior weight to the two russ cochran's
         # Q2146908 (Russ Cochran): American golfer
         # Q7381115 (Russ Cochran): publisher
-        mykb = KnowledgeBase(vocab, entity_vector_length=3)
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
         mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
         mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
         mykb.add_alias(
@@ -406,14 +407,17 @@ def test_overfitting_IO():
         return create_kb

     # Create the Entity Linker component and add it to the pipeline
-    nlp.add_pipe(
+    entity_linker = nlp.add_pipe(
         "entity_linker",
         config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
         last=True,
     )

     # train the NEL pipe
-    optimizer = nlp.begin_training()
+    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    assert entity_linker.model.get_dim("nO") == vector_length
+    assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
+
     for i in range(50):
         losses = {}
         nlp.update(train_examples, sgd=optimizer, losses=losses)
diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py
index 501c00f84..f52fb5401 100644
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@@ -25,27 +25,61 @@ TRAIN_DATA = [
         },
     ),
     # test combinations of morph+POS
-    ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},),
+    ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}),
 ]


+def test_no_label():
+    nlp = Language()
+    nlp.add_pipe("morphologizer")
+    with pytest.raises(ValueError):
+        nlp.begin_training()
+
+
+def test_implicit_label():
+    nlp = Language()
+    nlp.add_pipe("morphologizer")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.begin_training(get_examples=lambda: train_examples)
+
+
+def test_no_resize():
+    nlp = Language()
+    morphologizer = nlp.add_pipe("morphologizer")
+    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
+    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
+    nlp.begin_training()
+    # this throws an error because the morphologizer can't be resized after initialization
+    with pytest.raises(ValueError):
+        morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
+
+
+def test_begin_training_examples():
+    nlp = Language()
+    morphologizer = nlp.add_pipe("morphologizer")
+    morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # you shouldn't really call this more than once, but for testing it should be fine
+    nlp.begin_training()
+    nlp.begin_training(get_examples=lambda: train_examples)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=train_examples)
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
     nlp = English()
-    morphologizer = nlp.add_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
     train_examples = []
     for inst in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
-        for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
-            if morph and pos:
-                morphologizer.add_label(
-                    morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos
-                )
-            elif pos:
-                morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
-            elif morph:
-                morphologizer.add_label(morph)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.begin_training(get_examples=lambda: train_examples)

     for i in range(50):
         losses = {}
@@ -55,18 +89,8 @@ def test_overfitting_IO():
     # test the trained model
     test_text = "I like blue ham"
     doc = nlp(test_text)
-    gold_morphs = [
-        "Feat=N",
-        "Feat=V",
-        "",
-        "",
-    ]
-    gold_pos_tags = [
-        "NOUN",
-        "VERB",
-        "ADJ",
-        "",
-    ]
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
     assert [t.morph_ for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index b64fa8581..8941eae9a 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -30,6 +30,20 @@ TRAIN_DATA = [
     ),
 ]

+
+def test_begin_training_examples():
+    nlp = Language()
+    senter = nlp.add_pipe("senter")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # you shouldn't really call this more than once, but for testing it should be fine
+    nlp.begin_training()
+    nlp.begin_training(get_examples=lambda: train_examples)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=train_examples)
+

 def test_overfitting_IO():
     # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly
diff --git a/spacy/tests/pipeline/test_simple_ner.py b/spacy/tests/pipeline/test_simple_ner.py
index b012a2cd6..3148eda0a 100644
--- a/spacy/tests/pipeline/test_simple_ner.py
+++ b/spacy/tests/pipeline/test_simple_ner.py
@@ -1,3 +1,4 @@
+import pytest
 from spacy.lang.en import English
 from spacy.gold import Example
 from spacy import util
@@ -5,11 +6,73 @@ from ..util import make_tempdir


 TRAIN_DATA = [
-    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
+    ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}),
     ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
 ]


+def test_no_label():
+    nlp = English()
+    nlp.add_pipe("simple_ner")
+    with pytest.raises(ValueError):
+        nlp.begin_training()
+
+
+def test_implicit_label():
+    nlp = English()
+    ner = nlp.add_pipe("simple_ner")
+    train_examples = []
+    ner.add_label("ORG")
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.begin_training(get_examples=lambda: train_examples)
+
+
+@pytest.mark.skip(reason="Should be fixed")
+def test_untrained():
+    # This shouldn't crash, but it does when the simple_ner produces an invalid sequence like ['L-PERSON', 'L-ORG']
+    nlp = English()
+    ner = nlp.add_pipe("simple_ner")
+    ner.add_label("PERSON")
+    ner.add_label("LOC")
+    ner.add_label("ORG")
+    nlp.begin_training()
+    nlp("Example sentence")
+
+
+def test_resize():
+    nlp = English()
+    ner = nlp.add_pipe("simple_ner")
+    ner.add_label("PERSON")
+    ner.add_label("LOC")
+    nlp.begin_training()
+    assert len(ner.labels) == 2
+    ner.add_label("ORG")
+    nlp.begin_training()
+    assert len(ner.labels) == 3
+
+
+def test_begin_training_examples():
+    nlp = English()
+    ner = nlp.add_pipe("simple_ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    # you shouldn't really call this more than once, but for testing it should be fine
+    nlp.begin_training()
+    nlp.begin_training(get_examples=lambda: train_examples)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: None)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: train_examples[0])
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=lambda: [])
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=train_examples)
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly
     nlp = English()
@@ -17,9 +80,7 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for ent in annotations.get("entities"):
-            ner.add_label(ent[2])
-    optimizer = nlp.begin_training()
+    optimizer = nlp.begin_training(get_examples=lambda: train_examples)

     for i in range(50):
         losses = {}
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 540301eac..89f40c5bf 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -34,6 +34,56 @@ TRAIN_DATA = [
 ]


+def test_no_label():
+    nlp = Language()
+    nlp.add_pipe("tagger")
+    with pytest.raises(ValueError):
+        nlp.begin_training()
+
+
+def test_no_resize():
+    nlp = Language()
+    tagger = nlp.add_pipe("tagger")
+    tagger.add_label("N")
+    tagger.add_label("V")
+    assert tagger.labels == ("N", "V")
+    nlp.begin_training()
+    assert tagger.model.get_dim("nO") == 2
+    # this throws an error because the tagger can't be resized after initialization
+    with pytest.raises(ValueError):
+        tagger.add_label("J")
+
+
+def test_implicit_label():
+    nlp = Language()
+    nlp.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.begin_training(get_examples=lambda: train_examples)
+
+
+def test_begin_training_examples():
+    nlp = Language()
+    tagger = nlp.add_pipe("tagger")
+    train_examples = []
+    for tag in TAGS:
+        tagger.add_label(tag)
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # you shouldn't really call this more than once, but for testing it should be fine
+    nlp.begin_training()
+    nlp.begin_training(get_examples=lambda: train_examples)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: None)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: train_examples[0])
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=lambda: [])
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=train_examples)
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
     nlp = English()
@@ -41,9 +91,8 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
-    for tag in TAGS:
-        tagger.add_label(tag)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    assert tagger.model.get_dim("nO") == len(TAGS)

     for i in range(50):
         losses = {}
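The tagger tests above pin down the new label-resizing rules: labels can be added freely before initialization, but afterwards `_allow_extra_label` raises E922 unless the model is resizable. A sketch against the nightly API:

```python
from spacy.language import Language

nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
nlp.begin_training()       # output layer is now allocated with nO == 2
try:
    tagger.add_label("J")  # E922: output dimension already fixed
except ValueError as err:
    print(err)
```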
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 12ead90cb..59c0fce49 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -80,6 +80,51 @@ def test_label_types():
         textcat.add_label(9)


+def test_no_label():
+    nlp = Language()
+    nlp.add_pipe("textcat")
+    with pytest.raises(ValueError):
+        nlp.begin_training()
+
+
+def test_implicit_label():
+    nlp = Language()
+    textcat = nlp.add_pipe("textcat")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    nlp.begin_training(get_examples=lambda: train_examples)
+
+
+def test_no_resize():
+    nlp = Language()
+    textcat = nlp.add_pipe("textcat")
+    textcat.add_label("POSITIVE")
+    textcat.add_label("NEGATIVE")
+    nlp.begin_training()
+    assert textcat.model.get_dim("nO") == 2
+    # this throws an error because the textcat can't be resized after initialization
+    with pytest.raises(ValueError):
+        textcat.add_label("NEUTRAL")
+
+
+def test_begin_training_examples():
+    nlp = Language()
+    textcat = nlp.add_pipe("textcat")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for label, value in annotations.get("cats").items():
+            textcat.add_label(label)
+    # you shouldn't really call this more than once, but for testing it should be fine
+    nlp.begin_training()
+    nlp.begin_training(get_examples=lambda: train_examples)
+    with pytest.raises(TypeError):
+        nlp.begin_training(get_examples=lambda: None)
+    with pytest.raises(ValueError):
+        nlp.begin_training(get_examples=train_examples)
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
@@ -89,9 +134,8 @@ def test_overfitting_IO():
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
-        for label, value in annotations.get("cats").items():
-            textcat.add_label(label)
-    optimizer = nlp.begin_training()
+    optimizer = nlp.begin_training(get_examples=lambda: train_examples)
+    assert textcat.model.get_dim("nO") == 2

     for i in range(50):
         losses = {}
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index 3882df0a6..dd8f282b8 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -20,7 +20,7 @@ def test_issue2564():
     nlp = Language()
     tagger = nlp.add_pipe("tagger")
     tagger.add_label("A")
-    tagger.begin_training(lambda: [])
+    nlp.begin_training()
     doc = nlp("hello world")
     assert doc.is_tagged
     docs = nlp.pipe(["hello", "world"])
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index fc2a3ed7c..f853b7aa7 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -251,6 +251,12 @@ def test_issue3803():
     assert [t.like_num for t in doc] == [True, True, True, True, True, True]


+def _parser_example(parser):
+    doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
+    gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
+    return Example.from_dict(doc, gold)
+
+
 def test_issue3830_no_subtok():
     """Test that the parser doesn't have subtok label if not learn_tokens"""
     config = {
@@ -264,7 +270,7 @@ def test_issue3830_no_subtok():
     parser = DependencyParser(Vocab(), model, **config)
     parser.add_label("nsubj")
assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -281,7 +287,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index af643aadc..531e48ec3 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - tagger.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return entity_linker diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9f0f4b74a..37e02a5b2 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -89,6 +89,7 @@ def test_init_tok2vec(): tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] nlp.begin_training() + assert tok2vec.model.get_dim("nO") cfg_string = """