diff --git a/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py index 66d96ff68..41b6a70da 100644 --- a/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py +++ b/extra/example_data/textcat_example_data/textcatjsonl_to_trainjson.py @@ -1,7 +1,7 @@ from pathlib import Path import plac import spacy -from spacy.gold import docs_to_json +from spacy.training import docs_to_json import srsly import sys diff --git a/pyproject.toml b/pyproject.toml index d23730b00..e610e603e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.0.0a30,<8.0.0a40", + "thinc>=8.0.0a31,<8.0.0a40", "blis>=0.4.0,<0.5.0", "pytokenizations", "pathy" diff --git a/requirements.txt b/requirements.txt index 9b108de8d..db6eae2ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.0.0a30,<8.0.0a40 +thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 diff --git a/setup.cfg b/setup.cfg index fc33abedb..d94e60b27 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,13 +34,13 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.0.0a30,<8.0.0a40 + thinc>=8.0.0a31,<8.0.0a40 install_requires = # Our libraries murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.0.0a30,<8.0.0a40 + thinc>=8.0.0a31,<8.0.0a40 blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 diff --git a/setup.py b/setup.py index d448a262c..4a4b99f22 100755 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ Options.docstrings = True PACKAGES = find_packages() MOD_NAMES = [ - "spacy.gold.example", + "spacy.training.example", "spacy.parts_of_speech", "spacy.strings", "spacy.lexeme", @@ -48,7 +48,7 @@ MOD_NAMES = [ 
"spacy.pipeline._parser_internals.stateclass", "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", - "spacy.gold.gold_io", + "spacy.training.gold_io", "spacy.tokens.doc", "spacy.tokens.span", "spacy.tokens.token", diff --git a/spacy/about.py b/spacy/about.py index 7d0e85a17..c6176ad36 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a14" +__version__ = "3.0.0a16" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 0ecb5ad8f..ee83ec75e 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -13,7 +13,7 @@ from thinc.config import Config, ConfigValidationError from configparser import InterpolationError from ..schemas import ProjectConfigSchema, validate -from ..util import import_file, run_command, make_tempdir +from ..util import import_file, run_command, make_tempdir, registry if TYPE_CHECKING: from pathy import Pathy # noqa: F401 @@ -54,6 +54,8 @@ app.add_typer(init_cli) def setup_cli() -> None: + # Make sure the entry-point for CLI runs, so that they get imported. 
+ registry.cli.get_all() # Ensure that the help messages always display the correct prompt command = get_command(app) command(prog_name=COMMAND) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index ade5a3ad4..ad89b9976 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,9 +7,9 @@ import re import sys from ._util import app, Arg, Opt -from ..gold import docs_to_json +from ..training import docs_to_json from ..tokens import DocBin -from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs +from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs # Converters are matched by file extension except for ner/iob, which are diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 75a81e6f5..d52f30b82 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -8,7 +8,7 @@ import typer from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli, get_sourced_components -from ..gold import Corpus, Example +from ..training import Corpus, Example from ..pipeline._parser_internals import nonproj from ..language import Language from .. 
import util diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 5bd4e008f..f4d93071e 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -84,11 +84,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None _print_model(model, print_settings) # STEP 1: Initializing the model and printing again + X = _get_docs() Y = _get_output(model.ops.xp) - _set_output_dim(nO=Y.shape[-1], model=model) # The output vector might differ from the official type of the output layer with data_validation(False): - model.initialize(X=_get_docs(), Y=Y) + model.initialize(X=X, Y=Y) if print_settings.get("print_after_init"): msg.divider(f"STEP 1 - after initialization") _print_model(model, print_settings) @@ -135,15 +135,6 @@ def _get_output(xp): return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32") -def _set_output_dim(model, nO): - # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx - if model.has_dim("nO") is None: - model.set_dim("nO", nO) - if model.has_ref("output_layer"): - if model.get_ref("output_layer").has_dim("nO") is None: - model.get_ref("output_layer").set_dim("nO", nO) - - def _print_model(model, print_settings): layers = print_settings.get("layers", "") parameters = print_settings.get("parameters", False) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index c5cbab09a..f9954d9ad 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -5,7 +5,7 @@ import re import srsly from thinc.api import require_gpu, fix_random_seed -from ..gold import Corpus +from ..training import Corpus from ..tokens import Doc from ._util import app, Arg, Opt from ..scorer import Scorer diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6be47fa39..0bc493e56 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,4 +1,5 @@ from typing import Optional, Dict, Any, Tuple, Union, Callable, List +from timeit import default_timer as timer 
import srsly import tqdm from pathlib import Path @@ -15,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error from ._util import import_code, get_sourced_components from ..language import Language from .. import util -from ..gold.example import Example +from ..training.example import Example from ..errors import Errors @@ -286,9 +287,12 @@ def train_while_improving( ] raw_batches = util.minibatch(raw_examples, size=8) + words_seen = 0 + start_time = timer() for step, (epoch, batch) in enumerate(train_data): dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): + nlp.update( subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude ) @@ -317,6 +321,7 @@ def train_while_improving( else: score, other_scores = (None, None) is_best_checkpoint = None + words_seen += sum(len(eg) for eg in batch) info = { "epoch": epoch, "step": step, @@ -324,6 +329,8 @@ def train_while_improving( "other_scores": other_scores, "losses": losses, "checkpoints": results, + "seconds": int(timer() - start_time), + "words": words_seen, } yield batch, info, is_best_checkpoint if is_best_checkpoint is not None: diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 9507f0f0a..7cd71453f 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -52,7 +52,7 @@ path = ${paths.train} # data is passed in sentence-by-sentence via some prior preprocessing. gold_preproc = false # Limitations on training document length -max_length = 2000 +max_length = 0 # Limitation on number of training examples limit = 0 @@ -64,7 +64,7 @@ path = ${paths.dev} # data is passed in sentence-by-sentence via some prior preprocessing. 
gold_preproc = false # Limitations on training document length -max_length = 2000 +max_length = 0 # Limitation on number of training examples limit = 0 @@ -88,9 +88,4 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 1e-8 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.001 +learn_rate = 0.001 diff --git a/spacy/errors.py b/spacy/errors.py index bad3e83e4..7164598b6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -66,7 +66,7 @@ class Warnings: "in problems with the vocab further on in the pipeline.") W030 = ("Some entities could not be aligned in the text \"{text}\" with " "entities \"{entities}\". Use " - "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" + "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`" " to check the alignment. Misaligned entities ('-') will be " "ignored during training.") W033 = ("Training a new {model} using a model with no lexeme normalization " @@ -247,8 +247,8 @@ class Errors: "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") E065 = ("Only one of the vector table's width and shape can be specified. " "Got width {width} and shape {shape}.") - E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " - "an entity) without a preceding 'B' (beginning of an entity). " + E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} " + "without a preceding 'B' (beginning of an entity). " "Tag sequence:\n{tags}") E068 = ("Invalid BILUO tag: '{tag}'.") E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " @@ -320,10 +320,6 @@ class Errors: "So instead of pickling the span, pickle the Doc it belongs to or " "use Span.as_doc to convert the span to a standalone Doc object.") E115 = ("All subtokens must have associated heads.") - E116 = ("Cannot currently add labels to pretrained text classifier. Add " - "labels before training begins. 
This functionality was available " - "in previous versions, but had significant bugs that led to poor " - "performance.") E117 = ("The newly split tokens must match the text of the original token. " "New orths: {new}. Old text: {old}.") E118 = ("The custom extension attribute '{attr}' is not registered on the " @@ -378,8 +374,9 @@ class Errors: "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " "provided {found}.") - E143 = ("Labels for component '{name}' not initialized. Did you forget to " - "call add_label()?") + E143 = ("Labels for component '{name}' not initialized. This can be fixed " + "by calling add_label, or by providing a representative batch of " + "examples to the component's begin_training method.") E145 = ("Error reading `{param}` from input file.") E146 = ("Could not access `{path}`.") E147 = ("Unexpected error in the {method} functionality of the " @@ -483,6 +480,16 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E921 = ("The method 'set_output' can only be called on components that have " + "a Model with a 'resize_output' attribute. Otherwise, the output " + "layer can not be dynamically changed.") + E922 = ("Component '{name}' has been initialized with an output dimension of " + "{nO} - cannot add any more labels.") + E923 = ("It looks like there is no proper sample data to initialize the " + "Model of component '{name}'. " + "This is likely a bug in spaCy, so feel free to open an issue.") + E924 = ("The '{name}' component does not seem to be initialized properly. " + "This is likely a bug in spaCy, so feel free to open an issue.") E925 = ("Invalid color values for displaCy visualizer: expected dictionary " "mapping label names to colors but got: {obj}") E926 = ("It looks like you're trying to modify nlp.{attr} directly. 
This " diff --git a/spacy/language.py b/spacy/language.py index cd84e30a4..777b0c24b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,7 +17,7 @@ from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis -from .gold import Example, validate_examples +from .training import Example, validate_examples from .scorer import Scorer from .util import create_default_optimizer, registry, SimpleFrozenList from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER @@ -656,7 +656,7 @@ class Language: return resolved[factory_name] def create_pipe_from_source( - self, source_name: str, source: "Language", *, name: str, + self, source_name: str, source: "Language", *, name: str ) -> Tuple[Callable[[Doc], Doc], str]: """Create a pipeline component by copying it from an existing model. @@ -1155,21 +1155,24 @@ class Language: DOCS: https://nightly.spacy.io/api/language#begin_training """ - # TODO: throw warning when get_gold_tuples is provided instead of get_examples if get_examples is None: - get_examples = lambda: [] - else: # Populate vocab - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Language", obj=type(get_examples)) + util.logger.debug( + "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples" + ) + doc = Doc(self.vocab, words=["x", "y", "z"]) + get_examples = lambda: [Example.from_dict(doc, {})] + # Populate vocab + if not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name="Language", obj=type(get_examples)) + raise ValueError(err) + for example in get_examples(): + if not isinstance(example, Example): + err = Errors.E978.format( + name="Language.begin_training", types=type(example) + ) raise ValueError(err) - for example in get_examples(): - if not isinstance(example, Example): - err = Errors.E978.format( - 
name="Language.begin_training", types=type(example) - ) - raise ValueError(err) - for word in [t.text for t in example.reference]: - _ = self.vocab[word] # noqa: F841 + for word in [t.text for t in example.reference]: + _ = self.vocab[word] # noqa: F841 if device >= 0: # TODO: do we need this here? require_gpu(device) if self.vocab.vectors.data.shape[1] >= 1: @@ -1187,7 +1190,7 @@ class Language: return self._optimizer def resume_training( - self, *, sgd: Optional[Optimizer] = None, device: int = -1, + self, *, sgd: Optional[Optimizer] = None, device: int = -1 ) -> Optimizer: """Continue training a pretrained model. diff --git a/spacy/ml/_iob.py b/spacy/ml/_iob.py index 4dbc79f52..2e6b2ffab 100644 --- a/spacy/ml/_iob.py +++ b/spacy/ml/_iob.py @@ -62,8 +62,6 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): def get_num_actions(n_labels: int) -> int: # One BEGIN action per label # One IN action per label - # One LAST action per label - # One UNIT action per label # One OUT action return n_labels * 2 + 1 diff --git a/spacy/ml/models/simple_ner.py b/spacy/ml/models/simple_ner.py index aca58c937..b47e7f349 100644 --- a/spacy/ml/models/simple_ner.py +++ b/spacy/ml/models/simple_ner.py @@ -21,7 +21,7 @@ def BiluoTagger( A BILUO tag sequence encodes a sequence of non-overlapping labelled spans into tags assigned to each token. The first token of a span is given the tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens - within the span are given the tag U-LABEL. Single-token spans are given + within the span are given the tag I-LABEL. Single-token spans are given the tag U-LABEL. All other tokens are assigned the tag O. 
The BILUO tag scheme generally results in better linear separation between @@ -86,7 +86,7 @@ def IOBTagger( def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: - if model.get_dim("nO") is None and Y: + if model.has_dim("nO") is None and Y: model.set_dim("nO", Y[0].shape[1]) nO = model.get_dim("nO") biluo = model.get_ref("biluo") diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index 7db8aae0f..bb0bf35b8 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t from ...strings cimport hash_string from ...structs cimport TokenC from ...tokens.doc cimport Doc, set_children_from_heads -from ...gold.example cimport Example +from ...training.example cimport Example from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx index 2570ccdee..0351bcaf7 100644 --- a/spacy/pipeline/_parser_internals/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -5,7 +5,7 @@ from cymem.cymem cimport Pool from ...typedefs cimport weight_t, attr_t from ...lexeme cimport Lexeme from ...attrs cimport IS_SPACE -from ...gold.example cimport Example +from ...training.example cimport Example from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/_parser_internals/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd index ba4c33814..458f1d5f9 100644 --- a/spacy/pipeline/_parser_internals/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from ...typedefs cimport attr_t, weight_t from ...structs cimport TokenC from ...strings cimport StringStore -from ...gold.example cimport Example +from 
...training.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 406112681..f64fcbc54 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -4,7 +4,7 @@ from pathlib import Path from .pipe import Pipe from ..errors import Errors -from ..gold import validate_examples +from ..training import validate_examples from ..language import Language from ..matcher import Matcher from ..scorer import Scorer diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index eee4ed535..edd791e40 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -9,7 +9,7 @@ from .functions import merge_subtokens from ..language import Language from ._parser_internals import nonproj from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index d4f1e6b56..1debadd82 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -1,3 +1,4 @@ +from itertools import islice from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple from pathlib import Path import srsly @@ -11,7 +12,7 @@ from ..tokens import Doc from .pipe import Pipe, deserialize_config from ..language import Language from ..vocab import Vocab -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..errors import Errors, Warnings from ..util import SimpleFrozenList from .. import util @@ -128,7 +129,7 @@ class EntityLinker(Pipe): # how many neightbour sentences to take into account self.n_sents = cfg.get("n_sents", 0) - def require_kb(self) -> None: + def _require_kb(self) -> None: # Raise an error if the knowledge base is not initialized. 
if len(self.kb) == 0: raise ValueError(Errors.E139.format(name=self.name)) @@ -140,10 +141,11 @@ class EntityLinker(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ) -> Optimizer: - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -153,10 +155,19 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#begin_training """ - self.require_kb() + self._ensure_examples(get_examples) + self._require_kb() nO = self.kb.entity_vector_length - self.set_output(nO) - self.model.initialize() + doc_sample = [] + vector_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + vector_sample.append(self.model.ops.alloc1f(nO)) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(vector_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize( + X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32") + ) if sgd is None: sgd = self.create_optimizer() return sgd @@ -184,7 +195,7 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#update """ - self.require_kb() + self._require_kb() if losses is None: losses = {} losses.setdefault(self.name, 0.0) @@ -296,7 +307,7 @@ class EntityLinker(Pipe): DOCS: https://nightly.spacy.io/api/entitylinker#predict """ - self.require_kb() + self._require_kb() entity_count = 0 final_kb_ids = [] if not docs: @@ -405,7 +416,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def 
to_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: """Serialize the pipe to disk. @@ -422,7 +433,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(), + self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4f4ff230e..24bbb067f 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples DEFAULT_ENT_ID_SEP = "||" diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index 3f3e387b7..0fd3482c4 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups from ..scorer import Scorer from ..tokens import Doc, Token from ..vocab import Vocab -from ..gold import validate_examples +from ..training import validate_examples from .. import util diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index bcb555b90..57bdb28d7 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -2,6 +2,7 @@ from typing import Optional import srsly from thinc.api import SequenceCategoricalCrossentropy, Model, Config +from itertools import islice from ..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -15,7 +16,7 @@ from .pipe import deserialize_config from .tagger import Tagger from .. 
import util from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ @@ -112,6 +113,7 @@ class Morphologizer(Tagger): raise ValueError(Errors.E187) if label in self.labels: return 0 + self._allow_extra_label() # normalize label norm_label = self.vocab.morphology.normalize_features(label) # extract separate POS and morph tags @@ -128,10 +130,11 @@ class Morphologizer(Tagger): return 1 def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -141,9 +144,8 @@ class Morphologizer(Tagger): DOCS: https://nightly.spacy.io/api/morphologizer#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Morphologizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) + # First, fetch all labels from the data for example in get_examples(): for i, token in enumerate(example.reference): pos = token.pos_ @@ -157,8 +159,25 @@ class Morphologizer(Tagger): if norm_label not in self.cfg["labels_morph"]: self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_pos"][norm_label] = POS_IDS[pos] - self.set_output(len(self.labels)) - self.model.initialize() + if len(self.labels) <= 1: + raise ValueError(Errors.E143.format(name=self.name)) + doc_sample = [] + label_sample = [] + for example in islice(get_examples(), 10): + gold_array = [] + for i, token in enumerate(example.reference): + pos = token.pos_ + morph = token.morph_ + morph_dict = Morphology.feats_to_dict(morph) + if pos: + morph_dict[self.POS_FEAT] = pos + norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)] + gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels]) + doc_sample.append(example.x) + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 3ef85c821..2f8940124 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger -from ..gold import validate_examples +from ..training import validate_examples from ..language import Language from ._parser_internals import nonproj from ..attrs 
import POS, ID @@ -90,7 +90,7 @@ class MultitaskObjective(Tagger): label = self.make_label(token) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) - self.model.initialize() + self.model.initialize() # TODO: fix initialization by defining X and Y if sgd is None: sgd = self.create_optimizer() return sgd @@ -178,7 +178,7 @@ class ClozeMultitask(Pipe): pass def begin_training(self, get_examples, pipeline=None, sgd=None): - self.model.initialize() + self.model.initialize() # TODO: fix initialization by defining X and Y X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) if sgd is None: diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index d9f33ccb4..2fa5c6392 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples default_model_config = """ diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 2518ebad3..324c8e19c 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model from ..tokens.doc cimport Doc -from ..gold import validate_examples +from ..training import validate_examples from ..errors import Errors from .. 
import util @@ -160,6 +160,20 @@ cdef class Pipe: """ raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) + + def _require_labels(self) -> None: + """Raise an error if the component's model has no labels defined.""" + if not self.labels or list(self.labels) == [""]: + raise ValueError(Errors.E143.format(name=self.name)) + + + def _allow_extra_label(self) -> None: + """Raise an error if the component can not add any more labels.""" + if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels): + if not self.is_resizable(): + raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))) + + def create_optimizer(self): """Create an optimizer for the pipeline component. @@ -171,9 +185,12 @@ cdef class Pipe: def begin_training(self, get_examples, *, pipeline=None, sgd=None): """Initialize the pipe for training, using data examples if available. + This method needs to be implemented by each Pipe component, + ensuring the internal model (if available) is initialized properly + using the provided sample of Example objects. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -183,16 +200,24 @@ cdef class Pipe: DOCS: https://nightly.spacy.io/api/pipe#begin_training """ - self.model.initialize() - if sgd is None: - sgd = self.create_optimizer() - return sgd + raise NotImplementedError(Errors.E931.format(method="begin_training", name=self.name)) + + def _ensure_examples(self, get_examples): + if get_examples is None or not hasattr(get_examples, "__call__"): + err = Errors.E930.format(name=self.name, obj=type(get_examples)) + raise ValueError(err) + if not get_examples(): + err = Errors.E930.format(name=self.name, obj=get_examples()) + raise ValueError(err) + + def is_resizable(self): + return hasattr(self, "model") and "resize_output" in self.model.attrs def set_output(self, nO): - if self.model.has_dim("nO") is not False: - self.model.set_dim("nO", nO) - if self.model.has_ref("output_layer"): - self.model.get_ref("output_layer").set_dim("nO", nO) + if self.is_resizable(): + self.model.attrs["resize_output"](self.model, nO) + else: + raise NotImplementedError(Errors.E921) def use_params(self, params): """Modify the pipe's model, to use the given parameter values. At the diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index aaf08d594..5700c2b98 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from ..language import Language from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples from .. 
import util diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index b78be44f8..00664131b 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -1,4 +1,6 @@ # cython: infer_types=True, profile=True, binding=True +from itertools import islice + import srsly from thinc.api import Model, SequenceCategoricalCrossentropy, Config @@ -9,7 +11,7 @@ from .tagger import Tagger from ..language import Language from ..errors import Errors from ..scorer import Scorer -from ..gold import validate_examples +from ..training import validate_examples from .. import util @@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger): return float(loss), d_scores def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger): DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training """ - self.set_output(len(self.labels)) - self.model.initialize() + self._ensure_examples(get_examples) + doc_sample = [] + label_sample = [] + assert self.labels, Errors.E924.format(name=self.name) + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("SENT_START") + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index c55edb067..951d89931 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -3,10 +3,11 @@ from thinc.types import Floats2d from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model from thinc.api import Optimizer, Config from thinc.util import to_numpy +from itertools import islice from ..errors import Errors -from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob -from ..gold import validate_examples +from ..training import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob +from ..training import validate_examples from ..tokens import Doc from ..language import Language from ..vocab import Vocab @@ -168,18 +169,29 @@ class SimpleNER(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ): + self._ensure_examples(get_examples) all_labels = set() - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="SimpleNER", obj=type(get_examples)) - raise ValueError(err) for example in get_examples(): 
all_labels.update(_get_labels(example)) for label in sorted(all_labels): - self.add_label(label) - labels = self.labels - n_actions = self.model.attrs["get_num_actions"](len(labels)) - self.model.set_dim("nO", n_actions) - self.model.initialize() + if label != "": + self.add_label(label) + doc_sample = [] + label_sample = [] + self._require_labels() + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned_ner() + if not self.is_biluo: + gold_tags = biluo_to_iob(gold_tags) + gold_array = [ + [1.0 if tag == gold_tag else 0.0 for tag in self.get_tag_names()] + for gold_tag in gold_tags + ] + label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.loss_func = SequenceCategoricalCrossentropy( @@ -206,6 +218,6 @@ def _has_ner(example: Example) -> bool: def _get_labels(example: Example) -> Set[str]: labels = set() for ner_tag in example.get_aligned("ENT_TYPE", as_string=True): - if ner_tag != "O" and ner_tag != "-": + if ner_tag != "O" and ner_tag != "-" and ner_tag != "": labels.add(ner_tag) return labels diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 2b760c878..1f8b4eb7a 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -5,6 +5,7 @@ import srsly from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.types import Floats2d import warnings +from itertools import islice from ..tokens.doc cimport Doc from ..morphology cimport Morphology @@ -16,7 +17,7 @@ from ..attrs import POS, ID from ..parts_of_speech import X from ..errors import Errors, TempErrors, Warnings from ..scorer import Scorer -from ..gold import validate_examples 
+from ..training import validate_examples from .. import util @@ -258,10 +259,11 @@ class Tagger(Pipe): return float(loss), d_scores def begin_training(self, get_examples, *, pipeline=None, sgd=None): - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects.. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -271,32 +273,24 @@ class Tagger(Pipe): DOCS: https://nightly.spacy.io/api/tagger#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="Tagger", obj=type(get_examples)) - raise ValueError(err) - tags = set() + self._ensure_examples(get_examples) doc_sample = [] + label_sample = [] + tags = set() for example in get_examples(): for token in example.y: - tags.add(token.tag_) - if len(doc_sample) < 10: - doc_sample.append(example.x) - if not doc_sample: - doc_sample.append(Doc(self.vocab, words=["hello"])) + if token.tag_: + tags.add(token.tag_) for tag in sorted(tags): self.add_label(tag) - if len(self.labels) == 0: - err = Errors.E1006.format(name="Tagger") - raise ValueError(err) - self.set_output(len(self.labels)) - if doc_sample: - label_sample = [ - self.model.ops.alloc2f(len(doc), len(self.labels)) - for doc in doc_sample - ] - self.model.initialize(X=doc_sample, Y=label_sample) - else: - self.model.initialize() + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + gold_tags = example.get_aligned("TAG", as_string=True) + gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags] + 
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32")) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd @@ -313,6 +307,7 @@ class Tagger(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 + self._allow_extra_label() self.cfg["labels"].append(label) self.vocab.strings.add(label) return 1 diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index d6efb4348..4be6f580d 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,3 +1,4 @@ +from itertools import islice from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.types import Floats2d @@ -5,7 +6,7 @@ import numpy from .pipe import Pipe from ..language import Language -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..errors import Errors from ..scorer import Scorer from .. import util @@ -128,11 +129,6 @@ class TextCategorizer(Pipe): """ return tuple(self.cfg.setdefault("labels", [])) - def require_labels(self) -> None: - """Raise an error if the component's model has no labels defined.""" - if not self.labels: - raise ValueError(Errors.E143.format(name=self.name)) - @labels.setter def labels(self, value: Iterable[str]) -> None: self.cfg["labels"] = tuple(value) @@ -311,17 +307,7 @@ class TextCategorizer(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model.has_dim("nO"): - # This functionality was available previously, but was broken. - # The problem is that we resize the last layer, but the last layer - # is actually just an ensemble. We're not resizing the child layers - # - a huge problem. 
- raise ValueError(Errors.E116) - # smaller = self.model._layers[-1] - # larger = Linear(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger + self._allow_extra_label() self.labels = tuple(list(self.labels) + [label]) return 1 @@ -332,10 +318,11 @@ class TextCategorizer(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ) -> Optimizer: - """Initialize the pipe for training, using data examples if available. + """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. 
@@ -345,22 +332,19 @@ class TextCategorizer(Pipe): DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training """ - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) subbatch = [] # Select a subbatch of examples to initialize the model - for example in get_examples(): + for example in islice(get_examples(), 10): if len(subbatch) < 2: subbatch.append(example) for cat in example.y.cats: self.add_label(cat) - self.require_labels() - docs = [eg.reference for eg in subbatch] - if not docs: # need at least one doc - docs = [Doc(self.vocab, words=["hello"])] - truths, _ = self._examples_to_truth(subbatch) - self.set_output(len(self.labels)) - self.model.initialize(X=docs, Y=truths) + doc_sample = [eg.reference for eg in subbatch] + label_sample, _ = self._examples_to_truth(subbatch) + self._require_labels() + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + assert len(label_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample, Y=label_sample) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5657d687d..721c67a19 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -1,8 +1,9 @@ from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from thinc.api import Model, set_dropout_rate, Optimizer, Config +from itertools import islice from .pipe import Pipe -from ..gold import Example, validate_examples +from ..training import Example, validate_examples from ..tokens import Doc from ..vocab import Vocab from ..language import Language @@ -209,10 +210,11 @@ class Tok2Vec(Pipe): pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, sgd: Optional[Optimizer] = None, ): - """Initialize the pipe for training, using data examples if available. 
+ """Initialize the pipe for training, using a representative set + of data examples. - get_examples (Callable[[], Iterable[Example]]): Optional function that - returns gold-standard Example objects. + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. pipeline (List[Tuple[str, Callable]]): Optional list of pipeline components that this component is part of. Corresponds to nlp.pipeline. @@ -222,8 +224,12 @@ class Tok2Vec(Pipe): DOCS: https://nightly.spacy.io/api/tok2vec#begin_training """ - docs = [Doc(self.vocab, words=["hello"])] - self.model.initialize(X=docs) + self._ensure_examples(get_examples) + doc_sample = [] + for example in islice(get_examples(), 10): + doc_sample.append(example.x) + assert doc_sample, Errors.E923.format(name=self.name) + self.model.initialize(X=doc_sample) def add_label(self, label): raise NotImplementedError diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 5a6b491e0..1350e1f12 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..tokens.doc cimport Doc -from ..gold import validate_examples +from ..training import validate_examples from ..errors import Errors, Warnings from .. 
import util @@ -244,7 +244,7 @@ cdef class Parser(Pipe): int nr_class, int batch_size) nogil: # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc with gil: - assert self.moves.n_moves > 0 + assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = calloc(self.moves.n_moves, sizeof(int)) cdef int i, guess cdef Transition action @@ -378,7 +378,7 @@ cdef class Parser(Pipe): cdef int i # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc - assert self.moves.n_moves > 0 + assert self.moves.n_moves > 0, Errors.E924.format(name=self.name) is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) costs = mem.alloc(self.moves.n_moves, sizeof(float)) @@ -406,9 +406,7 @@ cdef class Parser(Pipe): self.model.attrs["resize_output"](self.model, nO) def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): - if not hasattr(get_examples, "__call__"): - err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples)) - raise ValueError(err) + self._ensure_examples(get_examples) self.cfg.update(kwargs) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: @@ -430,9 +428,6 @@ cdef class Parser(Pipe): if sgd is None: sgd = self.create_optimizer() doc_sample = [] - for example in islice(get_examples(), 10): - doc_sample.append(example.predicted) - if pipeline is not None: for name, component in pipeline: if component is self: @@ -441,10 +436,11 @@ cdef class Parser(Pipe): doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] - if doc_sample: - self.model.initialize(doc_sample) - else: - self.model.initialize() + if not doc_sample: + for example in islice(get_examples(), 10): + doc_sample.append(example.predicted) + assert len(doc_sample) > 0, Errors.E923.format(name=self.name) + self.model.initialize(doc_sample) if 
pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) return sgd diff --git a/spacy/schemas.py b/spacy/schemas.py index 59af53301..baa893802 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -12,7 +12,7 @@ from .attrs import NAMES if TYPE_CHECKING: # This lets us add type hints for mypy etc. without causing circular imports from .language import Language # noqa: F401 - from .gold import Example # noqa: F401 + from .training import Example # noqa: F401 ItemT = TypeVar("ItemT") diff --git a/spacy/scorer.py b/spacy/scorer.py index 9b1831a91..7f7418237 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,7 +1,7 @@ from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING import numpy as np -from .gold import Example +from .training import Example from .tokens import Token, Doc, Span from .errors import Errors from .util import get_lang_class, SimpleFrozenList diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index d6e345336..751bd36d4 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,5 +1,6 @@ +from spacy.training import Example from spacy.pipeline import EntityRecognizer -from spacy.tokens import Span +from spacy.tokens import Span, Doc from spacy import registry import pytest @@ -7,6 +8,12 @@ from ..util import get_doc from spacy.pipeline.ner import DEFAULT_NER_MODEL +def _ner_example(ner): + doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} + return Example.from_dict(doc, gold) + + def test_doc_add_entities_set_ents_iob(en_vocab): text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) @@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - 
ner.begin_training(lambda: []) + ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - assert len(list(doc.ents)) == 0 - assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] @@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab): def test_ents_reset(en_vocab): + """Ensure that resetting doc.ents does not change anything""" text = ["This", "is", "a", "lion"] doc = get_doc(en_vocab, text) config = { @@ -41,11 +47,11 @@ def test_ents_reset(en_vocab): cfg = {"model": DEFAULT_NER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] ner = EntityRecognizer(en_vocab, model, **config) - ner.begin_training(lambda: []) + ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) + orig_iobs = [t.ent_iob_ for t in doc] doc.ents = list(doc.ents) - assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) + assert [t.ent_iob_ for t in doc] == orig_iobs def test_add_overlapping_entities(en_vocab): diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index fce5f679f..0da42daa2 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed from spacy import registry from spacy.attrs import NORM from spacy.vocab import Vocab -from spacy.gold import Example +from spacy.training import Example from spacy.tokens import Doc from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline.ner import DEFAULT_NER_MODEL @@ -35,7 +35,7 @@ def test_init_parser(parser): def _train_parser(parser): fix_random_seed(1) parser.add_label("left") - parser.begin_training(lambda: [], **parser.cfg) + parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(5): @@ -47,16 +47,25 @@ def _train_parser(parser): return parser +def 
_parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + +def _ner_example(ner): + doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"]) + gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]} + return Example.from_dict(doc, gold) + + def test_add_label(parser): parser = _train_parser(parser) parser.add_label("right") sgd = Adam(0.001) for i in range(100): losses = {} - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - example = Example.from_dict(doc, gold) - parser.update([example], sgd=sgd, losses=losses) + parser.update([_parser_example(parser)], sgd=sgd, losses=losses) doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].dep_ == "right" @@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly(): ner1.add_label("C") ner1.add_label("B") ner1.add_label("A") - ner1.begin_training(lambda: []) + ner1.begin_training(lambda: [_ner_example(ner1)]) ner2 = EntityRecognizer(Vocab(), model, **config) # the second model needs to be resized before we can call from_bytes diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index fd1880030..826fc1d87 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,7 +1,7 @@ import pytest from spacy.vocab import Vocab from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.pipeline import DependencyParser from spacy.tokens import Doc from spacy.pipeline._parser_internals.nonproj import projectivize diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index c7a1ed0d2..548cd2697 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -4,7 
+4,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown -from spacy.gold import Example +from spacy.training import Example from spacy.tokens import Doc from spacy.vocab import Vocab import logging diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 6594c7e78..0747241d8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,7 +1,7 @@ import pytest from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.vocab import Vocab from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline.transition_parser import Parser diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 8265a8a45..8d45e2132 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -3,7 +3,7 @@ import pytest from spacy.lang.en import English from ..util import get_doc, apply_transition_sequence, make_tempdir from ... 
import util -from ...gold import Example +from ...training import Example TRAIN_DATA = [ ( @@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer): pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] tokens = en_tokenizer(text) doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, + tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos ) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index 594498b0b..1de05be1b 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -3,7 +3,7 @@ from thinc.api import Adam from spacy.attrs import NORM from spacy.vocab import Vocab from spacy import registry -from spacy.gold import Example +from spacy.training import Example from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.tokens import Doc from spacy.pipeline import DependencyParser @@ -14,6 +14,12 @@ def vocab(): return Vocab(lex_attr_getters={NORM: lambda s: s}) +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + @pytest.fixture def parser(vocab): config = { @@ -28,7 +34,7 @@ def parser(vocab): parser.cfg["hidden_width"] = 32 # parser.add_label('right') parser.add_label("left") - parser.begin_training(lambda: [], **parser.cfg) + parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg) sgd = Adam(0.001) for i in range(10): diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index c12a2b650..9254688cc 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -1,6 +1,6 @@ import pytest import numpy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en 
import English from spacy.pipeline import AttributeRuler from spacy import util, registry diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 4eaa71272..c43d2c58e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -4,7 +4,7 @@ import pytest from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy import util, registry -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.tests.util import make_tempdir from spacy.tokens import Span @@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp): def test_preserving_links_asdoc(nlp): """Test that Span.as_doc preserves the existing entity links""" + vector_length = 1 @registry.misc.register("myLocationsKB.v1") def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def create_kb(vocab): - mykb = KnowledgeBase(vocab, entity_vector_length=1) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) @@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} - el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True) - el_pipe.begin_training(lambda: []) - el_pipe.incl_context = False - el_pipe.incl_prior = True + entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True) + nlp.begin_training() + assert entity_linker.model.get_dim("nO") == vector_length # test whether the entity links are preserved by the `as_doc()` function text = "She lives in Boston. He lives in Denver." 
@@ -373,6 +373,7 @@ def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() nlp.add_pipe("sentencizer") + vector_length = 3 # Add a custom component to recognize "Russ Cochran" as an entity for the example training data patterns = [ @@ -393,7 +394,7 @@ def test_overfitting_IO(): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher - mykb = KnowledgeBase(vocab, entity_vector_length=3) + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( @@ -406,14 +407,17 @@ def test_overfitting_IO(): return create_kb # Create the Entity Linker component and add it to the pipeline - nlp.add_pipe( + entity_linker = nlp.add_pipe( "entity_linker", config={"kb_loader": {"@misc": "myOverfittingKB.v1"}}, last=True, ) # train the NEL pipe - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert entity_linker.model.get_dim("nO") == vector_length + assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length + for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) diff --git a/spacy/tests/pipeline/test_morphologizer.py b/spacy/tests/pipeline/test_morphologizer.py index 501c00f84..864c7332e 100644 --- a/spacy/tests/pipeline/test_morphologizer.py +++ b/spacy/tests/pipeline/test_morphologizer.py @@ -1,7 +1,7 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -25,27 +25,61 @@ TRAIN_DATA = [ }, ), # test combinations of morph+POS - ("Eat blue ham", {"morphs": 
["Feat=V", "", ""], "pos": ["", "ADJ", ""]},), + ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}), ] +def test_no_label(): + nlp = Language() + nlp.add_pipe("morphologizer") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_implicit_label(): + nlp = Language() + nlp.add_pipe("morphologizer") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_no_resize(): + nlp = Language() + morphologizer = nlp.add_pipe("morphologizer") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB") + nlp.begin_training() + # this throws an error because the morphologizer can't be resized after initialization + with pytest.raises(ValueError): + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ") + + +def test_begin_training_examples(): + nlp = Language() + morphologizer = nlp.add_pipe("morphologizer") + morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly nlp = English() - morphologizer = nlp.add_pipe("morphologizer") + nlp.add_pipe("morphologizer") train_examples = [] for inst in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) - for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): - if morph and pos: - 
morphologizer.add_label( - morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos - ) - elif pos: - morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos) - elif morph: - morphologizer.add_label(morph) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) for i in range(50): losses = {} @@ -55,18 +89,8 @@ def test_overfitting_IO(): # test the trained model test_text = "I like blue ham" doc = nlp(test_text) - gold_morphs = [ - "Feat=N", - "Feat=V", - "", - "", - ] - gold_pos_tags = [ - "NOUN", - "VERB", - "ADJ", - "", - ] + gold_morphs = ["Feat=N", "Feat=V", "", ""] + gold_pos_tags = ["NOUN", "VERB", "ADJ", ""] assert [t.morph_ for t in doc] == gold_morphs assert [t.pos_ for t in doc] == gold_pos_tags diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py index b64fa8581..1752df5d0 100644 --- a/spacy/tests/pipeline/test_senter.py +++ b/spacy/tests/pipeline/test_senter.py @@ -1,7 +1,7 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language from spacy.tests.util import make_tempdir @@ -30,6 +30,20 @@ TRAIN_DATA = [ ), ] +def test_begin_training_examples(): + nlp = Language() + senter = nlp.add_pipe("senter") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + def test_overfitting_IO(): # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly diff --git a/spacy/tests/pipeline/test_simple_ner.py 
b/spacy/tests/pipeline/test_simple_ner.py index b012a2cd6..940743ce0 100644 --- a/spacy/tests/pipeline/test_simple_ner.py +++ b/spacy/tests/pipeline/test_simple_ner.py @@ -1,15 +1,78 @@ +import pytest from spacy.lang.en import English -from spacy.gold import Example +from spacy.training import Example from spacy import util from ..util import make_tempdir TRAIN_DATA = [ - ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("Who is Shaka S Khan?", {"entities": [(7, 19, "PERSON")]}), ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), ] +def test_no_label(): + nlp = English() + nlp.add_pipe("simple_ner") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_implicit_label(): + nlp = English() + ner = nlp.add_pipe("simple_ner") + train_examples = [] + ner.add_label("ORG") + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +@pytest.mark.skip(reason="Should be fixed") +def test_untrained(): + # This shouldn't crash, but it does when the simple_ner produces an invalid sequence like ['L-PERSON', 'L-ORG'] + nlp = English() + ner = nlp.add_pipe("simple_ner") + ner.add_label("PERSON") + ner.add_label("LOC") + ner.add_label("ORG") + nlp.begin_training() + nlp("Example sentence") + + +def test_resize(): + nlp = English() + ner = nlp.add_pipe("simple_ner") + ner.add_label("PERSON") + ner.add_label("LOC") + nlp.begin_training() + assert len(ner.labels) == 2 + ner.add_label("ORG") + nlp.begin_training() + assert len(ner.labels) == 3 + + +def test_begin_training_examples(): + nlp = English() + ner = nlp.add_pipe("simple_ner") + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + # you shouldn't really call this more than once, but for testing it should be fine + 
nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: train_examples[0]) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=lambda: []) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly nlp = English() @@ -17,9 +80,7 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for ent in annotations.get("entities"): - ner.add_label(ent[2]) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) for i in range(50): losses = {} diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 540301eac..cd5927675 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,6 @@ import pytest from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.language import Language @@ -34,6 +34,56 @@ TRAIN_DATA = [ ] +def test_no_label(): + nlp = Language() + nlp.add_pipe("tagger") + with pytest.raises(ValueError): + nlp.begin_training() + + +def test_no_resize(): + nlp = Language() + tagger = nlp.add_pipe("tagger") + tagger.add_label("N") + tagger.add_label("V") + assert tagger.labels == ("N", "V") + nlp.begin_training() + assert tagger.model.get_dim("nO") == 2 + # this throws an error because the tagger can't be resized after initialization + with pytest.raises(ValueError): + tagger.add_label("J") + + +def test_implicit_label(): + nlp = Language() + nlp.add_pipe("tagger") + train_examples = [] + for t in TRAIN_DATA: + 
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_begin_training_examples(): + nlp = Language() + tagger = nlp.add_pipe("tagger") + train_examples = [] + for tag in TAGS: + tagger.add_label(tag) + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: train_examples[0]) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=lambda: []) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly nlp = English() @@ -41,9 +91,8 @@ def test_overfitting_IO(): train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - for tag in TAGS: - tagger.add_label(tag) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert tagger.model.get_dim("nO") == len(TAGS) for i in range(50): losses = {} diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 12ead90cb..3f9506bb1 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -10,7 +10,7 @@ from spacy.tokens import Doc from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from ..util import make_tempdir -from ...gold import Example +from ...training import Example TRAIN_DATA = [ @@ -80,6 +80,51 @@ def test_label_types(): textcat.add_label(9) +def test_no_label(): + nlp = Language() + nlp.add_pipe("textcat") + with pytest.raises(ValueError): + 
nlp.begin_training() + + +def test_implicit_label(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + nlp.begin_training(get_examples=lambda: train_examples) + + +def test_no_resize(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + textcat.add_label("POSITIVE") + textcat.add_label("NEGATIVE") + nlp.begin_training() + assert textcat.model.get_dim("nO") == 2 + # this throws an error because the textcat can't be resized after initialization + with pytest.raises(ValueError): + textcat.add_label("NEUTRAL") + + +def test_begin_training_examples(): + nlp = Language() + textcat = nlp.add_pipe("textcat") + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for label, value in annotations.get("cats").items(): + textcat.add_label(label) + # you shouldn't really call this more than once, but for testing it should be fine + nlp.begin_training() + nlp.begin_training(get_examples=lambda: train_examples) + with pytest.raises(TypeError): + nlp.begin_training(get_examples=lambda: None) + with pytest.raises(ValueError): + nlp.begin_training(get_examples=train_examples) + + def test_overfitting_IO(): # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly fix_random_seed(0) @@ -89,9 +134,8 @@ def test_overfitting_IO(): train_examples = [] for text, annotations in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) - for label, value in annotations.get("cats").items(): - textcat.add_label(label) - optimizer = nlp.begin_training() + optimizer = nlp.begin_training(get_examples=lambda: train_examples) + assert textcat.model.get_dim("nO") == 2 for i in range(50): losses = {} diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 
5c93ea3c8..ed5bcc1a5 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,7 +1,7 @@ import pytest import random from spacy import util -from spacy.gold import Example +from spacy.training import Example from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 4988575ea..c1d726db6 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -3,7 +3,7 @@ import gc import numpy import copy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 259ca9b0c..357fbb84e 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -3,7 +3,7 @@ import numpy from spacy.tokens import Doc from spacy.matcher import Matcher from spacy.displacy import render -from spacy.gold import iob_to_biluo +from spacy.training import iob_to_biluo from spacy.lang.it import Italian from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 3882df0a6..beb8faca1 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,6 +1,6 @@ import pytest from spacy import displacy -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.lang.ja import Japanese from spacy.lang.xx import MultiLanguage @@ -20,7 +20,7 @@ def test_issue2564(): nlp = Language() tagger = nlp.add_pipe("tagger") tagger.add_label("A") - tagger.begin_training(lambda: 
[]) + nlp.begin_training() doc = nlp("hello world") assert doc.is_tagged docs = nlp.pipe(["hello", "world"]) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index fc2a3ed7c..d36e693c7 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token from spacy.matcher import Matcher, PhraseMatcher from spacy.errors import MatchPatternError from spacy.util import minibatch -from spacy.gold import Example +from spacy.training import Example from spacy.lang.hi import Hindi from spacy.lang.es import Spanish from spacy.lang.en import English @@ -251,6 +251,12 @@ def test_issue3803(): assert [t.like_num for t in doc] == [True, True, True, True, True, True] +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + return Example.from_dict(doc, gold) + + def test_issue3830_no_subtok(): """Test that the parser doesn't have subtok label if not learn_tokens""" config = { @@ -264,7 +270,7 @@ def test_issue3830_no_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" not in parser.labels @@ -281,7 +287,7 @@ def test_issue3830_with_subtok(): parser = DependencyParser(Vocab(), model, **config) parser.add_label("nsubj") assert "subtok" not in parser.labels - parser.begin_training(lambda: []) + parser.begin_training(lambda: [_parser_example(parser)]) assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index e846841d4..2beccedcf 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -2,8 +2,8 @@ import 
pytest from spacy.pipeline import Pipe from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span, DocBin -from spacy.gold import Example, Corpus -from spacy.gold.converters import json2docs +from spacy.training import Example, Corpus +from spacy.training.converters import json2docs from spacy.vocab import Vocab from spacy.lang.en import English from spacy.util import minibatch, ensure_path, load_model diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index d83a2c718..9454d7f0c 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -1,9 +1,7 @@ import pytest -from mock import Mock -from spacy.matcher import DependencyMatcher from spacy.tokens import Doc, Span, DocBin -from spacy.gold import Example -from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.training import Example +from spacy.training.converters.conllu2docs import conllu2docs from spacy.lang.en import English from spacy.kb import KnowledgeBase from spacy.vocab import Vocab @@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path import numpy import pickle -from ..util import get_doc, make_tempdir +from ..util import make_tempdir def test_issue4528(en_vocab): diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index af643aadc..531e48ec3 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -64,7 +64,7 @@ def tagger(): # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization tagger.add_label("A") - tagger.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return tagger @@ -85,7 +85,7 @@ def entity_linker(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization - entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) + nlp.begin_training() return entity_linker diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index aa8ea6051..e8c83cbad 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,8 +1,8 @@ import pytest from click import NoSuchOption -from spacy.gold import docs_to_json, biluo_tags_from_offsets -from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs +from spacy.training import docs_to_json, biluo_tags_from_offsets +from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.cli.pretrain import make_docs diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index ebc804235..840d878c2 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -3,7 +3,7 @@ import pytest from spacy.language import Language from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from spacy.gold import Example +from spacy.training import Example from spacy.lang.en import English from spacy.util import registry diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index 321eaae95..597809286 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -1,5 +1,5 @@ import pytest -from spacy.gold.example import Example +from spacy.training.example import Example from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 6dae14210..fb96c0361 100644 --- a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,8 +1,8 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx -from spacy.gold import Example -from spacy.gold.iob_utils import 
biluo_tags_from_offsets +from spacy.training import Example +from spacy.training.iob_utils import biluo_tags_from_offsets from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import _roc_auc_score, _roc_curve from .util import get_doc diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9f0f4b74a..fb30c6ae5 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.gold import Example +from spacy.training import Example from spacy import util from spacy.lang.en import English from .util import get_batch @@ -89,6 +89,7 @@ def test_init_tok2vec(): tok2vec = nlp.add_pipe("tok2vec") assert tok2vec.listeners == [] nlp.begin_training() + assert tok2vec.model.get_dim("nO") cfg_string = """ diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_training.py similarity index 98% rename from spacy/tests/test_gold.py rename to spacy/tests/test_training.py index 334d9fc24..1926aca1f 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_training.py @@ -1,9 +1,10 @@ import numpy -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment -from spacy.gold import spans_from_biluo_tags, iob_to_biluo -from spacy.gold import Corpus, docs_to_json -from spacy.gold.example import Example -from spacy.gold.converters import json2docs +from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment +from spacy.training import spans_from_biluo_tags, iob_to_biluo +from spacy.training import Corpus, docs_to_json +from spacy.training.example import Example +from spacy.training.converters import json2docs +from spacy.training.augment import make_orth_variants_example from spacy.lang.en import English from spacy.tokens import Doc, DocBin from spacy.util import 
get_words_and_spaces, minibatch @@ -12,7 +13,6 @@ import pytest import srsly from .util import make_tempdir -from ..gold.augment import make_orth_variants_example @pytest.fixture diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 40cd71eb5..1f073ab32 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -5,7 +5,7 @@ from .util import get_random_doc from spacy import util from spacy.util import dot_to_object, SimpleFrozenList from thinc.api import Config, Optimizer -from spacy.gold.batchers import minibatch_by_words +from spacy.training.batchers import minibatch_by_words from ..lang.en import English from ..lang.nl import Dutch from ..language import DEFAULT_CONFIG_PATH diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5e7222d40..787cca652 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -24,7 +24,7 @@ from .util import registry from .attrs import intify_attrs from .symbols import ORTH from .scorer import Scorer -from .gold import validate_examples +from .training import validate_examples cdef class Tokenizer: diff --git a/spacy/gold/__init__.pxd b/spacy/training/__init__.pxd similarity index 100% rename from spacy/gold/__init__.pxd rename to spacy/training/__init__.pxd diff --git a/spacy/gold/__init__.py b/spacy/training/__init__.py similarity index 100% rename from spacy/gold/__init__.py rename to spacy/training/__init__.py diff --git a/spacy/gold/align.py b/spacy/training/align.py similarity index 100% rename from spacy/gold/align.py rename to spacy/training/align.py diff --git a/spacy/gold/augment.py b/spacy/training/augment.py similarity index 100% rename from spacy/gold/augment.py rename to spacy/training/augment.py diff --git a/spacy/gold/batchers.py b/spacy/training/batchers.py similarity index 100% rename from spacy/gold/batchers.py rename to spacy/training/batchers.py diff --git a/spacy/gold/converters/__init__.py b/spacy/training/converters/__init__.py similarity index 100% rename from 
spacy/gold/converters/__init__.py rename to spacy/training/converters/__init__.py diff --git a/spacy/gold/converters/conll_ner2docs.py b/spacy/training/converters/conll_ner2docs.py similarity index 99% rename from spacy/gold/converters/conll_ner2docs.py rename to spacy/training/converters/conll_ner2docs.py index c04a77f07..8dcaf2599 100644 --- a/spacy/gold/converters/conll_ner2docs.py +++ b/spacy/training/converters/conll_ner2docs.py @@ -1,7 +1,7 @@ from wasabi import Printer from .. import tags_to_entities -from ...gold import iob_to_biluo +from ...training import iob_to_biluo from ...lang.xx import MultiLanguage from ...tokens import Doc, Span from ...util import load_model diff --git a/spacy/gold/converters/conllu2docs.py b/spacy/training/converters/conllu2docs.py similarity index 99% rename from spacy/gold/converters/conllu2docs.py rename to spacy/training/converters/conllu2docs.py index 11ee86182..85afdeef3 100644 --- a/spacy/gold/converters/conllu2docs.py +++ b/spacy/training/converters/conllu2docs.py @@ -1,7 +1,7 @@ import re from .conll_ner2docs import n_sents_info -from ...gold import iob_to_biluo, spans_from_biluo_tags +from ...training import iob_to_biluo, spans_from_biluo_tags from ...tokens import Doc, Token, Span from ...vocab import Vocab from wasabi import Printer diff --git a/spacy/gold/converters/iob2docs.py b/spacy/training/converters/iob2docs.py similarity index 97% rename from spacy/gold/converters/iob2docs.py rename to spacy/training/converters/iob2docs.py index eebf1266b..f8076c5ab 100644 --- a/spacy/gold/converters/iob2docs.py +++ b/spacy/training/converters/iob2docs.py @@ -1,7 +1,7 @@ from wasabi import Printer from .conll_ner2docs import n_sents_info -from ...gold import iob_to_biluo, tags_to_entities +from ...training import iob_to_biluo, tags_to_entities from ...tokens import Doc, Span from ...util import minibatch diff --git a/spacy/gold/converters/json2docs.py b/spacy/training/converters/json2docs.py similarity index 100% rename from 
spacy/gold/converters/json2docs.py rename to spacy/training/converters/json2docs.py diff --git a/spacy/gold/corpus.py b/spacy/training/corpus.py similarity index 100% rename from spacy/gold/corpus.py rename to spacy/training/corpus.py diff --git a/spacy/gold/example.pxd b/spacy/training/example.pxd similarity index 100% rename from spacy/gold/example.pxd rename to spacy/training/example.pxd diff --git a/spacy/gold/example.pyx b/spacy/training/example.pyx similarity index 100% rename from spacy/gold/example.pyx rename to spacy/training/example.pyx diff --git a/spacy/gold/gold_io.pyx b/spacy/training/gold_io.pyx similarity index 100% rename from spacy/gold/gold_io.pyx rename to spacy/training/gold_io.pyx diff --git a/spacy/gold/iob_utils.py b/spacy/training/iob_utils.py similarity index 97% rename from spacy/gold/iob_utils.py rename to spacy/training/iob_utils.py index 08751cfd4..ceb5e16b8 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -195,13 +195,15 @@ def tags_to_entities(tags): continue elif tag.startswith("I"): if start is None: - raise ValueError(Errors.E067.format(tags=tags[: i + 1])) + raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) continue if tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i elif tag.startswith("L"): + if start is None: + raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1])) entities.append((tag[2:], start, i)) start = None else: diff --git a/spacy/gold/loggers.py b/spacy/training/loggers.py similarity index 100% rename from spacy/gold/loggers.py rename to spacy/training/loggers.py diff --git a/spacy/util.py b/spacy/util.py index fa4815df8..bd567ddc7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -93,6 +93,7 @@ class registry(thinc.registry): # environment. spaCy models packaged with `spacy package` will "advertise" # themselves via entry points. 
models = catalogue.create("spacy", "models", entry_points=True) + cli = catalogue.create("spacy", "cli", entry_points=True) class SimpleFrozenDict(dict): diff --git a/website/README.md b/website/README.md index f3a64d1cb..10a75161b 100644 --- a/website/README.md +++ b/website/README.md @@ -289,11 +289,11 @@ always be the **last element** in the row. > | Column 1 | Column 2 ~~List[Doc]~~ | > ``` -| Name | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| Name | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. 
~~Callable[[List[Doc], FullTransformerBatch], None]~~ | ### List {#list} diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 7852d0482..0291d6dca 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -357,12 +357,12 @@ File /path/to/spacy/ml/models/tok2vec.py (line 207) ℹ [training.logger] Registry @loggers Name spacy.ConsoleLogger.v1 -Module spacy.gold.loggers +Module spacy.training.loggers File /path/to/spacy/gold/loggers.py (line 8) ℹ [training.batcher] Registry @batchers Name spacy.batch_by_words.v1 -Module spacy.gold.batchers +Module spacy.training.batchers File /path/to/spacy/gold/batchers.py (line 49) ℹ [training.batcher.size] Registry @schedules @@ -372,7 +372,7 @@ File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43) ℹ [training.dev_corpus] Registry @readers Name spacy.Corpus.v1 -Module spacy.gold.corpus +Module spacy.training.corpus File /path/to/spacy/gold/corpus.py (line 18) ℹ [training.optimizer] Registry @optimizers @@ -387,7 +387,7 @@ File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91) ℹ [training.train_corpus] Registry @readers Name spacy.Corpus.v1 -Module spacy.gold.corpus +Module spacy.training.corpus File /path/to/spacy/gold/corpus.py (line 18) ``` diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 86cfa9121..b913d9a05 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -58,7 +58,7 @@ train/test skew. > #### Example > > ```python -> from spacy.gold import Corpus +> from spacy.training import Corpus > > # With a single file > corpus = Corpus("./data/train.spacy") @@ -82,7 +82,7 @@ Yield examples from the data. 
> #### Example > > ```python -> from spacy.gold import Corpus +> from spacy.training import Corpus > import spacy > > corpus = Corpus("./train.spacy") diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 3fd2818f4..6a3b528c6 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -175,7 +175,7 @@ run [`spacy pretrain`](/api/cli#pretrain). > > ```python > from spacy.tokens import DocBin -> from spacy.gold import Corpus +> from spacy.training import Corpus > > doc_bin = DocBin(docs=docs) > doc_bin.to_disk("./data.spacy") diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 2434cce43..132e9e8f5 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -22,7 +22,7 @@ both documents. > > ```python > from spacy.tokens import Doc -> from spacy.gold import Example +> from spacy.training import Example > > words = ["hello", "world", "!"] > spaces = [True, False, False] @@ -48,7 +48,7 @@ see the [training format documentation](/api/data-formats#dict-input). > > ```python > from spacy.tokens import Doc -> from spacy.gold import Example +> from spacy.training import Example > > predicted = Doc(vocab, words=["Apply", "some", "sunscreen"]) > token_ref = ["Apply", "some", "sun", "screen"] @@ -301,7 +301,7 @@ tokenizations add up to the same string. For example, you'll be able to align > #### Example > > ```python -> from spacy.gold import Alignment +> from spacy.training import Alignment > > bert_tokens = ["obama", "'", "s", "podcast"] > spacy_tokens = ["obama", "'s", "podcast"] diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 7f2eb2e66..7f66abb5f 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -538,7 +538,7 @@ sequences in the batch. 
## Training data and alignment {#gold source="spacy/gold"} -### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} +### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, @@ -554,7 +554,7 @@ single-token entity. > #### Example > > ```python -> from spacy.gold import biluo_tags_from_offsets +> from spacy.training import biluo_tags_from_offsets > > doc = nlp("I like London.") > entities = [(7, 13, "LOC")] @@ -568,7 +568,7 @@ single-token entity. | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ | -### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} +### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets. @@ -576,7 +576,7 @@ Encode per-token tags following the > #### Example > > ```python -> from spacy.gold import offsets_from_biluo_tags +> from spacy.training import offsets_from_biluo_tags > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] @@ -590,7 +590,7 @@ Encode per-token tags following the | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. 
~~List[Tuple[int, int, str]]~~ | -### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} +### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} Encode per-token tags following the [BILUO scheme](/usage/linguistic-features#accessing-ner) into @@ -600,7 +600,7 @@ token-based tags, e.g. to overwrite the `doc.ents`. > #### Example > > ```python -> from spacy.gold import spans_from_biluo_tags +> from spacy.training import spans_from_biluo_tags > > doc = nlp("I like London.") > tags = ["O", "O", "U-LOC", "O"] diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index b41a18890..fc8a8deef 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -61,11 +61,11 @@ on the transformer architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). 
~~Model[List[Doc], FullTransformerBatch]~~ | +| Setting | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Description | -| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. 
~~Model[List[Doc], FullTransformerBatch]~~ | -| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | -| _keyword-only_ | | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | +| Name | Description | +| ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ | +| `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | +| _keyword-only_ | | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | ## Transformer.\_\_call\_\_ {#call tag="method"} @@ -205,7 +205,7 @@ modifying them. Assign the extracted features to the Doc objects. 
By default, the [`TransformerData`](/api/transformer#transformerdata) object is written to the -[`Doc._.trf_data`](#custom-attributes) attribute. Your annotation_setter +[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` callback is then called, if provided. > #### Example diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index abd92a8ac..5215c0ae5 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -253,10 +253,10 @@ for doc in nlp.pipe(["some text", "some other text"]): You can also customize how the [`Transformer`](/api/transformer) component sets annotations onto the [`Doc`](/api/doc), by specifying a custom -`annotation_setter`. This callback will be called with the raw input and output -data for the whole batch, along with the batch of `Doc` objects, allowing you to -implement whatever you need. The annotation setter is called with a batch of -[`Doc`](/api/doc) objects and a +`set_extra_annotations` function. This callback will be called with the raw +input and output data for the whole batch, along with the batch of `Doc` +objects, allowing you to implement whatever you need. The annotation setter is +called with a batch of [`Doc`](/api/doc) objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the transformers data for the batch. 
@@ -267,7 +267,7 @@ def custom_annotation_setter(docs, trf_data): doc._.custom_attr = data nlp = spacy.load("en_core_trf_lg") -nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter +nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter doc = nlp("This is a text") assert isinstance(doc._.custom_attr, TransformerData) print(doc._.custom_attr.tensors) @@ -314,7 +314,7 @@ component: > get_spans=get_doc_spans, > tokenizer_config={"use_fast": True}, > ), -> annotation_setter=null_annotation_setter, +> set_extra_annotations=null_annotation_setter, > max_batch_items=4096, > ) > ``` @@ -333,7 +333,7 @@ tokenizer_config = {"use_fast": true} [components.transformer.model.get_spans] @span_getters = "spacy-transformers.doc_spans.v1" -[components.transformer.annotation_setter] +[components.transformer.set_extra_annotations] @annotation_setters = "spacy-transformers.null_annotation_setter.v1" ``` diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index b36e9b71f..3cf6316c9 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1366,7 +1366,7 @@ token. ```python ### {executable="true"} -from spacy.gold import Alignment +from spacy.training import Alignment other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 0da350f27..a875df29c 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1500,7 +1500,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline component function and pass it the token texts from the `Doc` object received by the component. 
-The [`gold.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very +The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) function is very helpful here, because it takes a `Doc` object and token-based BILUO tags and returns a sequence of `Span` objects in the `Doc` with added labels. So all your wrapper has to do is compute the entity spans and overwrite the `doc.ents`. @@ -1515,7 +1515,7 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`. ```python ### {highlight="1,8-9"} import your_custom_entity_recognizer -from spacy.gold import offsets_from_biluo_tags +from spacy.training import offsets_from_biluo_tags from spacy.language import Language @Language.component("custom_ner_wrapper") diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 3f0141d72..51e56f2d5 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -735,7 +735,7 @@ as **config settings** – in this case, `source`. ### functions.py {highlight="7-8"} from typing import Callable, Iterator, List import spacy -from spacy.gold import Example +from spacy.training import Example from spacy.language import Language import random @@ -783,7 +783,7 @@ annotations are the same. ### functions.py from typing import Callable, Iterable, Iterator, List import spacy -from spacy.gold import Example +from spacy.training import Example @spacy.registry.batchers("filtering_batch.v1") def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterator[List[Example]]]: