spaCy/spacy/tests/test_language.py

938 lines
30 KiB
Python
Raw Normal View History

import itertools
import logging
from unittest import mock
import pytest
from spacy.language import Language
from spacy.scorer import Scorer
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.util import registry, ignore_error, raise_error, find_matching_language
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop
2023-02-03 17:22:25 +03:00
from spacy.util import load_model_from_config
2020-09-15 12:12:12 +03:00
import spacy
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop
2023-02-03 17:22:25 +03:00
from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops
from .util import add_vecs_to_vocab, assert_docs_equal
try:
import torch
# Ensure that we don't deadlock in multiprocessing tests.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
except ImportError:
pass
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop
2023-02-03 17:22:25 +03:00
TAGGER_CFG_STRING = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]
[components]
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
nO = null
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = [2000, 1000, 1000, 1000]
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""
TAGGER_TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
TAGGER_TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
def evil_component(doc):
if "2" in doc.text:
raise ValueError("no dice")
return doc
def perhaps_set_sentences(doc):
if not doc.text.startswith("4"):
doc[-1].is_sent_start = True
return doc
def assert_sents_error(doc):
if not doc.has_annotation("SENT_START"):
raise ValueError("no sents")
return doc
def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy")
logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture
def nlp():
nlp = Language(Vocab())
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
2020-09-28 22:35:09 +03:00
nlp.initialize()
return nlp
def test_language_update(nlp):
text = "hello world"
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
wrongkeyannots = {"LABEL": True}
doc = Doc(nlp.vocab, words=text.split(" "))
example = Example.from_dict(doc, annots)
nlp.update([example])
# Not allowed to call with just one Example
with pytest.raises(TypeError):
nlp.update(example)
# Update with text and dict: not supported anymore since v.3
with pytest.raises(TypeError):
nlp.update((text, annots))
# Update with doc object and dict
with pytest.raises(TypeError):
nlp.update((doc, annots))
# Create examples badly
with pytest.raises(ValueError):
example = Example.from_dict(doc, None)
Improve spacy.gold (no GoldParse, no json format!) (#5555) * Update errors * Remove beam for now (maybe) Remove beam_utils Update setup.py Remove beam * Remove GoldParse WIP on removing goldparse Get ArcEager compiling after GoldParse excise Update setup.py Get spacy.syntax compiling after removing GoldParse Rename NewExample -> Example and clean up Clean html files Start updating tests Update Morphologizer * fix error numbers * fix merge conflict * informative error when calling to_array with wrong field * fix error catching * fixing language and scoring tests * start testing get_aligned * additional tests for new get_aligned function * Draft create_gold_state for arc_eager oracle * Fix import * Fix import * Remove TokenAnnotation code from nonproj * fixing NER one-to-many alignment * Fix many-to-one IOB codes * fix test for misaligned * attempt to fix cases with weird spaces * fix spaces * test_gold_biluo_different_tokenization works * allow None as BILUO annotation * fixed some tests + WIP roundtrip unit test * add spaces to json output format * minibatch utiltiy can deal with strings, docs or examples * fix augment (needs further testing) * various fixes in scripts - needs to be further tested * fix test_cli * cleanup * correct silly typo * add support for MORPH in to/from_array, fix morphologizer overfitting test * fix tagger * fix entity linker * ensure test keeps working with non-linked entities * pipe() takes docs, not examples * small bug fix * textcat bugfix * throw informative error when running the components with the wrong type of objects * fix parser tests to work with example (most still failing) * fix BiluoPushDown parsing entities * small fixes * bugfix tok2vec * fix renames and simple_ner labels * various small fixes * prevent writing dummy values like deps because that could interfer with sent_start values * fix the fix * implement split_sent with aligned SENT_START attribute * test for split sentences with various alignment issues, works * Return ArcEagerGoldParse from ArcEager * Update parser and NER gold stuff * Draft new GoldCorpus class * add links to to_dict * clean up * fix test checking for variants * Fix oracles * Start updating converters * Move converters under spacy.gold * Move things around * Fix naming * Fix name * Update converter to produce DocBin * Update converters * Allow DocBin to take list of Doc objects. * Make spacy convert output docbin * Fix import * Fix docbin * Fix compile in ArcEager * Fix import * Serialize all attrs by default * Update converter * Remove jsonl converter * Add json2docs converter * Draft Corpus class for DocBin * Work on train script * Update Corpus * Update DocBin * Allocate Doc before starting to add words * Make doc.from_array several times faster * Update train.py * Fix Corpus * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests * Skip tests that cause crashes * Skip test causing segfault * Remove GoldCorpus * Update imports * Update after removing GoldCorpus * Fix module name of corpus * Fix mimport * Work on parser oracle * Update arc_eager oracle * Restore ArcEager.get_cost function * Update transition system * Update test_arc_eager_oracle * Remove beam test * Update test * Unskip * Unskip tests * add links to to_dict * clean up * fix test checking for variants * Allow DocBin to take list of Doc objects. * Fix compile in ArcEager * Serialize all attrs by default Move converters under spacy.gold Move things around Fix naming Fix name Update converter to produce DocBin Update converters Make spacy convert output docbin Fix import Fix docbin Fix import Update converter Remove jsonl converter Add json2docs converter * Allocate Doc before starting to add words * Make doc.from_array several times faster * Start updating converters * Work on train script * Draft Corpus class for DocBin Update Corpus Fix Corpus * Update DocBin Add missing strings when serializing * Update train.py * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests Skip tests that cause crashes Skip test causing segfault * Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport * Work on parser oracle Update arc_eager oracle Restore ArcEager.get_cost function Update transition system * Update tests Remove beam test Update test Unskip Unskip tests * Add get_aligned_parse method in Example Fix Example.get_aligned_parse * Add kwargs to Corpus.dev_dataset to match train_dataset * Update nonproj * Use get_aligned_parse in ArcEager * Add another arc-eager oracle test * Remove Example.doc property Remove Example.doc Remove Example.doc Remove Example.doc Remove Example.doc * Update ArcEager oracle Fix Break oracle * Debugging * Fix Corpus * Fix eg.doc * Format * small fixes * limit arg for Corpus * fix test_roundtrip_docs_to_docbin * fix test_make_orth_variants * fix add_label test * Update tests * avoid writing temp dir in json2docs, fixing 4402 test * Update test * Add missing costs to NER oracle * Update test * Work on Example.get_aligned_ner method * Clean up debugging * Xfail tests * Remove prints * Remove print * Xfail some tests * Replace unseen labels for parser * Update test * Update test * Xfail test * Fix Corpus * fix imports * fix docs_to_json * various small fixes * cleanup * Support gold_preproc in Corpus * Support gold_preproc * Pass gold_preproc setting into corpus * Remove debugging * Fix gold_preproc * Fix json2docs converter * Fix convert command * Fix flake8 * Fix import * fix output_dir (converted to Path by typer) * fix var * bugfix: update states after creating golds to avoid out of bounds indexing * Improve efficiency of ArEager oracle * pull merge_sent into iob2docs to avoid Doc creation for each line * fix asserts * bugfix excl Span.end in iob2docs * Support max_length in Corpus * Fix arc_eager oracle * Filter out uannotated sentences in NER * Remove debugging in parser * Simplify NER alignment * Fix conversion of NER data * Fix NER init_gold_batch * Tweak efficiency of precomputable affine * Update onto-json default * Update gold test for NER * Fix parser test * Update test * Add NER data test * Fix convert for single file * Fix test * Hack scorer to avoid evaluating non-nered data * Fix handling of NER data in Example * Output unlabelled spans from O biluo tags in iob_utils * Fix unset variable * Return kept examples from init_gold_batch * Return examples from init_gold_batch * Dont return Example from init_gold_batch * Set spaces on gold doc after conversion * Add test * Fix spaces reading * Improve NER alignment * Improve handling of missing values in NER * Restore the 'cutting' in parser training * Add assertion * Print epochs * Restore random cuts in parser/ner training * Implement Doc.copy * Implement Example.copy * Copy examples at the start of Language.update * Don't unset example docs * Tweak parser model slightly * attempt to fix _guess_spaces * _add_entities_to_doc first, so that links don't get overwritten * fixing get_aligned_ner for one-to-many * fix indexing into x_text * small fix biluo_tags_from_offsets * Add onto-ner config * Simplify NER alignment * Fix NER scoring for partially annotated documents * fix indexing into x_text * fix test_cli failing tests by ignoring spans in doc.ents with empty label * Fix limit * Improve NER alignment * Fix count_train * Remove print statement * fix tests, we're not having nothing but None * fix clumsy fingers * Fix tests * Fix doc.ents * Remove empty docs in Corpus and improve limit * Update config Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
2020-06-26 20:34:12 +03:00
with pytest.raises(KeyError):
example = Example.from_dict(doc, wrongkeyannots)
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec *after* the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop
2023-02-03 17:22:25 +03:00
def test_language_update_updates():
config = Config().from_str(TAGGER_CFG_STRING)
nlp = load_model_from_config(config, auto_fill=True, validate=True)
train_examples = []
for t in TAGGER_TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
optimizer = nlp.initialize(get_examples=lambda: train_examples)
docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
nlp.update(train_examples, sgd=optimizer)
docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
xp = get_array_module(docs_after_update[0].tensor)
assert xp.any(
xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
)
def test_language_evaluate(nlp):
text = "hello world"
Improve spacy.gold (no GoldParse, no json format!) (#5555) * Update errors * Remove beam for now (maybe) Remove beam_utils Update setup.py Remove beam * Remove GoldParse WIP on removing goldparse Get ArcEager compiling after GoldParse excise Update setup.py Get spacy.syntax compiling after removing GoldParse Rename NewExample -> Example and clean up Clean html files Start updating tests Update Morphologizer * fix error numbers * fix merge conflict * informative error when calling to_array with wrong field * fix error catching * fixing language and scoring tests * start testing get_aligned * additional tests for new get_aligned function * Draft create_gold_state for arc_eager oracle * Fix import * Fix import * Remove TokenAnnotation code from nonproj * fixing NER one-to-many alignment * Fix many-to-one IOB codes * fix test for misaligned * attempt to fix cases with weird spaces * fix spaces * test_gold_biluo_different_tokenization works * allow None as BILUO annotation * fixed some tests + WIP roundtrip unit test * add spaces to json output format * minibatch utiltiy can deal with strings, docs or examples * fix augment (needs further testing) * various fixes in scripts - needs to be further tested * fix test_cli * cleanup * correct silly typo * add support for MORPH in to/from_array, fix morphologizer overfitting test * fix tagger * fix entity linker * ensure test keeps working with non-linked entities * pipe() takes docs, not examples * small bug fix * textcat bugfix * throw informative error when running the components with the wrong type of objects * fix parser tests to work with example (most still failing) * fix BiluoPushDown parsing entities * small fixes * bugfix tok2vec * fix renames and simple_ner labels * various small fixes * prevent writing dummy values like deps because that could interfer with sent_start values * fix the fix * implement split_sent with aligned SENT_START attribute * test for split sentences with various alignment issues, works * Return ArcEagerGoldParse from ArcEager * Update parser and NER gold stuff * Draft new GoldCorpus class * add links to to_dict * clean up * fix test checking for variants * Fix oracles * Start updating converters * Move converters under spacy.gold * Move things around * Fix naming * Fix name * Update converter to produce DocBin * Update converters * Allow DocBin to take list of Doc objects. * Make spacy convert output docbin * Fix import * Fix docbin * Fix compile in ArcEager * Fix import * Serialize all attrs by default * Update converter * Remove jsonl converter * Add json2docs converter * Draft Corpus class for DocBin * Work on train script * Update Corpus * Update DocBin * Allocate Doc before starting to add words * Make doc.from_array several times faster * Update train.py * Fix Corpus * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests * Skip tests that cause crashes * Skip test causing segfault * Remove GoldCorpus * Update imports * Update after removing GoldCorpus * Fix module name of corpus * Fix mimport * Work on parser oracle * Update arc_eager oracle * Restore ArcEager.get_cost function * Update transition system * Update test_arc_eager_oracle * Remove beam test * Update test * Unskip * Unskip tests * add links to to_dict * clean up * fix test checking for variants * Allow DocBin to take list of Doc objects. * Fix compile in ArcEager * Serialize all attrs by default Move converters under spacy.gold Move things around Fix naming Fix name Update converter to produce DocBin Update converters Make spacy convert output docbin Fix import Fix docbin Fix import Update converter Remove jsonl converter Add json2docs converter * Allocate Doc before starting to add words * Make doc.from_array several times faster * Start updating converters * Work on train script * Draft Corpus class for DocBin Update Corpus Fix Corpus * Update DocBin Add missing strings when serializing * Update train.py * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests Skip tests that cause crashes Skip test causing segfault * Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport * Work on parser oracle Update arc_eager oracle Restore ArcEager.get_cost function Update transition system * Update tests Remove beam test Update test Unskip Unskip tests * Add get_aligned_parse method in Example Fix Example.get_aligned_parse * Add kwargs to Corpus.dev_dataset to match train_dataset * Update nonproj * Use get_aligned_parse in ArcEager * Add another arc-eager oracle test * Remove Example.doc property Remove Example.doc Remove Example.doc Remove Example.doc Remove Example.doc * Update ArcEager oracle Fix Break oracle * Debugging * Fix Corpus * Fix eg.doc * Format * small fixes * limit arg for Corpus * fix test_roundtrip_docs_to_docbin * fix test_make_orth_variants * fix add_label test * Update tests * avoid writing temp dir in json2docs, fixing 4402 test * Update test * Add missing costs to NER oracle * Update test * Work on Example.get_aligned_ner method * Clean up debugging * Xfail tests * Remove prints * Remove print * Xfail some tests * Replace unseen labels for parser * Update test * Update test * Xfail test * Fix Corpus * fix imports * fix docs_to_json * various small fixes * cleanup * Support gold_preproc in Corpus * Support gold_preproc * Pass gold_preproc setting into corpus * Remove debugging * Fix gold_preproc * Fix json2docs converter * Fix convert command * Fix flake8 * Fix import * fix output_dir (converted to Path by typer) * fix var * bugfix: update states after creating golds to avoid out of bounds indexing * Improve efficiency of ArEager oracle * pull merge_sent into iob2docs to avoid Doc creation for each line * fix asserts * bugfix excl Span.end in iob2docs * Support max_length in Corpus * Fix arc_eager oracle * Filter out uannotated sentences in NER * Remove debugging in parser * Simplify NER alignment * Fix conversion of NER data * Fix NER init_gold_batch * Tweak efficiency of precomputable affine * Update onto-json default * Update gold test for NER * Fix parser test * Update test * Add NER data test * Fix convert for single file * Fix test * Hack scorer to avoid evaluating non-nered data * Fix handling of NER data in Example * Output unlabelled spans from O biluo tags in iob_utils * Fix unset variable * Return kept examples from init_gold_batch * Return examples from init_gold_batch * Dont return Example from init_gold_batch * Set spaces on gold doc after conversion * Add test * Fix spaces reading * Improve NER alignment * Improve handling of missing values in NER * Restore the 'cutting' in parser training * Add assertion * Print epochs * Restore random cuts in parser/ner training * Implement Doc.copy * Implement Example.copy * Copy examples at the start of Language.update * Don't unset example docs * Tweak parser model slightly * attempt to fix _guess_spaces * _add_entities_to_doc first, so that links don't get overwritten * fixing get_aligned_ner for one-to-many * fix indexing into x_text * small fix biluo_tags_from_offsets * Add onto-ner config * Simplify NER alignment * Fix NER scoring for partially annotated documents * fix indexing into x_text * fix test_cli failing tests by ignoring spans in doc.ents with empty label * Fix limit * Improve NER alignment * Fix count_train * Remove print statement * fix tests, we're not having nothing but None * fix clumsy fingers * Fix tests * Fix doc.ents * Remove empty docs in Corpus and improve limit * Update config Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
2020-06-26 20:34:12 +03:00
annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
doc = Doc(nlp.vocab, words=text.split(" "))
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
assert scores["speed"] > 0
# test with generator
scores = nlp.evaluate(eg for eg in [example])
assert scores["speed"] > 0
# Not allowed to call with just one Example
with pytest.raises(TypeError):
nlp.evaluate(example)
# Evaluate with text and dict: not supported anymore since v.3
with pytest.raises(TypeError):
nlp.evaluate([(text, annots)])
# Evaluate with doc object and dict
with pytest.raises(TypeError):
nlp.evaluate([(doc, annots)])
with pytest.raises(TypeError):
Improve spacy.gold (no GoldParse, no json format!) (#5555) * Update errors * Remove beam for now (maybe) Remove beam_utils Update setup.py Remove beam * Remove GoldParse WIP on removing goldparse Get ArcEager compiling after GoldParse excise Update setup.py Get spacy.syntax compiling after removing GoldParse Rename NewExample -> Example and clean up Clean html files Start updating tests Update Morphologizer * fix error numbers * fix merge conflict * informative error when calling to_array with wrong field * fix error catching * fixing language and scoring tests * start testing get_aligned * additional tests for new get_aligned function * Draft create_gold_state for arc_eager oracle * Fix import * Fix import * Remove TokenAnnotation code from nonproj * fixing NER one-to-many alignment * Fix many-to-one IOB codes * fix test for misaligned * attempt to fix cases with weird spaces * fix spaces * test_gold_biluo_different_tokenization works * allow None as BILUO annotation * fixed some tests + WIP roundtrip unit test * add spaces to json output format * minibatch utiltiy can deal with strings, docs or examples * fix augment (needs further testing) * various fixes in scripts - needs to be further tested * fix test_cli * cleanup * correct silly typo * add support for MORPH in to/from_array, fix morphologizer overfitting test * fix tagger * fix entity linker * ensure test keeps working with non-linked entities * pipe() takes docs, not examples * small bug fix * textcat bugfix * throw informative error when running the components with the wrong type of objects * fix parser tests to work with example (most still failing) * fix BiluoPushDown parsing entities * small fixes * bugfix tok2vec * fix renames and simple_ner labels * various small fixes * prevent writing dummy values like deps because that could interfer with sent_start values * fix the fix * implement split_sent with aligned SENT_START attribute * test for split sentences with various alignment issues, works * Return ArcEagerGoldParse from ArcEager * Update parser and NER gold stuff * Draft new GoldCorpus class * add links to to_dict * clean up * fix test checking for variants * Fix oracles * Start updating converters * Move converters under spacy.gold * Move things around * Fix naming * Fix name * Update converter to produce DocBin * Update converters * Allow DocBin to take list of Doc objects. * Make spacy convert output docbin * Fix import * Fix docbin * Fix compile in ArcEager * Fix import * Serialize all attrs by default * Update converter * Remove jsonl converter * Add json2docs converter * Draft Corpus class for DocBin * Work on train script * Update Corpus * Update DocBin * Allocate Doc before starting to add words * Make doc.from_array several times faster * Update train.py * Fix Corpus * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests * Skip tests that cause crashes * Skip test causing segfault * Remove GoldCorpus * Update imports * Update after removing GoldCorpus * Fix module name of corpus * Fix mimport * Work on parser oracle * Update arc_eager oracle * Restore ArcEager.get_cost function * Update transition system * Update test_arc_eager_oracle * Remove beam test * Update test * Unskip * Unskip tests * add links to to_dict * clean up * fix test checking for variants * Allow DocBin to take list of Doc objects. * Fix compile in ArcEager * Serialize all attrs by default Move converters under spacy.gold Move things around Fix naming Fix name Update converter to produce DocBin Update converters Make spacy convert output docbin Fix import Fix docbin Fix import Update converter Remove jsonl converter Add json2docs converter * Allocate Doc before starting to add words * Make doc.from_array several times faster * Start updating converters * Work on train script * Draft Corpus class for DocBin Update Corpus Fix Corpus * Update DocBin Add missing strings when serializing * Update train.py * Fix parser model * Start debugging arc_eager oracle * Update header * Fix parser declaration * Xfail some tests Skip tests that cause crashes Skip test causing segfault * Remove GoldCorpus Update imports Update after removing GoldCorpus Fix module name of corpus Fix mimport * Work on parser oracle Update arc_eager oracle Restore ArcEager.get_cost function Update transition system * Update tests Remove beam test Update test Unskip Unskip tests * Add get_aligned_parse method in Example Fix Example.get_aligned_parse * Add kwargs to Corpus.dev_dataset to match train_dataset * Update nonproj * Use get_aligned_parse in ArcEager * Add another arc-eager oracle test * Remove Example.doc property Remove Example.doc Remove Example.doc Remove Example.doc Remove Example.doc * Update ArcEager oracle Fix Break oracle * Debugging * Fix Corpus * Fix eg.doc * Format * small fixes * limit arg for Corpus * fix test_roundtrip_docs_to_docbin * fix test_make_orth_variants * fix add_label test * Update tests * avoid writing temp dir in json2docs, fixing 4402 test * Update test * Add missing costs to NER oracle * Update test * Work on Example.get_aligned_ner method * Clean up debugging * Xfail tests * Remove prints * Remove print * Xfail some tests * Replace unseen labels for parser * Update test * Update test * Xfail test * Fix Corpus * fix imports * fix docs_to_json * various small fixes * cleanup * Support gold_preproc in Corpus * Support gold_preproc * Pass gold_preproc setting into corpus * Remove debugging * Fix gold_preproc * Fix json2docs converter * Fix convert command * Fix flake8 * Fix import * fix output_dir (converted to Path by typer) * fix var * bugfix: update states after creating golds to avoid out of bounds indexing * Improve efficiency of ArEager oracle * pull merge_sent into iob2docs to avoid Doc creation for each line * fix asserts * bugfix excl Span.end in iob2docs * Support max_length in Corpus * Fix arc_eager oracle * Filter out uannotated sentences in NER * Remove debugging in parser * Simplify NER alignment * Fix conversion of NER data * Fix NER init_gold_batch * Tweak efficiency of precomputable affine * Update onto-json default * Update gold test for NER * Fix parser test * Update test * Add NER data test * Fix convert for single file * Fix test * Hack scorer to avoid evaluating non-nered data * Fix handling of NER data in Example * Output unlabelled spans from O biluo tags in iob_utils * Fix unset variable * Return kept examples from init_gold_batch * Return examples from init_gold_batch * Dont return Example from init_gold_batch * Set spaces on gold doc after conversion * Add test * Fix spaces reading * Improve NER alignment * Improve handling of missing values in NER * Restore the 'cutting' in parser training * Add assertion * Print epochs * Restore random cuts in parser/ner training * Implement Doc.copy * Implement Example.copy * Copy examples at the start of Language.update * Don't unset example docs * Tweak parser model slightly * attempt to fix _guess_spaces * _add_entities_to_doc first, so that links don't get overwritten * fixing get_aligned_ner for one-to-many * fix indexing into x_text * small fix biluo_tags_from_offsets * Add onto-ner config * Simplify NER alignment * Fix NER scoring for partially annotated documents * fix indexing into x_text * fix test_cli failing tests by ignoring spans in doc.ents with empty label * Fix limit * Improve NER alignment * Fix count_train * Remove print statement * fix tests, we're not having nothing but None * fix clumsy fingers * Fix tests * Fix doc.ents * Remove empty docs in Corpus and improve limit * Update config Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
2020-06-26 20:34:12 +03:00
nlp.evaluate([text, annots])
def test_evaluate_no_pipe(nlp):
"""Test that docs are processed correctly within Language.pipe if the
component doesn't expose a .pipe method."""
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
@Language.component("test_evaluate_no_pipe")
def pipe(doc):
return doc
text = "hello world"
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
nlp = Language(Vocab())
doc = nlp(text)
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
nlp.add_pipe("test_evaluate_no_pipe")
nlp.evaluate([Example.from_dict(doc, annots)])
def test_evaluate_textcat_multilabel(en_vocab):
"""Test that evaluate works with a multilabel textcat pipe."""
nlp = Language(en_vocab)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
labels = nlp.get_pipe("textcat_multilabel").labels
for label in labels:
assert scores["cats_f_per_type"].get(label) is not None
for key in example.reference.cats.keys():
if key not in labels:
assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_final(en_vocab):
"""Test that evaluate evaluates the final textcat component in a pipeline
with more than one textcat or textcat_multilabel."""
nlp = Language(en_vocab)
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {
"cats": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
"FEATURE": 1.0,
"QUESTION": 1.0,
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
}
}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
# get the labels from the final pipe
labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
for label in labels:
assert scores["cats_f_per_type"].get(label) is not None
for key in example.reference.cats.keys():
if key not in labels:
assert scores["cats_f_per_type"].get(key) is None
def test_evaluate_multiple_textcat_separate(en_vocab):
"""Test that evaluate can evaluate multiple textcat components separately
with custom scorers."""
def custom_textcat_score(examples, **kwargs):
scores = Scorer.score_cats(
examples,
"cats",
multi_label=False,
**kwargs,
)
return {f"custom_{k}": v for k, v in scores.items()}
@spacy.registry.scorers("test_custom_textcat_scorer")
def make_custom_textcat_scorer():
return custom_textcat_score
nlp = Language(en_vocab)
textcat = nlp.add_pipe(
"textcat",
config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
)
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
textcat_multilabel = nlp.add_pipe("textcat_multilabel")
for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
textcat_multilabel.add_label(label)
nlp.initialize()
annots = {
"cats": {
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
"FEATURE": 1.0,
"QUESTION": 1.0,
"POSITIVE": 1.0,
"NEGATIVE": 0.0,
}
}
doc = nlp.make_doc("hello world")
example = Example.from_dict(doc, annots)
scores = nlp.evaluate([example])
# check custom scores for the textcat pipe
assert "custom_cats_f_per_type" in scores
labels = nlp.get_pipe("textcat").labels
assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
# check default scores for the textcat_multilabel pipe
assert "cats_f_per_type" in scores
labels = nlp.get_pipe("textcat_multilabel").labels
assert set(scores["cats_f_per_type"].keys()) == set(labels)
def vector_modification_pipe(doc):
doc.vector += 1
return doc
def userdata_pipe(doc):
doc.user_data["foo"] = "bar"
return doc
def ner_pipe(doc):
span = Span(doc, 0, 1, label="FIRST")
doc.ents += (span,)
return doc
@pytest.fixture
def sample_vectors():
return [
("spacy", [-0.1, -0.2, -0.3]),
("world", [-0.2, -0.3, -0.4]),
("pipe", [0.7, 0.8, 0.9]),
]
@pytest.fixture
def nlp2(nlp, sample_vectors):
2021-06-28 12:48:00 +03:00
Language.component(
"test_language_vector_modification_pipe", func=vector_modification_pipe
)
Language.component("test_language_userdata_pipe", func=userdata_pipe)
Language.component("test_language_ner_pipe", func=ner_pipe)
add_vecs_to_vocab(nlp.vocab, sample_vectors)
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
nlp.add_pipe("test_language_vector_modification_pipe")
nlp.add_pipe("test_language_ner_pipe")
nlp.add_pipe("test_language_userdata_pipe")
return nlp
@pytest.fixture
def texts():
data = [
"Hello world.",
"This is spacy.",
"You can use multiprocessing with pipe method.",
"Please try!",
]
return data
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
texts = texts * 10
expecteds = [nlp2(text) for text in texts]
docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
for doc, expected_doc in zip(docs, expecteds):
assert_docs_equal(doc, expected_doc)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
# check if nlp.pipe can handle infinite length iterator properly.
stream_texts = itertools.cycle(texts)
texts0, texts1 = itertools.tee(stream_texts)
expecteds = (nlp2(text) for text in texts0)
docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
n_fetch = 20
for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
assert_docs_equal(doc, expected_doc)
Refactor pipeline components, config and language data (#5759) * Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-07-22 14:42:59 +03:00
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler(n_process):
"""Test that the error handling of nlp.pipe works well"""
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("merge_subtokens")
nlp.initialize()
texts = ["Curious to see what will happen to this text.", "And this one."]
# the pipeline fails because there's no parser
with pytest.raises(ValueError):
nlp(texts[0])
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
nlp.set_error_handler(raise_error)
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
# set explicitely to ignoring
nlp.set_error_handler(ignore_error)
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == 0
nlp(texts[0])
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_custom(en_vocab, n_process):
"""Test the error handling of a custom component that has no pipe method"""
Language.component("my_evil_component", func=evil_component)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("my_evil_component")
texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]
with pytest.raises(ValueError):
# the evil custom component throws an error
list(nlp.pipe(texts))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
# the errors by the evil custom component raise a warning for each
# bad doc
docs = list(nlp.pipe(texts, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
mock_warning.assert_called()
assert mock_warning.call_count == 2
assert len(docs) + mock_warning.call_count == len(texts)
assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"]
2021-01-30 04:52:33 +03:00
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_input_as_tuples(en_vocab, n_process):
"""Test the error handling of nlp.pipe with input as tuples"""
Language.component("my_evil_component", func=evil_component)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.add_pipe("my_evil_component")
texts = [
("TEXT 111", 111),
("TEXT 222", 222),
("TEXT 333", 333),
("TEXT 342", 342),
("TEXT 666", 666),
]
with pytest.raises(ValueError):
list(nlp.pipe(texts, as_tuples=True))
nlp.set_error_handler(warn_error)
logger = logging.getLogger("spacy")
with mock.patch.object(logger, "warning") as mock_warning:
tuples = list(nlp.pipe(texts, as_tuples=True, n_process=n_process))
# HACK/TODO? the warnings in child processes don't seem to be
# detected by the mock logger
if n_process == 1:
mock_warning.assert_called()
assert mock_warning.call_count == 2
assert len(tuples) + mock_warning.call_count == len(texts)
assert (tuples[0][0].text, tuples[0][1]) == ("TEXT 111", 111)
assert (tuples[1][0].text, tuples[1][1]) == ("TEXT 333", 333)
assert (tuples[2][0].text, tuples[2][1]) == ("TEXT 666", 666)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_pipe(en_vocab, n_process):
"""Test the error handling of a component's pipe method"""
Language.component("my_perhaps_sentences", func=perhaps_set_sentences)
Language.component("assert_sents_error", func=assert_sents_error)
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
texts = [f"{str(i)} is enough. Done" for i in range(100)]
nlp = English()
nlp.add_pipe("my_perhaps_sentences")
nlp.add_pipe("assert_sents_error")
nlp.initialize()
with pytest.raises(ValueError):
# assert_sents_error requires sentence boundaries, will throw an error otherwise
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
nlp.set_error_handler(ignore_error)
docs = list(nlp.pipe(texts, n_process=n_process, batch_size=10))
# we lose/ignore the failing 4,40-49 docs
assert len(docs) == 89
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_make_doc_actual(n_process):
"""Test the error handling for make_doc"""
# TODO: fix so that the following test is the actual behavior
2021-01-30 04:52:33 +03:00
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.max_length = 10
texts = ["12345678901234567890", "12345"] * 10
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
nlp.default_error_handler = ignore_error
if n_process == 1:
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
else:
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == 0
@pytest.mark.xfail
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_error_handler_make_doc_preferred(n_process):
"""Test the error handling for make_doc"""
ops = get_current_ops()
if isinstance(ops, NumpyOps) or n_process < 2:
nlp = English()
nlp.max_length = 10
texts = ["12345678901234567890", "12345"] * 10
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
nlp.default_error_handler = ignore_error
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == 0
def test_language_from_config_before_after_init():
name = "test_language_from_config_before_after_init"
ran_before = False
ran_after = False
ran_after_pipeline = False
ran_before_init = False
ran_after_init = False
@registry.callbacks(f"{name}_before")
def make_before_creation():
def before_creation(lang_cls):
nonlocal ran_before
ran_before = True
assert lang_cls is English
lang_cls.Defaults.foo = "bar"
return lang_cls
return before_creation
@registry.callbacks(f"{name}_after")
def make_after_creation():
def after_creation(nlp):
nonlocal ran_after
ran_after = True
assert isinstance(nlp, English)
assert nlp.pipe_names == []
assert nlp.Defaults.foo == "bar"
nlp.meta["foo"] = "bar"
return nlp
return after_creation
@registry.callbacks(f"{name}_after_pipeline")
def make_after_pipeline_creation():
def after_pipeline_creation(nlp):
nonlocal ran_after_pipeline
ran_after_pipeline = True
assert isinstance(nlp, English)
assert nlp.pipe_names == ["sentencizer"]
assert nlp.Defaults.foo == "bar"
assert nlp.meta["foo"] == "bar"
nlp.meta["bar"] = "baz"
return nlp
return after_pipeline_creation
@registry.callbacks(f"{name}_before_init")
def make_before_init():
def before_init(nlp):
nonlocal ran_before_init
ran_before_init = True
nlp.meta["before_init"] = "before"
return nlp
return before_init
@registry.callbacks(f"{name}_after_init")
def make_after_init():
def after_init(nlp):
nonlocal ran_after_init
ran_after_init = True
nlp.meta["after_init"] = "after"
return nlp
return after_init
config = {
"nlp": {
"pipeline": ["sentencizer"],
"before_creation": {"@callbacks": f"{name}_before"},
"after_creation": {"@callbacks": f"{name}_after"},
"after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
},
"components": {"sentencizer": {"factory": "sentencizer"}},
"initialize": {
"before_init": {"@callbacks": f"{name}_before_init"},
"after_init": {"@callbacks": f"{name}_after_init"},
},
}
nlp = English.from_config(config)
assert nlp.Defaults.foo == "bar"
assert nlp.meta["foo"] == "bar"
assert nlp.meta["bar"] == "baz"
assert "before_init" not in nlp.meta
assert "after_init" not in nlp.meta
assert nlp.pipe_names == ["sentencizer"]
assert nlp("text")
nlp.initialize()
assert nlp.meta["before_init"] == "before"
assert nlp.meta["after_init"] == "after"
2021-01-15 03:57:36 +03:00
assert all(
[ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init]
)
def test_language_from_config_before_after_init_invalid():
"""Check that an error is raised if function doesn't return nlp."""
name = "test_language_from_config_before_after_init_invalid"
registry.callbacks(f"{name}_before1", func=lambda: lambda nlp: None)
registry.callbacks(f"{name}_before2", func=lambda: lambda nlp: nlp())
registry.callbacks(f"{name}_after1", func=lambda: lambda nlp: None)
registry.callbacks(f"{name}_after1", func=lambda: lambda nlp: English)
for callback_name in [f"{name}_before1", f"{name}_before2"]:
config = {"nlp": {"before_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
for callback_name in [f"{name}_after1", f"{name}_after2"]:
config = {"nlp": {"after_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
for callback_name in [f"{name}_after1", f"{name}_after2"]:
config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
2020-08-09 16:24:01 +03:00
def test_language_whitespace_tokenizer():
"""Test the custom whitespace tokenizer from the docs."""
class WhitespaceTokenizer:
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
words = text.split(" ")
spaces = [True] * len(words)
# Avoid zero-length tokens
for i, word in enumerate(words):
if word == "":
words[i] = " "
spaces[i] = False
# Remove the final trailing space
if words[-1] == " ":
words = words[0:-1]
spaces = spaces[0:-1]
else:
spaces[-1] = False
return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
text = " What's happened to me? he thought. It wasn't a dream. "
doc = nlp(text)
assert doc.text == text
2020-08-09 16:24:01 +03:00
def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer"
class CustomTokenizer:
"""Dummy "tokenizer" that splits on spaces and adds prefix to each word."""
def __init__(self, nlp, prefix):
self.vocab = nlp.vocab
self.prefix = prefix
def __call__(self, text):
words = [f"{self.prefix}{word}" for word in text.split(" ")]
return Doc(self.vocab, words=words)
@registry.tokenizers(name)
def custom_create_tokenizer(prefix: str = "_"):
def create_tokenizer(nlp):
return CustomTokenizer(nlp, prefix=prefix)
return create_tokenizer
config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
nlp = English.from_config(config)
doc = nlp("hello world")
assert [t.text for t in doc] == ["_hello", "_world"]
doc = list(nlp.pipe(["hello world"]))[0]
assert [t.text for t in doc] == ["_hello", "_world"]
def test_language_from_config_invalid_lang():
"""Test that calling Language.from_config raises an error and lang defined
in config needs to match language-specific subclasses."""
config = {"nlp": {"lang": "en"}}
with pytest.raises(ValueError):
Language.from_config(config)
with pytest.raises(ValueError):
German.from_config(config)
2020-09-15 12:12:12 +03:00
def test_spacy_blank():
nlp = spacy.blank("en")
assert nlp.config["training"]["dropout"] == 0.1
config = {"training": {"dropout": 0.2}}
meta = {"name": "my_custom_model"}
nlp = spacy.blank("en", config=config, meta=meta)
assert nlp.config["training"]["dropout"] == 0.2
assert nlp.meta["name"] == "my_custom_model"
2020-09-15 14:25:34 +03:00
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("is", "isl"),
("mo", "ro"),
("mul", "mul"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "mul"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
],
)
def test_language_matching(lang, target):
"""
Test that we can look up languages by equivalent or nearly-equivalent
language codes.
"""
assert find_matching_language(lang) == target
@pytest.mark.parametrize(
"lang,target",
[
("en", "en"),
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("is", "isl"),
("mo", "ro"),
("xx", "mul"),
("no", "nb"),
("pt-BR", "pt"),
("zh-Hans", "zh"),
],
)
def test_blank_languages(lang, target):
"""
Test that we can get spacy.blank in various languages, including codes
that are defined to be equivalent or that match by CLDR language matching.
"""
nlp = spacy.blank(lang)
assert nlp.lang == target
2020-09-29 22:39:28 +03:00
@pytest.mark.parametrize("value", [False, None, ["x", "y"], Language, Vocab])
2020-09-15 14:25:34 +03:00
def test_language_init_invalid_vocab(value):
err_fragment = "invalid value"
with pytest.raises(ValueError) as e:
Language(value)
assert err_fragment in str(e.value)
def test_language_source_and_vectors(nlp2):
nlp = Language(Vocab())
textcat = nlp.add_pipe("textcat")
for label in ("POSITIVE", "NEGATIVE"):
textcat.add_label(label)
nlp.initialize()
long_string = "thisisalongstring"
assert long_string not in nlp.vocab.strings
assert long_string not in nlp2.vocab.strings
nlp.vocab.strings.add(long_string)
assert nlp.vocab.vectors.to_bytes() != nlp2.vocab.vectors.to_bytes()
vectors_bytes = nlp.vocab.vectors.to_bytes()
with pytest.warns(UserWarning):
nlp2.add_pipe("textcat", name="textcat2", source=nlp)
# strings should be added
assert long_string in nlp2.vocab.strings
# vectors should remain unmodified
assert nlp.vocab.vectors.to_bytes() == vectors_bytes
@pytest.mark.parametrize("n_process", [1, 2])
def test_pass_doc_to_pipeline(nlp, n_process):
texts = ["cats", "dogs", "guinea pigs"]
docs = [nlp.make_doc(text) for text in texts]
assert not any(len(doc.cats) for doc in docs)
doc = nlp(docs[0])
assert doc.text == texts[0]
assert len(doc.cats) > 0
if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
docs = nlp.pipe(docs, n_process=n_process)
assert [doc.text for doc in docs] == texts
assert all(len(doc.cats) for doc in docs)
def test_invalid_arg_to_pipeline(nlp):
str_list = ["This is a text.", "This is another."]
with pytest.raises(ValueError):
nlp(str_list) # type: ignore
assert len(list(nlp.pipe(str_list))) == 2
int_list = [1, 2, 3]
with pytest.raises(ValueError):
list(nlp.pipe(int_list)) # type: ignore
with pytest.raises(ValueError):
nlp(int_list) # type: ignore
@pytest.mark.skipif(
not isinstance(get_current_ops(), CupyOps), reason="test requires GPU"
)
def test_multiprocessing_gpu_warning(nlp2, texts):
texts = texts * 10
docs = nlp2.pipe(texts, n_process=2, batch_size=2)
with pytest.warns(UserWarning, match="multiprocessing with GPU models"):
with pytest.raises(ValueError):
# Trigger multi-processing.
for _ in docs:
pass
2022-08-19 10:52:12 +03:00
def test_dot_in_factory_names(nlp):
Language.component("my_evil_component", func=evil_component)
nlp.add_pipe("my_evil_component")
with pytest.raises(ValueError, match="not permitted"):
Language.component("my.evil.component.v1", func=evil_component)
with pytest.raises(ValueError, match="not permitted"):
Language.factory("my.evil.component.v1", func=evil_component)
def test_component_return():
"""Test that an error is raised if components return a type other than a
doc."""
nlp = English()
@Language.component("test_component_good_pipe")
def good_pipe(doc):
return doc
nlp.add_pipe("test_component_good_pipe")
nlp("text")
nlp.remove_pipe("test_component_good_pipe")
@Language.component("test_component_bad_pipe")
def bad_pipe(doc):
return doc.text
nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text")
@pytest.mark.slow
@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
def test_distill(teacher_tagger_name):
teacher = English()
teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
train_examples = []
for t in TAGGER_TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses[teacher_tagger_name] < 0.00001
student = English()
student_tagger = student.add_pipe("tagger")
student_tagger.min_tree_freq = 1
student_tagger.initialize(
get_examples=lambda: train_examples, labels=teacher_tagger.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA
]
student_to_teacher = (
None
if teacher_tagger.name == student_tagger.name
else {student_tagger.name: teacher_tagger.name}
)
for i in range(50):
losses = {}
student.distill(
teacher,
distill_examples,
sgd=optimizer,
losses=losses,
student_to_teacher=student_to_teacher,
)
assert losses["tagger"] < 0.00001
test_text = "I like blue eggs"
doc = student(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
# Do an extra update to check if annotates works, though we can't really
# validate the resuls, since the annotations are ephemeral.
student.distill(
teacher,
distill_examples,
sgd=optimizer,
losses=losses,
student_to_teacher=student_to_teacher,
annotates=["tagger"],
)