From ec14744ee44e5dfb42f27f9c4edd02910420bdca Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 31 Aug 2020 12:41:39 +0200
Subject: [PATCH] Rename Transformer listener (#6001)

* rename to spacy-transformers.TransformerListener

* add some more tok2vec tests

* use select_pipes

* fix docs - annotation setter was not changed in the end
---
 spacy/cli/templates/quickstart_training.jinja |  6 +-
 spacy/pipeline/pipe.pyx                       |  2 +-
 spacy/pipeline/tok2vec.py                     |  2 +-
 spacy/tests/test_tok2vec.py                   | 97 ++++++++++++++++++-
 website/docs/api/architectures.md             |  4 +-
 website/docs/usage/embeddings-transformers.md |  8 +-
 website/docs/usage/v3.md                      |  2 +-
 7 files changed, 107 insertions(+), 14 deletions(-)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 0071f1b1a..fa9bb6d76 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -42,7 +42,7 @@ factory = "tagger"
 nO = null
 
 [components.tagger.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 
 [components.tagger.model.tok2vec.pooling]
@@ -62,7 +62,7 @@ use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 
 [components.parser.model.tok2vec.pooling]
@@ -82,7 +82,7 @@ use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 
 [components.ner.model.tok2vec.pooling]
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 51251dacc..a3f379a97 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -37,7 +37,7 @@ cdef class Pipe:
         and returned. This usually happens under the hood when the nlp object
         is called on a text and all components are applied to the Doc.
 
-        docs (Doc): The Doc to preocess.
+        docs (Doc): The Doc to process.
         RETURNS (Doc): The processed Doc.
 
         DOCS: https://spacy.io/api/pipe#call
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index dad66ddb3..7e61ccc02 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -88,7 +88,7 @@ class Tok2Vec(Pipe):
         """Add context-sensitive embeddings to the Doc.tensor attribute, allowing
         them to be used as features by downstream components.
 
-        docs (Doc): The Doc to preocess.
+        docs (Doc): The Doc to process.
         RETURNS (Doc): The processed Doc.
 
         DOCS: https://spacy.io/api/tok2vec#call
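Editor's note: the `pipe.pyx` and `tok2vec.py` hunks above only fix docstring typos; the behavior they document is unchanged. As a minimal sketch of what the `Tok2Vec` docstring describes (not part of the patch, and assuming the default `tok2vec` factory in this development version of spaCy v3, mirroring the `test_init_tok2vec` test added below):

```python
# Illustrative sketch, not part of the patch: the Tok2Vec component fills
# Doc.tensor with one context-sensitive vector per token.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("tok2vec")
nlp.begin_training()  # initializes the model with default settings

doc = nlp("green eggs and ham")
assert doc.tensor.shape[0] == len(doc)  # one row per token
```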
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
index b30705088..1068b662d 100644
--- a/spacy/tests/test_tok2vec.py
+++ b/spacy/tests/test_tok2vec.py
@@ -3,11 +3,18 @@ import pytest
 from spacy.ml.models.tok2vec import build_Tok2Vec_model
 from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed
 from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
+from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
 from spacy.vocab import Vocab
 from spacy.tokens import Doc
-
+from spacy.gold import Example
+from spacy import util
+from spacy.lang.en import English
 from .util import get_batch
 
+from thinc.api import Config
+
+from numpy.testing import assert_equal
+
 
 def test_empty_doc():
     width = 128
@@ -41,7 +48,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
             also_use_static_vectors=False,
             also_embed_subwords=True,
         ),
-        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
     )
     tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update(batch)
@@ -74,3 +81,89 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config):
     assert len(vectors) == len(docs)
     assert vectors[0].shape == (len(docs[0]), width)
     backprop(vectors)
+
+
+def test_init_tok2vec():
+    # Simple test to initialize the default tok2vec
+    nlp = English()
+    tok2vec = nlp.add_pipe("tok2vec")
+    assert tok2vec.listeners == []
+    nlp.begin_training()
+
+
+cfg_string = """
+    [nlp]
+    lang = "en"
+    pipeline = ["tok2vec","tagger"]
+
+    [components]
+
+    [components.tagger]
+    factory = "tagger"
+
+    [components.tagger.model]
+    @architectures = "spacy.Tagger.v1"
+    nO = null
+
+    [components.tagger.model.tok2vec]
+    @architectures = "spacy.Tok2VecListener.v1"
+    width = ${components.tok2vec.model.encode.width}
+
+    [components.tok2vec]
+    factory = "tok2vec"
+
+    [components.tok2vec.model]
+    @architectures = "spacy.Tok2Vec.v1"
+
+    [components.tok2vec.model.embed]
+    @architectures = "spacy.MultiHashEmbed.v1"
+    width = ${components.tok2vec.model.encode.width}
+    rows = 2000
+    also_embed_subwords = true
+    also_use_static_vectors = false
+
+    [components.tok2vec.model.encode]
+    @architectures = "spacy.MaxoutWindowEncoder.v1"
+    width = 96
+    depth = 4
+    window_size = 1
+    maxout_pieces = 3
+    """
+
+TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
+
+def test_tok2vec_listener():
+    orig_config = Config().from_str(cfg_string)
+    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp.pipe_names == ["tok2vec", "tagger"]
+    tagger = nlp.get_pipe("tagger")
+    tok2vec = nlp.get_pipe("tok2vec")
+    tagger_tok2vec = tagger.model.get_ref("tok2vec")
+    assert isinstance(tok2vec, Tok2Vec)
+    assert isinstance(tagger_tok2vec, Tok2VecListener)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+        for tag in t[1]["tags"]:
+            tagger.add_label(tag)
+
+    # Check that the Tok2Vec component finds its listeners
+    assert tok2vec.listeners == []
+    optimizer = nlp.begin_training(lambda: train_examples)
+    assert tok2vec.listeners == [tagger_tok2vec]
+
+    for i in range(5):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    doc = nlp("Running the pipeline as a whole.")
+    doc_tensor = tagger_tok2vec.predict([doc])[0]
+    assert_equal(doc.tensor, doc_tensor)
+
+    # TODO: should this warn or error?
+    nlp.select_pipes(disable="tok2vec")
+    assert nlp.pipe_names == ["tagger"]
+    nlp("Running the pipeline with the Tok2Vec component disabled.")
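Editor's note: the new test disables the upstream `tok2vec` component with `nlp.select_pipes` (per the commit message, replacing an older mechanism) and leaves a TODO about whether a listener without its upstream component should warn or error. As a hedged sketch, not part of the patch: `select_pipes` can also be used as a context manager, in which case the disabled components are restored when the block exits.

```python
# Sketch, not part of the patch: select_pipes as a context manager restores
# the disabled components afterwards.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("tok2vec")

with nlp.select_pipes(disable="tok2vec"):
    assert nlp.pipe_names == []
assert nlp.pipe_names == ["tok2vec"]
```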
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 3089fa1b3..e3b26a961 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -346,13 +346,13 @@ in other components, see
 | `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
 
-### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
+### spacy-transformers.TransformerListener.v1 {#TransformerListener}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy-transformers.Tok2VecListener.v1"
+> @architectures = "spacy-transformers.TransformerListener.v1"
 > grad_factor = 1.0
 >
 > [model.pooling]
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 75be71845..fe7fc29c0 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -225,7 +225,7 @@ transformers as subnetworks directly, you can also use them via the
 
 ![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
 
-By default, the `Transformer` component sets the
+The `Transformer` component sets the
 [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
 which lets you access the transformers outputs at runtime.
 
@@ -303,7 +303,7 @@ component:
 >
 > ```python
 > from spacy_transformers import Transformer, TransformerModel
-> from spacy_transformers.annotation_setters import configure_trfdata_setter
+> from spacy_transformers.annotation_setters import null_annotation_setter
 > from spacy_transformers.span_getters import get_doc_spans
 >
 > trf = Transformer(
@@ -313,7 +313,7 @@ component:
 >         get_spans=get_doc_spans,
 >         tokenizer_config={"use_fast": True},
 >     ),
->     annotation_setter=configure_trfdata_setter(),
+>     annotation_setter=null_annotation_setter,
 >     max_batch_items=4096,
 > )
 > ```
@@ -333,7 +333,7 @@ tokenizer_config = {"use_fast": true}
 @span_getters = "doc_spans.v1"
 
 [components.transformer.annotation_setter]
-@annotation_setters = "spacy-transformers.trfdata_setter.v1"
+@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
 ```
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 20b7a139b..de3d7ce33 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -64,7 +64,7 @@ menu:
   [`TransformerData`](/api/transformer#transformerdata),
   [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
 - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
-  [Tok2VecListener](/api/architectures#transformers-Tok2VecListener),
+  [TransformerListener](/api/architectures#TransformerListener),
   [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
 - **Models:** [`en_core_trf_lg_sm`](/models/en)
 - **Implementation:**
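Editor's note: for downstream users, the practical effect of this patch is the registry rename. Configs referring to `spacy-transformers.Tok2VecListener.v1` need to use `spacy-transformers.TransformerListener.v1` instead, while the built-in `spacy.Tok2VecListener.v1` (as used in the new test's `cfg_string`) keeps its name. Below is a minimal sketch, not part of the patch, of a component config using the new name; parsing it only requires thinc, since registry names are not resolved at parse time, and the `reduce_mean.v1` pooling entry is an assumption based on the library's documented defaults.

```python
# Sketch, not part of the patch: parse a config block referencing the renamed
# listener architecture. No spacy-transformers install is needed to parse.
from thinc.api import Config

cfg_str = """
[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.tagger.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
"""

config = Config().from_str(cfg_str)
tok2vec_cfg = config["components"]["tagger"]["model"]["tok2vec"]
assert tok2vec_cfg["@architectures"] == "spacy-transformers.TransformerListener.v1"
```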