Rename Transformer listener (#6001)

* rename to spacy-transformers.TransformerListener

* add some more tok2vec tests

* use select_pipes

* fix docs - annotation setter was not changed in the end
Sofie Van Landeghem 2020-08-31 12:41:39 +02:00 committed by GitHub
parent 6ac3299e2e
commit ec14744ee4
7 changed files with 107 additions and 14 deletions
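In practice the rename is a find-and-replace in training configs: only the `@architectures` string changes, the surrounding block stays the same. A minimal sketch of a migrated block (the `reduce_mean.v1` pooling layer here is illustrative, not taken from this diff):

```python
from thinc.api import Config

# After the rename, configs should reference the new architecture name;
# "spacy-transformers.Tok2VecListener.v1" is replaced throughout this commit.
cfg = Config().from_str("""
[model]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[model.pooling]
@architectures = "reduce_mean.v1"
""")
assert cfg["model"]["@architectures"] == "spacy-transformers.TransformerListener.v1"
```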


@@ -42,7 +42,7 @@ factory = "tagger"
nO = null
[components.tagger.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.tagger.model.tok2vec.pooling]
@@ -62,7 +62,7 @@ use_upper = false
nO = null
[components.parser.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.parser.model.tok2vec.pooling]
@@ -82,7 +82,7 @@ use_upper = false
nO = null
[components.ner.model.tok2vec]
-@architectures = "spacy-transformers.Tok2VecListener.v1"
+@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.ner.model.tok2vec.pooling]


@@ -37,7 +37,7 @@ cdef class Pipe:
and returned. This usually happens under the hood when the nlp object
is called on a text and all components are applied to the Doc.
-docs (Doc): The Doc to preocess.
+docs (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/pipe#call
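The docstring fixed here describes the normal entry point: calling the `nlp` object runs each component's `__call__` on the `Doc`. A quick sketch, using the built-in `sentencizer` purely as an example component:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # any pipeline component works the same way

# Calling nlp on a text applies every component's __call__ to the Doc:
doc = nlp("This is a sentence. This is another one.")
assert len(list(doc.sents)) == 2
```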


@@ -88,7 +88,7 @@ class Tok2Vec(Pipe):
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
them to be used as features by downstream components.
-docs (Doc): The Doc to preocess.
+docs (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://spacy.io/api/tok2vec#call
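As the docstring says, `Tok2Vec` writes its output to `Doc.tensor`. A small sketch of what that looks like with the default `tok2vec` factory, mirroring the new tests added below:

```python
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("tok2vec")
nlp.begin_training()

doc = nlp("green eggs and ham")
# The tok2vec component fills Doc.tensor with one embedding row per token:
assert doc.tensor.shape[0] == len(doc)
```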


@@ -3,11 +3,18 @@ import pytest
from spacy.ml.models.tok2vec import build_Tok2Vec_model
from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed
from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold import Example
from spacy import util
from spacy.lang.en import English
from .util import get_batch
from thinc.api import Config
from numpy.testing import assert_equal

def test_empty_doc():
    width = 128
@@ -41,7 +48,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
            also_use_static_vectors=False,
            also_embed_subwords=True,
        ),
-        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3,),
+        MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3),
    )
    tok2vec.initialize()
    vectors, backprop = tok2vec.begin_update(batch)
@@ -74,3 +81,89 @@ def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_co
    assert len(vectors) == len(docs)
    assert vectors[0].shape == (len(docs[0]), width)
    backprop(vectors)

def test_init_tok2vec():
    # Simple test to initialize the default tok2vec
    nlp = English()
    tok2vec = nlp.add_pipe("tok2vec")
    assert tok2vec.listeners == []
    nlp.begin_training()

cfg_string = """
[nlp]
lang = "en"
pipeline = ["tok2vec","tagger"]

[components]

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode.width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
"""

TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]

def test_tok2vec_listener():
    orig_config = Config().from_str(cfg_string)
    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
    tagger_tok2vec = tagger.model.get_ref("tok2vec")
    assert isinstance(tok2vec, Tok2Vec)
    assert isinstance(tagger_tok2vec, Tok2VecListener)

    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
        for tag in t[1]["tags"]:
            tagger.add_label(tag)

    # Check that the Tok2Vec component finds its listeners
    assert tok2vec.listeners == []
    optimizer = nlp.begin_training(lambda: train_examples)
    assert tok2vec.listeners == [tagger_tok2vec]

    for i in range(5):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    doc = nlp("Running the pipeline as a whole.")
    doc_tensor = tagger_tok2vec.predict([doc])[0]
    assert_equal(doc.tensor, doc_tensor)

    # TODO: should this warn or error?
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]
    nlp("Running the pipeline with the Tok2Vec component disabled.")


@@ -346,13 +346,13 @@ in other components, see
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], FullTransformerBatch]~~ |
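The `tokenizer_config` row above is passed through as keyword arguments to `AutoTokenizer.from_pretrained`; roughly like the sketch below (the model name is illustrative, and `transformers` must be installed):

```python
from transformers import AutoTokenizer

# tokenizer_config = {"use_fast": true} in the config amounts to:
tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)
```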
-### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
+### spacy-transformers.TransformerListener.v1 {#TransformerListener}
> #### Example Config
>
> ```ini
> [model]
-> @architectures = "spacy-transformers.Tok2VecListener.v1"
+> @architectures = "spacy-transformers.TransformerListener.v1"
> grad_factor = 1.0
>
> [model.pooling]


@@ -225,7 +225,7 @@ transformers as subnetworks directly, you can also use them via the
![The processing pipeline with the transformer component](../images/pipeline_transformer.svg)
-By default, the `Transformer` component sets the
+The `Transformer` component sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformers outputs at runtime.
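Runtime access to that attribute looks like the sketch below, assuming a transformer-based pipeline is installed (the package name is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_trf")  # any pipeline with a transformer component
doc = nlp("Access the transformer outputs at runtime.")

# TransformerData holding the tokens, tensors and alignment for this Doc:
trf_data = doc._.trf_data
print(type(trf_data).__name__)
```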
@@ -303,7 +303,7 @@ component:
>
> ```python
> from spacy_transformers import Transformer, TransformerModel
-> from spacy_transformers.annotation_setters import configure_trfdata_setter
+> from spacy_transformers.annotation_setters import null_annotation_setter
> from spacy_transformers.span_getters import get_doc_spans
>
> trf = Transformer(
@@ -313,7 +313,7 @@ component:
> get_spans=get_doc_spans,
> tokenizer_config={"use_fast": True},
> ),
-> annotation_setter=configure_trfdata_setter(),
+> annotation_setter=null_annotation_setter,
> max_batch_items=4096,
> )
> ```
@@ -333,7 +333,7 @@ tokenizer_config = {"use_fast": true}
@span_getters = "doc_spans.v1"
[components.transformer.annotation_setter]
-@annotation_setters = "spacy-transformers.trfdata_setter.v1"
+@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
```
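For reference, `null_annotation_setter` simply ignores the transformer output instead of writing it anywhere. Any callable taking the batch of `Doc`s and the `FullTransformerBatch` can be passed as `annotation_setter` instead; a hypothetical sketch (the `custom_trf_data` extension name is invented for illustration):

```python
from typing import List
from spacy.tokens import Doc

Doc.set_extension("custom_trf_data", default=None)  # hypothetical extension

def custom_annotation_setter(docs: List[Doc], trf_data) -> None:
    # FullTransformerBatch.doc_data yields one TransformerData per Doc:
    for doc, data in zip(docs, trf_data.doc_data):
        doc._.custom_trf_data = data
```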


@@ -64,7 +64,7 @@ menu:
[`TransformerData`](/api/transformer#transformerdata),
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
- **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
-  [Tok2VecListener](/api/architectures#transformers-Tok2VecListener),
+  [TransformerListener](/api/architectures#TransformerListener),
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
- **Models:** [`en_core_trf_lg_sm`](/models/en)
- **Implementation:**