Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-14 05:37:03 +03:00)

Commit c38298b8fa: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -264,9 +264,9 @@ def train_while_improving(
     epoch (int): How many passes over the data have been completed.
     step (int): How many steps have been completed.
-    score (float): The main score form the last evaluation.
+    score (float): The main score from the last evaluation.
     other_scores: : The other scores from the last evaluation.
-    loss: The accumulated losses throughout training.
+    losses: The accumulated losses throughout training.
     checkpoints: A list of previous results, where each result is a
         (score, step, epoch) tuple.
     """
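For orientation, a minimal sketch of a consumer of the `info` dict this docstring documents. The key names mirror the docstring (and the table in the training docs further down); the function itself is hypothetical.

```python
from typing import Any, Dict

def describe_step(info: Dict[str, Any]) -> str:
    # Keys as documented above: epoch, step, score, other_scores, losses, checkpoints
    losses = ", ".join(f"{name}={loss:.2f}" for name, loss in info["losses"].items())
    return f"epoch {info['epoch']}, step {info['step']}: score {info['score']:.3f} ({losses})"
```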
@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")


 @add_codes
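The warning text points users at `nlp.pipe`. A minimal sketch of that migration, assuming an existing pipeline `nlp`, a configured `matcher`, and a hypothetical `texts` iterable:

```python
# Deprecated pattern (now emits W105 as a DeprecationWarning):
#     for doc in matcher.pipe(docs, batch_size=50):
#         ...
# Replacement suggested by the warning: stream with nlp.pipe, match per Doc.
for doc in nlp.pipe(texts):
    matches = matcher(doc)
```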
@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])

     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in docs:
                 matches = self(doc)
@@ -203,13 +195,16 @@ cdef class Matcher:
             else:
                 yield doc

-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
         """Find all token sequences matching the supplied pattern.

         doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -262,6 +257,9 @@ cdef class Matcher:
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
             return final_matches

     def _normalize_key(self, key):
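A short, self-contained sketch of the new `as_spans` behavior added in this hunk; the pattern and text are illustrative.

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("Hello world, again!")

matches = matcher(doc)               # [(match_id, start, end), ...]
spans = matcher(doc, as_spans=True)  # [Span, ...], match_id becomes the label
assert spans[0].text == "Hello world"
assert spans[0].label_ == "HELLO_WORLD"
```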
@@ -7,6 +7,7 @@ import warnings
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t

 from ..schemas import TokenPattern
@@ -216,13 +217,16 @@ cdef class PhraseMatcher:
             result = internal_node
         map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)

-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.

         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.

         DOCS: https://spacy.io/api/phrasematcher#call
         """
@@ -239,6 +243,9 @@ cdef class PhraseMatcher:
             on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
             return matches

     cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
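The same option on the `PhraseMatcher`, sketched to show what it saves you: with tuples you resolve the `match_id` hash through the string store yourself, while `as_spans=True` hands you labeled `Span` objects directly. The terminology pattern is illustrative.

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp.make_doc("Barack Obama")])
doc = nlp("Barack Obama was the 44th president of the United States")

for match_id, start, end in matcher(doc):  # tuple API: resolve the hash yourself
    print(nlp.vocab.strings[match_id], doc[start:end].text)

for span in matcher(doc, as_spans=True):   # new in v3.0: labeled Span objects
    print(span.label_, span.text)
```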
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
                 idx += 1

     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in stream:
                 matches = self(doc)
@@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span

 from ..doc.test_underscore import clean_underscore  # noqa: F401
@@ -469,3 +470,26 @@ def test_matcher_span(matcher):
     assert len(matcher(doc)) == 2
     assert len(matcher(span_js)) == 1
     assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
@@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc
@@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
     # clunky way to vaguely check that callback is unpickled
     (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
     assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+        assert record.list
+        assert "spaCy v3.0" in str(record.list[0].message)
@@ -119,9 +119,9 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
 argument that connects to the shared `tok2vec` component in the pipeline.

 | Name | Description |
-| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
-| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
+| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |

 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
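For reference, a hedged config excerpt showing the `upstream` setting described above; the pipeline layout (a shared `tok2vec` component feeding a `tagger`) and the interpolated `width` reference are assumptions in the style of the spaCy v3 config templates.

```ini
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"
```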
@@ -323,11 +323,11 @@ for details and system requirements.

 Load and wrap a transformer model from the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library. You
-can any transformer that has pretrained weights and a PyTorch implementation.
-The `name` variable is passed through to the underlying library, so it can be
-either a string or a path. If it's a string, the pretrained weights will be
-downloaded via the transformers library if they are not already available
-locally.
+can use any transformer that has pretrained weights and a PyTorch
+implementation. The `name` variable is passed through to the underlying library,
+so it can be either a string or a path. If it's a string, the pretrained weights
+will be downloaded via the transformers library if they are not already
+available locally.

 In order to support longer documents, the
 [TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
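A hedged illustration of the `name` argument described above; `bert-base-uncased` is just an example HuggingFace model identifier, and the local path is hypothetical.

```ini
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
# A string name is downloaded via the transformers library if not cached:
name = "bert-base-uncased"
# Alternatively, a path loads local weights:
# name = "/path/to/local-weights"
```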
@@ -117,30 +117,11 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > ```

 | Name | Description |
-| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
+| _keyword-only_ | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
@@ -58,9 +58,11 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > ```

 | Name | Description |
-| ----------- | ----------------------------------- |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doc` | The document to match over. ~~Doc~~ |
-| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| _keyword-only_ | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
@@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]

 </Infobox>

-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name | Description |
-| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
-
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}

 Get the number of rules added to the matcher. Note that this only returns the
@@ -4,6 +4,7 @@ menu:
   - ['spacy', 'spacy']
   - ['displacy', 'displacy']
   - ['registry', 'registry']
+  - ['Loggers', 'loggers']
   - ['Batchers', 'batchers']
   - ['Data & Alignment', 'gold']
   - ['Utility Functions', 'util']
@@ -316,6 +317,7 @@ factories.
 | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
 | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
+| `loggers` | Registry for functions that log [training results](/usage/training). |
 | `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
 | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
 | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
@@ -340,7 +342,7 @@ See the [`Transformer`](/api/transformer) API reference and
 > def annotation_setter(docs, trf_data) -> None:
 >     # Set annotations on the docs
 >
-> return annotation_sette
+> return annotation_setter
 > ```

 | Registry name | Description |
@@ -348,6 +350,110 @@ See the [`Transformer`](/api/transformer) API reference and
 | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
 | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |

+## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
+
+A logger records the training results. When a logger is created, two functions
+are returned: one for logging the information for each training step, and a
+second function that is called to finalize the logging when the training is
+finished. To log each training step, a
+[dictionary](/usage/training#custom-logging) is passed on from
+[`spacy train`](/api/cli#train), including information such as the training loss
+and the accuracy scores on the development set.
+
+There are two built-in logging functions: a logger printing results to the
+console in tabular format (which is the default), and one that also sends the
+results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
+using one of the built-in loggers listed here, you can also
+[implement your own](/usage/training#custom-logging).
+
+#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.ConsoleLogger.v1"
+> ```
+
+Writes the results of a training step to the console in a tabular format.
+
+<Accordion title="Example console output" spaced>
+
+```cli
+$ python -m spacy train config.cfg
+```
+
+```
+ℹ Using CPU
+ℹ Loading config and nlp from: config.cfg
+ℹ Pipeline: ['tok2vec', 'tagger']
+ℹ Start training
+ℹ Training. Initial learn rate: 0.0
+
+E     #       LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
+---   ------  ------------   -----------   -------   ------
+  1       0           0.00         86.20      0.22     0.00
+  1     200           3.08      18968.78     34.00     0.34
+  1     400          31.81      22539.06     33.64     0.34
+  1     600          92.13      22794.91     43.80     0.44
+  1     800         183.62      21541.39     56.05     0.56
+  1    1000         352.49      25461.82     65.15     0.65
+  1    1200         422.87      23708.82     71.84     0.72
+  1    1400         601.92      24994.79     76.57     0.77
+  1    1600         662.57      22268.02     80.20     0.80
+  1    1800        1101.50      28413.77     82.56     0.83
+  1    2000        1253.43      28736.36     85.00     0.85
+  1    2200        1411.02      28237.53     87.42     0.87
+  1    2400        1605.35      28439.95     88.70     0.89
+```
+
+Note that the cumulative loss keeps increasing within one epoch, but should
+start decreasing across epochs.
+
+</Accordion>
+
+#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
+
+> #### Installation
+>
+> ```bash
+> $ pip install wandb
+> $ wandb login
+> ```
+
+Built-in logger that sends the results of each training step to the dashboard of
+the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
+& Biases should be installed, and you should be logged in. The logger will send
+the full config file to W&B, as well as various system information such as
+memory utilization, network traffic, disk IO, GPU statistics, etc. This will
+also include information such as your hostname and operating system, as well as
+the location of your Python executable.
+
+<Infobox variant="warning">
+
+Note that by default, the full (interpolated)
+[training config](/usage/training#config) is sent over to the W&B dashboard. If
+you prefer to **exclude certain information** such as path names, you can list
+those fields in "dot notation" in the `remove_config_values` parameter. These
+fields will then be removed from the config before uploading, but will otherwise
+remain in the config file stored on your local system.
+
+</Infobox>
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.WandbLogger.v1"
+> project_name = "monitor_spacy_training"
+> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
+> ```
+
+| Name | Description |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
+| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
+
 ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}

 A data batcher implements a batching strategy that essentially turns a stream of
@@ -25,8 +25,8 @@ work out-of-the-box.

 </Infobox>

-This pipeline component lets you use transformer models in your pipeline.
-Supports all models that are available via the
+This pipeline component lets you use transformer models in your pipeline. It
+supports all models that are available via the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library.
 Usually you will connect subsequent components to the shared transformer using
 the [TransformerListener](/api/architectures#TransformerListener) layer. This
@@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). See the
-[model architectures](/api/architectures) documentation for details on the
-architectures and their arguments and hyperparameters.
+[model architectures](/api/architectures#transformers) documentation for details
+on the transformer architectures and their arguments and hyperparameters.

 > #### Example
 >
@@ -62,9 +62,9 @@ architectures and their arguments and hyperparameters.
 > ```

 | Setting | Description |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |

 ```python
@@ -103,10 +103,10 @@ your application, you would normally use a shortcut for this and instantiate the
 component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).

 | Name | Description |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | The shared vocabulary. ~~Vocab~~ |
 | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
 | _keyword-only_ | |
 | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
@@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors
 are wrapped into the
 [FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
 `FullTransformerBatch` then splits out the per-document data, which is handled
-by this class. Instances of this class
-are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes)
-extension attribute.
+by this class. Instances of this class are typically assigned to the
+[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.

 | Name | Description |
 | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -447,8 +446,9 @@ overlap, and you can also omit sections of the Doc if they are not relevant.

 Span getters can be referenced in the `[components.transformer.model.get_spans]`
 block of the config to customize the sequences processed by the transformer. You
-can also register custom span getters using the `@spacy.registry.span_getters`
-decorator.
+can also register
+[custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
+using the `@spacy.registry.span_getters` decorator.

 > #### Example
 >
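A sketch of registering a custom span getter as described above. The `@spacy.registry.span_getters` decorator comes from the passage itself; the name `paragraph_spans.v1` and the fixed-window segmentation are illustrative assumptions.

```python
import spacy

@spacy.registry.span_getters("paragraph_spans.v1")  # hypothetical name
def configure_paragraph_spans(max_length: int):
    def get_spans(docs):
        # One list of spans per doc: fixed-size windows here, standing in for
        # whatever segmentation (sentences, paragraphs, ...) you actually need.
        return [
            [doc[i : i + max_length] for i in range(0, len(doc), max_length)]
            for doc in docs
        ]
    return get_spans
```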
@@ -518,7 +518,7 @@ right context.

 ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

-Annotation setters are functions that that take a batch of `Doc` objects and a
+Annotation setters are functions that take a batch of `Doc` objects and a
 [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
 additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
 You can register custom annotation setters using the
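A sketch of what such a custom annotation setter could look like. The registry is the `annotation_setters` registry listed in the table earlier; that it is exposed as `spacy.registry.annotation_setters`, the name `store_hidden_state.v1`, and the per-document `doc_data` access are all assumptions for illustration.

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("last_hidden_state", default=None)

@spacy.registry.annotation_setters("store_hidden_state.v1")  # hypothetical name
def configure_store_hidden_state():
    def setter(docs, trf_data):
        # trf_data is a FullTransformerBatch; the docs above say it splits out
        # per-document data, stored here on a custom attribute (illustrative).
        for doc, data in zip(docs, trf_data.doc_data):
            doc._.last_hidden_state = data.tensors[-1]
    return setter
```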
@@ -552,5 +552,5 @@ The component sets the following
 [custom extension attributes](/usage/processing-pipeline#custom-components-attributes):

 | Name | Description |
-| -------------- | ------------------------------------------------------------------------ |
-| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
+| ---------------- | ------------------------------------------------------------------------ |
+| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
@@ -251,13 +251,14 @@ for doc in nlp.pipe(["some text", "some other text"]):
     tokvecs = doc._.trf_data.tensors[-1]
 ```

-You can customize how the [`Transformer`](/api/transformer) component sets
-annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`.
-This callback will be called with the raw input and output data for the whole
-batch, along with the batch of `Doc` objects, allowing you to implement whatever
-you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
-objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
-containing the transformers data for the batch.
+You can also customize how the [`Transformer`](/api/transformer) component sets
+annotations onto the [`Doc`](/api/doc), by specifying a custom
+`annotation_setter`. This callback will be called with the raw input and output
+data for the whole batch, along with the batch of `Doc` objects, allowing you to
+implement whatever you need. The annotation setter is called with a batch of
+[`Doc`](/api/doc) objects and a
+[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
+transformers data for the batch.

 ```python
 def custom_annotation_setter(docs, trf_data):
@@ -914,4 +914,4 @@ mattis pretium.

 ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />

-<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? -->
+<!-- TODO: link to WandB logger, explain that it's built-in but that you can also do other cool stuff with WandB? And then include example project (still need to decide what we want to do here) -->
@@ -493,6 +493,39 @@ you prefer.
 | `i` | Index of the current match (`matches[i]`). ~~int~~ |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |

+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}

 Let's say your data also contains some annoying pre-processing artifacts, like
@@ -823,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```

-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
-
 ## Efficient phrase matching {#phrasematcher}

 If you need to match large terminology lists, you can also use the
@@ -605,6 +605,68 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Custom logging function {#custom-logging}
+
+During training, the results of each step are passed to a logger function. By
+default, these results are written to the console with the
+[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
+for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
+[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
+**dictionary** with the following keys:
+
+| Key | Value |
+| -------------- | ---------------------------------------------------------------------------------------------- |
+| `epoch` | How many passes over the data have been completed. ~~int~~ |
+| `step` | How many steps have been completed. ~~int~~ |
+| `score` | The main score from the last evaluation, measured on the dev set. ~~float~~ |
+| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ |
+| `losses` | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ |
+| `checkpoints` | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
+
+You can easily implement and plug in your own logger that records the training
+results in a custom way, or sends them to an experiment management tracker of
+your choice. In this example, the function `my_custom_logger.v1` writes the
+tabular results to a file:
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training.logger]
+> @loggers = "my_custom_logger.v1"
+> log_path = "my_file.tab"
+> ```
+
+```python
+### functions.py
+from typing import Tuple, Callable, Dict, Any
+import spacy
+from pathlib import Path
+
+@spacy.registry.loggers("my_custom_logger.v1")
+def custom_logger(log_path):
+    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
+        with Path(log_path).open("w") as file_:
+            file_.write("step\t")
+            file_.write("score\t")
+            for pipe in nlp.pipe_names:
+                file_.write(f"loss_{pipe}\t")
+            file_.write("\n")
+
+        def log_step(info: Dict[str, Any]):
+            with Path(log_path).open("a") as file_:
+                file_.write(f"{info['step']}\t")
+                file_.write(f"{info['score']}\t")
+                for pipe in nlp.pipe_names:
+                    file_.write(f"{info['losses'][pipe]}\t")
+                file_.write("\n")
+
+        def finalize():
+            pass
+
+        return log_step, finalize
+
+    return setup_logger
+```
+
 #### Example: Custom batch size schedule {#custom-code-schedule}

 For example, let's say you've implemented your own batch size schedule to use
@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse` | [`Example`](/api/example) |
 | `GoldCorpus` | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
 | `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |