Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit c38298b8fa (mirror of https://github.com/explosion/spaCy.git)
@@ -264,9 +264,9 @@ def train_while_improving(
         epoch (int): How many passes over the data have been completed.
         step (int): How many steps have been completed.
-        score (float): The main score form the last evaluation.
+        score (float): The main score from the last evaluation.
         other_scores: : The other scores from the last evaluation.
-        loss: The accumulated losses throughout training.
+        losses: The accumulated losses throughout training.
         checkpoints: A list of previous results, where each result is a
             (score, step, epoch) tuple.
     """
@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")
 
 
 @add_codes
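The migration that W105 points to replaces `Matcher.pipe` with `nlp.pipe` plus a per-`Doc` call. A minimal sketch of the before/after, assuming a blank pipeline and an illustrative pattern that is not part of this commit:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])

texts = ["hello world", "HELLO there", "goodbye"]
# Deprecated in v3.0: for doc in matcher.pipe(nlp.pipe(texts)): ...
# Instead, stream the texts through nlp.pipe and match on each Doc:
for doc in nlp.pipe(texts):
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
```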
@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])
 
     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in docs:
                 matches = self(doc)
@@ -203,13 +195,16 @@ cdef class Matcher:
             else:
                 yield doc
 
-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
         """Find all token sequences matching the supplied pattern.
 
         doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -262,7 +257,10 @@ cdef class Matcher:
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
-        return final_matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
+            return final_matches
 
     def _normalize_key(self, key):
         if isinstance(key, basestring):
@@ -7,6 +7,7 @@ import warnings
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t
 
 from ..schemas import TokenPattern
@@ -216,13 +217,16 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.
 
         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
 
         DOCS: https://spacy.io/api/phrasematcher#call
         """
@@ -239,7 +243,10 @@ cdef class PhraseMatcher:
             on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        return matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
+            return matches
 
     cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
         cdef MapStruct* current_node = self.c_map
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
                 idx += 1
 
     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in stream:
                 matches = self(doc)
@@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span
 
 from ..doc.test_underscore import clean_underscore  # noqa: F401
 
@@ -469,3 +470,26 @@ def test_matcher_span(matcher):
     assert len(matcher(doc)) == 2
     assert len(matcher(span_js)) == 1
     assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+    assert record.list
+    assert "spaCy v3.0" in str(record.list[0].message)
@@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc
 
 
@@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
     # clunky way to vaguely check that callback is unpickled
     (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
     assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+    assert record.list
+    assert "spaCy v3.0" in str(record.list[0].message)
@@ -118,11 +118,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
 [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
 argument that connects to the shared `tok2vec` component in the pipeline.
 
-| Name        | Description |
-| ----------- | ----------- |
-| `width`     | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
-| `upstream`  | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name        | Description |
+| ----------- | ----------- |
+| `width`     | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
+| `upstream`  | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
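To make the listener wiring concrete, a hedged config sketch; the `tagger` component name and the interpolated width path are illustrative assumptions, not part of this commit:

```ini
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
# "*" connects to whichever upstream Tok2Vec component is in the pipeline
width = ${components.tok2vec.model.encode.width}
upstream = "*"
```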
@@ -323,11 +323,11 @@ for details and system requirements.
 
 Load and wrap a transformer model from the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library. You
-can any transformer that has pretrained weights and a PyTorch implementation.
-The `name` variable is passed through to the underlying library, so it can be
-either a string or a path. If it's a string, the pretrained weights will be
-downloaded via the transformers library if they are not already available
-locally.
+can use any transformer that has pretrained weights and a PyTorch
+implementation. The `name` variable is passed through to the underlying library,
+so it can be either a string or a path. If it's a string, the pretrained weights
+will be downloaded via the transformers library if they are not already
+available locally.
 
 In order to support longer documents, the
 [TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
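As a sketch of how the `name` argument is typically set in a config block; the architecture version and the model name below are assumptions for illustration:

```ini
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
# A string is resolved by the transformers library (downloaded if needed);
# a local path to saved weights also works here.
name = "bert-base-uncased"
```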
@@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```
 
-| Name        | Description |
-| ----------- | ----------- |
-| `doclike`   | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doclike`                             | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
@@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > matches = matcher(doc)
 > ```
 
-| Name        | Description |
-| ----------- | ----------- |
-| `doc`       | The document to match over. ~~Doc~~ |
-| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doc`                                 | The document to match over. ~~Doc~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
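A minimal sketch of the updated call from user code; the pattern phrase and text are invented for illustration:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])
doc = nlp("Barack Obama was the 44th president")
# With as_spans=True, the matcher returns labeled Span objects
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)
```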
@@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
 
 </Infobox>
 
-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
-
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
 
 Get the number of rules added to the matcher. Note that this only returns the
@@ -4,6 +4,7 @@ menu:
   - ['spacy', 'spacy']
   - ['displacy', 'displacy']
   - ['registry', 'registry']
+  - ['Loggers', 'loggers']
   - ['Batchers', 'batchers']
   - ['Data & Alignment', 'gold']
   - ['Utility Functions', 'util']
@@ -316,6 +317,7 @@ factories.
 | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
 | `languages`    | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `layers`       | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
+| `loggers`      | Registry for functions that log [training results](/usage/training). |
 | `lookups`      | Registry for large lookup tables available via `vocab.lookups`. |
 | `losses`       | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
 | `optimizers`   | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
@@ -340,7 +342,7 @@ See the [`Transformer`](/api/transformer) API reference and
 >     def annotation_setter(docs, trf_data) -> None:
 >         # Set annotations on the docs
 >
->     return annotation_sette
+>     return annotation_setter
 > ```
 
 | Registry name | Description |
@@ -348,6 +350,110 @@ See the [`Transformer`](/api/transformer) API reference and
 | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
 | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
 
+## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
+
+A logger records the training results. When a logger is created, two functions
+are returned: one for logging the information for each training step, and a
+second function that is called to finalize the logging when the training is
+finished. To log each training step, a
+[dictionary](/usage/training#custom-logging) is passed on from the
+[`spacy train`](/api/cli#train), including information such as the training loss
+and the accuracy scores on the development set.
+
+There are two built-in logging functions: a logger printing results to the
+console in tabular format (which is the default), and one that also sends the
+results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
+using one of the built-in loggers listed here, you can also
+[implement your own](/usage/training#custom-logging).
+
+#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.ConsoleLogger.v1"
+> ```
+
+Writes the results of a training step to the console in a tabular format.
+
+<Accordion title="Example console output" spaced>
+
+```cli
+$ python -m spacy train config.cfg
+```
+
+```
+ℹ Using CPU
+ℹ Loading config and nlp from: config.cfg
+ℹ Pipeline: ['tok2vec', 'tagger']
+ℹ Start training
+ℹ Training. Initial learn rate: 0.0
+
+E     #       LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
+---   ------  ------------   -----------   -------   ------
+  1       0           0.00         86.20      0.22     0.00
+  1     200           3.08      18968.78     34.00     0.34
+  1     400          31.81      22539.06     33.64     0.34
+  1     600          92.13      22794.91     43.80     0.44
+  1     800         183.62      21541.39     56.05     0.56
+  1    1000         352.49      25461.82     65.15     0.65
+  1    1200         422.87      23708.82     71.84     0.72
+  1    1400         601.92      24994.79     76.57     0.77
+  1    1600         662.57      22268.02     80.20     0.80
+  1    1800        1101.50      28413.77     82.56     0.83
+  1    2000        1253.43      28736.36     85.00     0.85
+  1    2200        1411.02      28237.53     87.42     0.87
+  1    2400        1605.35      28439.95     88.70     0.89
+```
+
+Note that the cumulative loss keeps increasing within one epoch, but should
+start decreasing across epochs.
+
+</Accordion>
+
+#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
+
+> #### Installation
+>
+> ```bash
+> $ pip install wandb
+> $ wandb login
+> ```
+
+Built-in logger that sends the results of each training step to the dashboard of
+the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
+& Biases should be installed, and you should be logged in. The logger will send
+the full config file to W&B, as well as various system information such as
+memory utilization, network traffic, disk IO, GPU statistics, etc. This will
+also include information such as your hostname and operating system, as well as
+the location of your Python executable.
+
+<Infobox variant="warning">
+
+Note that by default, the full (interpolated)
+[training config](/usage/training#config) is sent over to the W&B dashboard. If
+you prefer to **exclude certain information** such as path names, you can list
+those fields in "dot notation" in the `remove_config_values` parameter. These
+fields will then be removed from the config before uploading, but will otherwise
+remain in the config file stored on your local system.
+
+</Infobox>
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.WandbLogger.v1"
+> project_name = "monitor_spacy_training"
+> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
+> ```
+
+| Name                   | Description |
+| ---------------------- | ----------- |
+| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
+| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
+
 ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
 
 A data batcher implements a batching strategy that essentially turns a stream of
@@ -25,8 +25,8 @@ work out-of-the-box.
 
 </Infobox>
 
-This pipeline component lets you use transformer models in your pipeline.
-Supports all models that are available via the
+This pipeline component lets you use transformer models in your pipeline. It
+supports all models that are available via the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library.
 Usually you will connect subsequent components to the shared transformer using
 the [TransformerListener](/api/architectures#TransformerListener) layer. This
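A hedged sketch of that listener wiring in a config; the `ner` component and the mean pooling layer are illustrative assumptions, not part of this commit:

```ini
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```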
@@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). See the
-[model architectures](/api/architectures) documentation for details on the
-architectures and their arguments and hyperparameters.
+[model architectures](/api/architectures#transformers) documentation for details
+on the transformer architectures and their arguments and hyperparameters.
 
 > #### Example
 >
@@ -61,11 +61,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```
 
-| Setting             | Description |
-| ------------------- | ----------- |
-| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
+| Setting             | Description |
+| ------------------- | ----------- |
+| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
 
 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In
 your application, you would normally use a shortcut for this and instantiate the
 component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name                | Description |
-| ------------------- | ----------- |
-| `vocab`             | The shared vocabulary. ~~Vocab~~ |
-| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| _keyword-only_      | |
-| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
+| Name                | Description |
+| ------------------- | ----------- |
+| `vocab`             | The shared vocabulary. ~~Vocab~~ |
+| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| _keyword-only_      | |
+| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
 
 ## Transformer.\_\_call\_\_ {#call tag="method"}
@@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors
 are wrapped into the
 [FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
 `FullTransformerBatch` then splits out the per-document data, which is handled
-by this class. Instances of this class
-are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes)
-extension attribute.
+by this class. Instances of this class are typically assigned to the
+[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
 
 | Name      | Description |
 | --------- | ----------- |
@@ -447,8 +446,9 @@ overlap, and you can also omit sections of the Doc if they are not relevant.
 
 Span getters can be referenced in the `[components.transformer.model.get_spans]`
 block of the config to customize the sequences processed by the transformer. You
-can also register custom span getters using the `@spacy.registry.span_getters`
-decorator.
+can also register
+[custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
+using the `@spacy.registry.span_getters` decorator.
 
 > #### Example
 >
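A hedged sketch of such a registered span getter; the registry name `custom_sent_spans` and the sentence-based strategy are assumptions for illustration:

```python
import spacy

@spacy.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    # Process one Span per sentence, so no sequence exceeds sentence length
    def get_sent_spans(docs):
        return [list(doc.sents) for doc in docs]
    return get_sent_spans
```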
@@ -518,7 +518,7 @@ right context.
 
 ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
 
-Annotation setters are functions that that take a batch of `Doc` objects and a
+Annotation setters are functions that take a batch of `Doc` objects and a
 [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
 additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
 You can register custom annotation setters using the
|
|||
The component sets the following
|
||||
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------ |
|
||||
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||
| Name | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------ |
|
||||
| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||
|
|
|
@@ -251,13 +251,14 @@ for doc in nlp.pipe(["some text", "some other text"]):
     tokvecs = doc._.trf_data.tensors[-1]
 ```
 
-You can customize how the [`Transformer`](/api/transformer) component sets
-annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`.
-This callback will be called with the raw input and output data for the whole
-batch, along with the batch of `Doc` objects, allowing you to implement whatever
-you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
-objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
-containing the transformers data for the batch.
+You can also customize how the [`Transformer`](/api/transformer) component sets
+annotations onto the [`Doc`](/api/doc), by specifying a custom
+`annotation_setter`. This callback will be called with the raw input and output
+data for the whole batch, along with the batch of `Doc` objects, allowing you to
+implement whatever you need. The annotation setter is called with a batch of
+[`Doc`](/api/doc) objects and a
+[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
+transformers data for the batch.
 
 ```python
 def custom_annotation_setter(docs, trf_data):
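The hunk cuts off at the function definition; a hedged sketch of how such a setter might continue (the extension name `custom_attr` is an assumption for illustration):

```python
from spacy.tokens import Doc

# Hypothetical custom extension to hold the per-doc transformer output
Doc.set_extension("custom_attr", default=None)

def custom_annotation_setter(docs, trf_data):
    # Split the FullTransformerBatch into per-Doc data and store it
    doc_data = list(trf_data.doc_data)
    for doc, data in zip(docs, doc_data):
        doc._.custom_attr = data
```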
@@ -914,4 +914,4 @@ mattis pretium.
 
 ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
 
-<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? -->
+<!-- TODO: link to WandB logger, explain that it's built-in but that you can also do other cool stuff with WandB? And then include example project (still need to decide what we want to do here) -->
@@ -493,6 +493,39 @@ you prefer.
 | `i`       | Index of the current match (`matches[i`]). ~~int~~ |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~ List[Tuple[int, int int]]~~ |
 
+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}
 
 Let's say your data also contains some annoying pre-processing artifacts, like
@@ -823,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```
 
-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
-
 ## Efficient phrase matching {#phrasematcher}
 
 If you need to match large terminology lists, you can also use the
@@ -605,6 +605,68 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Custom logging function {#custom-logging}
+
+During training, the results of each step are passed to a logger function. By
+default, these results are written to the console with the
+[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
+for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
+[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
+**dictionary** with the following keys:
+
+| Key            | Value |
+| -------------- | ----- |
+| `epoch`        | How many passes over the data have been completed. ~~int~~ |
+| `step`         | How many steps have been completed. ~~int~~ |
+| `score`        | The main score from the last evaluation, measured on the dev set. ~~float~~ |
+| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ |
+| `losses`       | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ |
+| `checkpoints`  | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
+
+You can easily implement and plug in your own logger that records the training
+results in a custom way, or sends them to an experiment management tracker of
+your choice. In this example, the function `my_custom_logger.v1` writes the
+tabular results to a file:
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training.logger]
+> @loggers = "my_custom_logger.v1"
+> log_path = "my_file.tab"
+> ```
+
+```python
+### functions.py
+from typing import Tuple, Callable, Dict, Any
+import spacy
+from pathlib import Path
+
+@spacy.registry.loggers("my_custom_logger.v1")
+def custom_logger(log_path):
+    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
+        with Path(log_path).open("w") as file_:
+            file_.write("step\\t")
+            file_.write("score\\t")
+            for pipe in nlp.pipe_names:
+                file_.write(f"loss_{pipe}\\t")
+            file_.write("\\n")
+
+        def log_step(info: Dict[str, Any]):
+            with Path(log_path).open("a") as file_:
+                file_.write(f"{info['step']}\\t")
+                file_.write(f"{info['score']}\\t")
+                for pipe in nlp.pipe_names:
+                    file_.write(f"{info['losses'][pipe]}\\t")
+                file_.write("\\n")
+
+        def finalize():
+            pass
+
+        return log_step, finalize
+
+    return setup_logger
+```
+
 #### Example: Custom batch size schedule {#custom-code-schedule}
 
 For example, let's say you've implemented your own batch size schedule to use
@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse`                                     | [`Example`](/api/example) |
 | `GoldCorpus`                                    | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe`            | not needed |
 | `spacy init-model`                              | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data`                              | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile`                                 | [`spacy debug profile`](/api/cli#debug-profile) |