Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2020-08-31 19:55:55 +02:00
commit c38298b8fa
16 changed files with 345 additions and 140 deletions

View File

@ -264,9 +264,9 @@ def train_while_improving(
epoch (int): How many passes over the data have been completed. epoch (int): How many passes over the data have been completed.
step (int): How many steps have been completed. step (int): How many steps have been completed.
score (float): The main score form the last evaluation. score (float): The main score from the last evaluation.
other_scores: The other scores from the last evaluation. other_scores: The other scores from the last evaluation.
loss: The accumulated losses throughout training. losses: The accumulated losses throughout training.
checkpoints: A list of previous results, where each result is a checkpoints: A list of previous results, where each result is a
(score, step, epoch) tuple. (score, step, epoch) tuple.
""" """

View File

@ -112,6 +112,9 @@ class Warnings:
"word segmenters: {supported}. Defaulting to {default}.") "word segmenters: {supported}. Defaulting to {default}.")
W104 = ("Skipping modifications for '{target}' segmenter. The current " W104 = ("Skipping modifications for '{target}' segmenter. The current "
"segmenter is '{current}'.") "segmenter is '{current}'.")
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
"need to match on a stream of documents, you can use nlp.pipe and "
"call the {matcher} on each Doc object.")
@add_codes @add_codes

View File

@ -176,18 +176,10 @@ cdef class Matcher:
return (self._callbacks[key], self._patterns[key]) return (self._callbacks[key], self._patterns[key])
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn. Deprecated as of
spaCy v3.0.
docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
""" """
warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
if as_tuples: if as_tuples:
for doc, context in docs: for doc, context in docs:
matches = self(doc) matches = self(doc)
@ -203,13 +195,16 @@ cdef class Matcher:
else: else:
yield doc yield doc
def __call__(self, object doclike): def __call__(self, object doclike, *, as_spans=False):
"""Find all token sequences matching the supplied pattern. """Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over. doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples, as_spans (bool): Return Span objects with labels instead of (match_id,
start, end) tuples.
RETURNS (list): A list of `(match_id, start, end)` tuples,
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `match_id` is an integer. If as_spans is set
to True, a list of Span objects is returned.
""" """
if isinstance(doclike, Doc): if isinstance(doclike, Doc):
doc = doclike doc = doclike
@ -262,7 +257,10 @@ cdef class Matcher:
on_match = self._callbacks.get(key, None) on_match = self._callbacks.get(key, None)
if on_match is not None: if on_match is not None:
on_match(self, doc, i, final_matches) on_match(self, doc, i, final_matches)
return final_matches if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in final_matches]
else:
return final_matches
def _normalize_key(self, key): def _normalize_key(self, key):
if isinstance(key, basestring): if isinstance(key, basestring):

View File

@ -7,6 +7,7 @@ import warnings
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
from ..structs cimport TokenC from ..structs cimport TokenC
from ..tokens.token cimport Token from ..tokens.token cimport Token
from ..tokens.span cimport Span
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
from ..schemas import TokenPattern from ..schemas import TokenPattern
@ -216,13 +217,16 @@ cdef class PhraseMatcher:
result = internal_node result = internal_node
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL) map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
def __call__(self, doc): def __call__(self, doc, *, as_spans=False):
"""Find all sequences matching the supplied patterns on the `Doc`. """Find all sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over. doc (Doc): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples, as_spans (bool): Return Span objects with labels instead of (match_id,
start, end) tuples.
RETURNS (list): A list of `(match_id, start, end)` tuples,
describing the matches. A match tuple describes a span describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers. `doc[start:end]`. The `match_id` is an integer. If as_spans is set
to True, a list of Span objects is returned.
DOCS: https://spacy.io/api/phrasematcher#call DOCS: https://spacy.io/api/phrasematcher#call
""" """
@ -239,7 +243,10 @@ cdef class PhraseMatcher:
on_match = self._callbacks.get(self.vocab.strings[ent_id]) on_match = self._callbacks.get(self.vocab.strings[ent_id])
if on_match is not None: if on_match is not None:
on_match(self, doc, i, matches) on_match(self, doc, i, matches)
return matches if as_spans:
return [Span(doc, start, end, label=key) for key, start, end in matches]
else:
return matches
cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil: cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
cdef MapStruct* current_node = self.c_map cdef MapStruct* current_node = self.c_map
@ -285,20 +292,10 @@ cdef class PhraseMatcher:
idx += 1 idx += 1
def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False): def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn. """Match a stream of documents, yielding them in turn. Deprecated as of
spaCy v3.0.
docs (iterable): A stream of documents.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
and yield (result, context) tuples out.
If both return_matches and as_tuples are True, the output will
be a sequence of ((doc, matches), context) tuples.
YIELDS (Doc): Documents, in order.
DOCS: https://spacy.io/api/phrasematcher#pipe
""" """
warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
if as_tuples: if as_tuples:
for doc, context in stream: for doc, context in stream:
matches = self(doc) matches = self(doc)

View File

@ -2,7 +2,8 @@ import pytest
import re import re
from mock import Mock from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token from spacy.tokens import Doc, Token, Span
from ..doc.test_underscore import clean_underscore # noqa: F401 from ..doc.test_underscore import clean_underscore # noqa: F401
@ -469,3 +470,26 @@ def test_matcher_span(matcher):
assert len(matcher(doc)) == 2 assert len(matcher(doc)) == 2
assert len(matcher(span_js)) == 1 assert len(matcher(span_js)) == 1
assert len(matcher(span_java)) == 1 assert len(matcher(span_java)) == 1
def test_matcher_as_spans(matcher):
"""Test the new as_spans=True API."""
text = "JavaScript is good but Java is better"
doc = Doc(matcher.vocab, words=text.split())
matches = matcher(doc, as_spans=True)
assert len(matches) == 2
assert isinstance(matches[0], Span)
assert matches[0].text == "JavaScript"
assert matches[0].label_ == "JS"
assert isinstance(matches[1], Span)
assert matches[1].text == "Java"
assert matches[1].label_ == "Java"
def test_matcher_deprecated(matcher):
doc = Doc(matcher.vocab, words=["hello", "world"])
with pytest.warns(DeprecationWarning) as record:
for _ in matcher.pipe([doc]):
pass
assert record.list
assert "spaCy v3.0" in str(record.list[0].message)

View File

@ -2,7 +2,7 @@ import pytest
import srsly import srsly
from mock import Mock from mock import Mock
from spacy.matcher import PhraseMatcher from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc from spacy.tokens import Doc, Span
from ..util import get_doc from ..util import get_doc
@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
# clunky way to vaguely check that callback is unpickled # clunky way to vaguely check that callback is unpickled
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1] (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
assert isinstance(callbacks.get("TEST2"), Mock) assert isinstance(callbacks.get("TEST2"), Mock)
def test_phrase_matcher_as_spans(en_vocab):
"""Test the new as_spans=True API."""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
matcher.add("B", [Doc(en_vocab, words=["test"])])
doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
matches = matcher(doc, as_spans=True)
assert len(matches) == 2
assert isinstance(matches[0], Span)
assert matches[0].text == "hello world"
assert matches[0].label_ == "A"
assert isinstance(matches[1], Span)
assert matches[1].text == "test"
assert matches[1].label_ == "B"
def test_phrase_matcher_deprecated(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
doc = Doc(en_vocab, words=["hello", "world"])
with pytest.warns(DeprecationWarning) as record:
for _ in matcher.pipe([doc]):
pass
assert record.list
assert "spaCy v3.0" in str(record.list[0].message)

View File

@ -118,11 +118,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
argument that connects to the shared `tok2vec` component in the pipeline. argument that connects to the shared `tok2vec` component in the pipeline.
| Name | Description | | Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ | | `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ | | `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
@ -323,11 +323,11 @@ for details and system requirements.
Load and wrap a transformer model from the Load and wrap a transformer model from the
[HuggingFace `transformers`](https://huggingface.co/transformers) library. You [HuggingFace `transformers`](https://huggingface.co/transformers) library. You
can any transformer that has pretrained weights and a PyTorch implementation. can use any transformer that has pretrained weights and a PyTorch
The `name` variable is passed through to the underlying library, so it can be implementation. The `name` variable is passed through to the underlying library,
either a string or a path. If it's a string, the pretrained weights will be so it can be either a string or a path. If it's a string, the pretrained weights
downloaded via the transformers library if they are not already available will be downloaded via the transformers library if they are not already
locally. available locally.
In order to support longer documents, the In order to support longer documents, the
[TransformerModel](/api/architectures#TransformerModel) layer allows you to pass [TransformerModel](/api/architectures#TransformerModel) layer allows you to pass

View File

@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ | | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ | | _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
## Matcher.pipe {#pipe tag="method"} | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
Match a stream of documents, yielding them in turn.
> #### Example
>
> ```python
> from spacy.matcher import Matcher
> matcher = Matcher(nlp.vocab)
> for doc in matcher.pipe(docs, batch_size=50):
> pass
> ```
| Name | Description |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
## Matcher.\_\_len\_\_ {#len tag="method" new="2"} ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

View File

@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------- | | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | The document to match over. ~~Doc~~ | | `doc` | The document to match over. ~~Doc~~ |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ | | _keyword-only_ | |
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning"> <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
</Infobox> </Infobox>
## PhraseMatcher.pipe {#pipe tag="method"}
Match a stream of documents, yielding them in turn.
> #### Example
>
> ```python
> from spacy.matcher import PhraseMatcher
> matcher = PhraseMatcher(nlp.vocab)
> for doc in matcher.pipe(docs, batch_size=50):
> pass
> ```
| Name | Description |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
## PhraseMatcher.\_\_len\_\_ {#len tag="method"} ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
Get the number of rules added to the matcher. Note that this only returns the Get the number of rules added to the matcher. Note that this only returns the

View File

@ -4,6 +4,7 @@ menu:
- ['spacy', 'spacy'] - ['spacy', 'spacy']
- ['displacy', 'displacy'] - ['displacy', 'displacy']
- ['registry', 'registry'] - ['registry', 'registry']
- ['Loggers', 'loggers']
- ['Batchers', 'batchers'] - ['Batchers', 'batchers']
- ['Data & Alignment', 'gold'] - ['Data & Alignment', 'gold']
- ['Utility Functions', 'util'] - ['Utility Functions', 'util']
@ -316,6 +317,7 @@ factories.
| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). | | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
| `loggers` | Registry for functions that log [training results](/usage/training). |
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. | | `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
@ -340,7 +342,7 @@ See the [`Transformer`](/api/transformer) API reference and
> def annotation_setter(docs, trf_data) -> None: > def annotation_setter(docs, trf_data) -> None:
> # Set annotations on the docs > # Set annotations on the docs
> >
> return annotation_sette > return annotation_setter
> ``` > ```
| Registry name | Description | | Registry name | Description |
@ -348,6 +350,110 @@ See the [`Transformer`](/api/transformer) API reference and
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
A logger records the training results. When a logger is created, two functions
are returned: one for logging the information for each training step, and a
second function that is called to finalize the logging when the training is
finished. To log each training step, a
[dictionary](/usage/training#custom-logging) is passed on from the
[`spacy train`](/api/cli#train), including information such as the training loss
and the accuracy scores on the development set.
There are two built-in logging functions: a logger printing results to the
console in tabular format (which is the default), and one that also sends the
results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
using one of the built-in loggers listed here, you can also
[implement your own](/usage/training#custom-logging).
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.ConsoleLogger.v1"
> ```
Writes the results of a training step to the console in a tabular format.
<Accordion title="Example console output" spaced>
```cli
$ python -m spacy train config.cfg
```
```
Using CPU
Loading config and nlp from: config.cfg
Pipeline: ['tok2vec', 'tagger']
Start training
Training. Initial learn rate: 0.0
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
--- ------ ------------ ----------- ------- ------
1 0 0.00 86.20 0.22 0.00
1 200 3.08 18968.78 34.00 0.34
1 400 31.81 22539.06 33.64 0.34
1 600 92.13 22794.91 43.80 0.44
1 800 183.62 21541.39 56.05 0.56
1 1000 352.49 25461.82 65.15 0.65
1 1200 422.87 23708.82 71.84 0.72
1 1400 601.92 24994.79 76.57 0.77
1 1600 662.57 22268.02 80.20 0.80
1 1800 1101.50 28413.77 82.56 0.83
1 2000 1253.43 28736.36 85.00 0.85
1 2200 1411.02 28237.53 87.42 0.87
1 2400 1605.35 28439.95 88.70 0.89
```
Note that the cumulative loss keeps increasing within one epoch, but should
start decreasing across epochs.
</Accordion>
#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
> #### Installation
>
> ```bash
> $ pip install wandb
> $ wandb login
> ```
Built-in logger that sends the results of each training step to the dashboard of
the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
& Biases should be installed, and you should be logged in. The logger will send
the full config file to W&B, as well as various system information such as
memory utilization, network traffic, disk IO, GPU statistics, etc. This will
also include information such as your hostname and operating system, as well as
the location of your Python executable.
<Infobox variant="warning">
Note that by default, the full (interpolated)
[training config](/usage/training#config) is sent over to the W&B dashboard. If
you prefer to **exclude certain information** such as path names, you can list
those fields in "dot notation" in the `remove_config_values` parameter. These
fields will then be removed from the config before uploading, but will otherwise
remain in the config file stored on your local system.
</Infobox>
> #### Example config
>
> ```ini
> [training.logger]
> @loggers = "spacy.WandbLogger.v1"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
> ```
| Name | Description |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
A data batcher implements a batching strategy that essentially turns a stream of A data batcher implements a batching strategy that essentially turns a stream of

View File

@ -25,8 +25,8 @@ work out-of-the-box.
</Infobox> </Infobox>
This pipeline component lets you use transformer models in your pipeline. This pipeline component lets you use transformer models in your pipeline. It
Supports all models that are available via the supports all models that are available via the
[HuggingFace `transformers`](https://huggingface.co/transformers) library. [HuggingFace `transformers`](https://huggingface.co/transformers) library.
Usually you will connect subsequent components to the shared transformer using Usually you will connect subsequent components to the shared transformer using
the [TransformerListener](/api/architectures#TransformerListener) layer. This the [TransformerListener](/api/architectures#TransformerListener) layer. This
@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the [`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the [model architectures](/api/architectures#transformers) documentation for details
architectures and their arguments and hyperparameters. on the transformer architectures and their arguments and hyperparameters.
> #### Example > #### Example
> >
@ -61,11 +61,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ``` > ```
| Setting | Description | | Setting | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ | | `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | | `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
```python ```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). component using its string name and [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description | | Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | | `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ | | `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
## Transformer.\_\_call\_\_ {#call tag="method"} ## Transformer.\_\_call\_\_ {#call tag="method"}
@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors
are wrapped into the are wrapped into the
[FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The [FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
`FullTransformerBatch` then splits out the per-document data, which is handled `FullTransformerBatch` then splits out the per-document data, which is handled
by this class. Instances of this class by this class. Instances of this class are typically assigned to the
are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes) [`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
extension attribute.
| Name | Description | | Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -447,8 +446,9 @@ overlap, and you can also omit sections of the Doc if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]` Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You block of the config to customize the sequences processed by the transformer. You
can also register custom span getters using the `@spacy.registry.span_getters` can also register
decorator. [custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
using the `@spacy.registry.span_getters` decorator.
> #### Example > #### Example
> >
@ -518,7 +518,7 @@ right context.
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
Annotation setters are functions that that take a batch of `Doc` objects and a Annotation setters are functions that take a batch of `Doc` objects and a
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
additional annotations on the `Doc`, e.g. to set custom or built-in attributes. additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
You can register custom annotation setters using the You can register custom annotation setters using the
@ -551,6 +551,6 @@ The following built-in functions are available:
The component sets the following The component sets the following
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes): [custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
| Name | Description | | Name | Description |
| -------------- | ------------------------------------------------------------------------ | | ---------------- | ------------------------------------------------------------------------ |
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ | | `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |

View File

@ -251,13 +251,14 @@ for doc in nlp.pipe(["some text", "some other text"]):
tokvecs = doc._.trf_data.tensors[-1] tokvecs = doc._.trf_data.tensors[-1]
``` ```
You can customize how the [`Transformer`](/api/transformer) component sets You can also customize how the [`Transformer`](/api/transformer) component sets
annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`. annotations onto the [`Doc`](/api/doc), by specifying a custom
This callback will be called with the raw input and output data for the whole `annotation_setter`. This callback will be called with the raw input and output
batch, along with the batch of `Doc` objects, allowing you to implement whatever data for the whole batch, along with the batch of `Doc` objects, allowing you to
you need. The annotation setter is called with a batch of [`Doc`](/api/doc) implement whatever you need. The annotation setter is called with a batch of
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) [`Doc`](/api/doc) objects and a
containing the transformers data for the batch. [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
transformers data for the batch.
```python ```python
def custom_annotation_setter(docs, trf_data): def custom_annotation_setter(docs, trf_data):

View File

@ -914,4 +914,4 @@ mattis pretium.
### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" /> ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? --> <!-- TODO: link to WandB logger, explain that it's built-in but that you can also do other cool stuff with WandB? And then include example project (still need to decide what we want to do here) -->

View File

@ -493,6 +493,39 @@ you prefer.
| `i` | Index of the current match (`matches[i]`). ~~int~~ | | `i` | Index of the current match (`matches[i]`). ~~int~~ |
| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ | | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
### Creating spans from matches {#matcher-spans}
Creating [`Span`](/api/span) objects from the returned matches is a very common
use case. spaCy makes this easy by giving you access to the `start` and `end`
token of each match, which you can use to construct a new span with an optional
label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
using the `match_id` as the span label.
```python
### {executable="true"}
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
doc = nlp("Barack Obama was the 44th president of the United States")

# 1. Return (match_id, start, end) tuples
matches = matcher(doc)
for match_id, start, end in matches:
    # Create the matched span and assign the match_id as a label
    span = Span(doc, start, end, label=match_id)
    print(span.text, span.label_)

# 2. Return Span objects directly
matches = matcher(doc, as_spans=True)
for span in matches:
    print(span.text, span.label_)
```
### Using custom pipeline components {#matcher-pipeline} ### Using custom pipeline components {#matcher-pipeline}
Let's say your data also contains some annoying pre-processing artifacts, like Let's say your data also contains some annoying pre-processing artifacts, like
@ -823,15 +856,6 @@ for token in doc:
print(token.text, token._.is_hashtag) print(token.text, token._.is_hashtag)
``` ```
To process a stream of social media posts, we can use
[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
```python
docs = nlp.pipe(LOTS_OF_TWEETS)
matches = matcher.pipe(docs)
```
## Efficient phrase matching {#phrasematcher} ## Efficient phrase matching {#phrasematcher}
If you need to match large terminology lists, you can also use the If you need to match large terminology lists, you can also use the

View File

@ -605,6 +605,68 @@ to your Python file. Before loading the config, spaCy will import the
$ python -m spacy train config.cfg --output ./output --code ./functions.py $ python -m spacy train config.cfg --output ./output --code ./functions.py
``` ```
#### Example: Custom logging function {#custom-logging}
During training, the results of each step are passed to a logger function. By
default, these results are written to the console with the
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
**dictionary** with the following keys:
| Key | Value |
| -------------- | ---------------------------------------------------------------------------------------------- |
| `epoch` | How many passes over the data have been completed. ~~int~~ |
| `step` | How many steps have been completed. ~~int~~ |
| `score` | The main score from the last evaluation, measured on the dev set. ~~float~~ |
| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ |
| `losses` | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ |
| `checkpoints` | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
You can easily implement and plug in your own logger that records the training
results in a custom way, or sends them to an experiment management tracker of
your choice. In this example, the function `my_custom_logger.v1` writes the
tabular results to a file:
> ```ini
> ### config.cfg (excerpt)
> [training.logger]
> @loggers = "my_custom_logger.v1"
> log_path = "my_file.tab"
> ```
```python
### functions.py
from typing import Tuple, Callable, Dict, Any
import spacy
from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path):
    """Create a logger setup function that writes tabular results to a file.

    log_path: Path of the file the step, score and per-component losses are
        written to.
    RETURNS: A function that takes the nlp object and returns a
        (log_step, finalize) tuple of callbacks.
    """

    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
        """Write the header row and build the logging callbacks."""
        # Opening in "w" mode truncates the file, so each run starts with a
        # fresh header row: step, score, then one loss column per pipe name.
        with Path(log_path).open("w") as file_:
            file_.write("step\\t")
            file_.write("score\\t")
            for pipe in nlp.pipe_names:
                file_.write(f"loss_{pipe}\\t")
            file_.write("\\n")

        def log_step(info: Dict[str, Any]) -> None:
            """Append one row with the values from the info dictionary."""
            # The file is re-opened in append mode on every call; columns
            # follow the same order as the header written above.
            with Path(log_path).open("a") as file_:
                file_.write(f"{info['step']}\\t")
                file_.write(f"{info['score']}\\t")
                for pipe in nlp.pipe_names:
                    file_.write(f"{info['losses'][pipe]}\\t")
                file_.write("\\n")

        def finalize() -> None:
            """Do nothing: no file handle is kept open between calls."""
            pass

        return log_step, finalize

    return setup_logger
```
#### Example: Custom batch size schedule {#custom-code-schedule} #### Example: Custom batch size schedule {#custom-code-schedule}
For example, let's say you've implemented your own batch size schedule to use For example, let's say you've implemented your own batch size schedule to use

View File

@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
| `GoldParse` | [`Example`](/api/example) | | `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) | | `GoldCorpus` | [`Corpus`](/api/corpus) |
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
| `spacy init-model` | [`spacy init model`](/api/cli#init-model) | | `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |