Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit c38298b8fa (mirror of https://github.com/explosion/spaCy.git)
@@ -264,9 +264,9 @@ def train_while_improving(
         epoch (int): How many passes over the data have been completed.
         step (int): How many steps have been completed.
-        score (float): The main score form the last evaluation.
+        score (float): The main score from the last evaluation.
         other_scores: : The other scores from the last evaluation.
-        loss: The accumulated losses throughout training.
+        losses: The accumulated losses throughout training.
         checkpoints: A list of previous results, where each result is a
             (score, step, epoch) tuple.
     """
@@ -112,6 +112,9 @@ class Warnings:
             "word segmenters: {supported}. Defaulting to {default}.")
     W104 = ("Skipping modifications for '{target}' segmenter. The current "
             "segmenter is '{current}'.")
+    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
+            "need to match on a stream of documents, you can use nlp.pipe and "
+            "call the {matcher} on each Doc object.")
 
 
 @add_codes
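The migration that W105 points to replaces `Matcher.pipe` with `nlp.pipe` plus a per-`Doc` call. A minimal sketch of the before/after, assuming a blank pipeline and an illustrative pattern that is not part of this commit:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])

texts = ["hello world", "HELLO there", "goodbye"]
# Deprecated in v3.0: for doc in matcher.pipe(nlp.pipe(texts)): ...
# Instead, stream the texts through nlp.pipe and match on each Doc:
for doc in nlp.pipe(texts):
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
```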
@@ -176,18 +176,10 @@ cdef class Matcher:
         return (self._callbacks[key], self._patterns[key])
 
     def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in docs:
                 matches = self(doc)
@@ -203,13 +195,16 @@ cdef class Matcher:
             else:
                 yield doc
 
-    def __call__(self, object doclike):
+    def __call__(self, object doclike, *, as_spans=False):
         """Find all token sequences matching the supplied pattern.
 
         doclike (Doc or Span): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -262,7 +257,10 @@ cdef class Matcher:
             on_match = self._callbacks.get(key, None)
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
-        return final_matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        else:
+            return final_matches
 
     def _normalize_key(self, key):
         if isinstance(key, basestring):
@@ -7,6 +7,7 @@ import warnings
 from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
 from ..structs cimport TokenC
 from ..tokens.token cimport Token
+from ..tokens.span cimport Span
 from ..typedefs cimport attr_t
 
 from ..schemas import TokenPattern
@@ -216,13 +217,16 @@ cdef class PhraseMatcher:
                 result = internal_node
             map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
 
-    def __call__(self, doc):
+    def __call__(self, doc, *, as_spans=False):
         """Find all sequences matching the supplied patterns on the `Doc`.
 
         doc (Doc): The document to match over.
-        RETURNS (list): A list of `(key, start, end)` tuples,
+        as_spans (bool): Return Span objects with labels instead of (match_id,
+            start, end) tuples.
+        RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
-            `doc[start:end]`. The `label_id` and `key` are both integers.
+            `doc[start:end]`. The `match_id` is an integer. If as_spans is set
+            to True, a list of Span objects is returned.
 
         DOCS: https://spacy.io/api/phrasematcher#call
         """
@@ -239,7 +243,10 @@ cdef class PhraseMatcher:
             on_match = self._callbacks.get(self.vocab.strings[ent_id])
             if on_match is not None:
                 on_match(self, doc, i, matches)
-        return matches
+        if as_spans:
+            return [Span(doc, start, end, label=key) for key, start, end in matches]
+        else:
+            return matches
 
     cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
         cdef MapStruct* current_node = self.c_map
@@ -285,20 +292,10 @@ cdef class PhraseMatcher:
                 idx += 1
 
     def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
-        """Match a stream of documents, yielding them in turn.
-
-        docs (iterable): A stream of documents.
-        batch_size (int): Number of documents to accumulate into a working set.
-        return_matches (bool): Yield the match lists along with the docs, making
-            results (doc, matches) tuples.
-        as_tuples (bool): Interpret the input stream as (doc, context) tuples,
-            and yield (result, context) tuples out.
-            If both return_matches and as_tuples are True, the output will
-            be a sequence of ((doc, matches), context) tuples.
-        YIELDS (Doc): Documents, in order.
-
-        DOCS: https://spacy.io/api/phrasematcher#pipe
+        """Match a stream of documents, yielding them in turn. Deprecated as of
+        spaCy v3.0.
         """
+        warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
         if as_tuples:
             for doc, context in stream:
                 matches = self(doc)
@@ -2,7 +2,8 @@ import pytest
 import re
 from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
-from spacy.tokens import Doc, Token
+from spacy.tokens import Doc, Token, Span
 
 from ..doc.test_underscore import clean_underscore  # noqa: F401
 
@@ -469,3 +470,26 @@ def test_matcher_span(matcher):
     assert len(matcher(doc)) == 2
     assert len(matcher(span_js)) == 1
     assert len(matcher(span_java)) == 1
+
+
+def test_matcher_as_spans(matcher):
+    """Test the new as_spans=True API."""
+    text = "JavaScript is good but Java is better"
+    doc = Doc(matcher.vocab, words=text.split())
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "JavaScript"
+    assert matches[0].label_ == "JS"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "Java"
+    assert matches[1].label_ == "Java"
+
+
+def test_matcher_deprecated(matcher):
+    doc = Doc(matcher.vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+    assert record.list
+    assert "spaCy v3.0" in str(record.list[0].message)
@@ -2,7 +2,7 @@ import pytest
 import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from ..util import get_doc
 
 
@@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
     # clunky way to vaguely check that callback is unpickled
     (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
     assert isinstance(callbacks.get("TEST2"), Mock)
+
+
+def test_phrase_matcher_as_spans(en_vocab):
+    """Test the new as_spans=True API."""
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
+    matcher.add("B", [Doc(en_vocab, words=["test"])])
+    doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
+    matches = matcher(doc, as_spans=True)
+    assert len(matches) == 2
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "hello world"
+    assert matches[0].label_ == "A"
+    assert isinstance(matches[1], Span)
+    assert matches[1].text == "test"
+    assert matches[1].label_ == "B"
+
+
+def test_phrase_matcher_deprecated(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
+    doc = Doc(en_vocab, words=["hello", "world"])
+    with pytest.warns(DeprecationWarning) as record:
+        for _ in matcher.pipe([doc]):
+            pass
+    assert record.list
+    assert "spaCy v3.0" in str(record.list[0].message)
@@ -118,11 +118,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
 [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
 argument that connects to the shared `tok2vec` component in the pipeline.
 
-| Name        | Description |
-| ----------- | ----------- |
-| `width`     | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
-| `upstream`  | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
+| Name        | Description |
+| ----------- | ----------- |
+| `width`     | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
+| `upstream`  | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
 
 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
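To make the listener wiring concrete, a hedged config sketch; the `tagger` component name and the interpolated width path are illustrative assumptions, not part of this commit:

```ini
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
# "*" connects to whichever upstream Tok2Vec component is in the pipeline
width = ${components.tok2vec.model.encode.width}
upstream = "*"
```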
@@ -323,11 +323,11 @@ for details and system requirements.
 
 Load and wrap a transformer model from the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library. You
-can any transformer that has pretrained weights and a PyTorch implementation.
-The `name` variable is passed through to the underlying library, so it can be
-either a string or a path. If it's a string, the pretrained weights will be
-downloaded via the transformers library if they are not already available
-locally.
+can use any transformer that has pretrained weights and a PyTorch
+implementation. The `name` variable is passed through to the underlying library,
+so it can be either a string or a path. If it's a string, the pretrained weights
+will be downloaded via the transformers library if they are not already
+available locally.
 
 In order to support longer documents, the
 [TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
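As a sketch of how the `name` argument is typically set in a config block; the architecture version and the model name below are assumptions for illustration:

```ini
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
# A string is resolved by the transformers library (downloaded if needed);
# a local path to saved weights also works here.
name = "bert-base-uncased"
```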
@@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```
 
-| Name        | Description |
-| ----------- | ----------- |
-| `doclike`   | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
-| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
-
-## Matcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import Matcher
-> matcher = Matcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doclike`                             | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
@@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
 > matches = matcher(doc)
 > ```
 
-| Name        | Description |
-| ----------- | ----------- |
-| `doc`       | The document to match over. ~~Doc~~ |
-| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
+| Name                                  | Description |
+| ------------------------------------- | ----------- |
+| `doc`                                 | The document to match over. ~~Doc~~ |
+| _keyword-only_                        | |
+| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
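A minimal sketch of the updated call from user code; the pattern phrase and text are invented for illustration:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", [nlp("Barack Obama")])
doc = nlp("Barack Obama was the 44th president")
# With as_spans=True, the matcher returns labeled Span objects
for span in matcher(doc, as_spans=True):
    print(span.text, span.label_)
```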
@@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
 
 </Infobox>
 
-## PhraseMatcher.pipe {#pipe tag="method"}
-
-Match a stream of documents, yielding them in turn.
-
-> #### Example
->
-> ```python
-> from spacy.matcher import PhraseMatcher
-> matcher = PhraseMatcher(nlp.vocab)
-> for doc in matcher.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name                                          | Description |
-| --------------------------------------------- | ----------- |
-| `docs`                                        | A stream of documents. ~~Iterable[Doc]~~ |
-| `batch_size`                                  | The number of documents to accumulate into a working set. ~~int~~ |
-| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
-| `as_tuples`                                   | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
-| **YIELDS**                                    | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
-
 ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
 
 Get the number of rules added to the matcher. Note that this only returns the
@@ -4,6 +4,7 @@ menu:
   - ['spacy', 'spacy']
   - ['displacy', 'displacy']
   - ['registry', 'registry']
+  - ['Loggers', 'loggers']
   - ['Batchers', 'batchers']
   - ['Data & Alignment', 'gold']
   - ['Utility Functions', 'util']
@@ -316,6 +317,7 @@ factories.
 | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
 | `languages`    | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `layers`       | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
+| `loggers`      | Registry for functions that log [training results](/usage/training). |
 | `lookups`      | Registry for large lookup tables available via `vocab.lookups`. |
 | `losses`       | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
 | `optimizers`   | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
@@ -340,7 +342,7 @@ See the [`Transformer`](/api/transformer) API reference and
 >     def annotation_setter(docs, trf_data) -> None:
 >         # Set annotations on the docs
 >
->     return annotation_sette
+>     return annotation_setter
 > ```
 
 | Registry name | Description |
@@ -348,6 +350,110 @@ See the [`Transformer`](/api/transformer) API reference and
 | [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
 | [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
 
+## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
+
+A logger records the training results. When a logger is created, two functions
+are returned: one for logging the information for each training step, and a
+second function that is called to finalize the logging when the training is
+finished. To log each training step, a
+[dictionary](/usage/training#custom-logging) is passed on from the
+[`spacy train`](/api/cli#train), including information such as the training loss
+and the accuracy scores on the development set.
+
+There are two built-in logging functions: a logger printing results to the
+console in tabular format (which is the default), and one that also sends the
+results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
+using one of the built-in loggers listed here, you can also
+[implement your own](/usage/training#custom-logging).
+
+#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.ConsoleLogger.v1"
+> ```
+
+Writes the results of a training step to the console in a tabular format.
+
+<Accordion title="Example console output" spaced>
+
+```cli
+$ python -m spacy train config.cfg
+```
+
+```
+ℹ Using CPU
+ℹ Loading config and nlp from: config.cfg
+ℹ Pipeline: ['tok2vec', 'tagger']
+ℹ Start training
+ℹ Training. Initial learn rate: 0.0
+
+E     #       LOSS TOK2VEC   LOSS TAGGER   TAG_ACC   SCORE
+---   ------  ------------   -----------   -------   ------
+  1       0           0.00         86.20      0.22     0.00
+  1     200           3.08      18968.78     34.00     0.34
+  1     400          31.81      22539.06     33.64     0.34
+  1     600          92.13      22794.91     43.80     0.44
+  1     800         183.62      21541.39     56.05     0.56
+  1    1000         352.49      25461.82     65.15     0.65
+  1    1200         422.87      23708.82     71.84     0.72
+  1    1400         601.92      24994.79     76.57     0.77
+  1    1600         662.57      22268.02     80.20     0.80
+  1    1800        1101.50      28413.77     82.56     0.83
+  1    2000        1253.43      28736.36     85.00     0.85
+  1    2200        1411.02      28237.53     87.42     0.87
+  1    2400        1605.35      28439.95     88.70     0.89
+```
+
+Note that the cumulative loss keeps increasing within one epoch, but should
+start decreasing across epochs.
+
+</Accordion>
+
+#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
+
+> #### Installation
+>
+> ```bash
+> $ pip install wandb
+> $ wandb login
+> ```
+
+Built-in logger that sends the results of each training step to the dashboard of
+the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
+& Biases should be installed, and you should be logged in. The logger will send
+the full config file to W&B, as well as various system information such as
+memory utilization, network traffic, disk IO, GPU statistics, etc. This will
+also include information such as your hostname and operating system, as well as
+the location of your Python executable.
+
+<Infobox variant="warning">
+
+Note that by default, the full (interpolated)
+[training config](/usage/training#config) is sent over to the W&B dashboard. If
+you prefer to **exclude certain information** such as path names, you can list
+those fields in "dot notation" in the `remove_config_values` parameter. These
+fields will then be removed from the config before uploading, but will otherwise
+remain in the config file stored on your local system.
+
+</Infobox>
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.WandbLogger.v1"
+> project_name = "monitor_spacy_training"
+> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
+> ```
+
+| Name                   | Description |
+| ---------------------- | ----------- |
+| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
+| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
+
 ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
 
 A data batcher implements a batching strategy that essentially turns a stream of
@@ -25,8 +25,8 @@ work out-of-the-box.
 
 </Infobox>
 
-This pipeline component lets you use transformer models in your pipeline.
-Supports all models that are available via the
+This pipeline component lets you use transformer models in your pipeline. It
+supports all models that are available via the
 [HuggingFace `transformers`](https://huggingface.co/transformers) library.
 Usually you will connect subsequent components to the shared transformer using
 the [TransformerListener](/api/architectures#TransformerListener) layer. This
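A hedged sketch of that listener wiring in a config; the `ner` component and the mean pooling layer are illustrative assumptions, not part of this commit:

```ini
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```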
@@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). See the
-[model architectures](/api/architectures) documentation for details on the
-architectures and their arguments and hyperparameters.
+[model architectures](/api/architectures#transformers) documentation for details
+on the transformer architectures and their arguments and hyperparameters.
 
 > #### Example
 >
@@ -61,11 +61,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
 > ```
 
-| Setting             | Description |
-| ------------------- | ----------- |
-| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
+| Setting             | Description |
+| ------------------- | ----------- |
+| `max_batch_items`   | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
 
 ```python
 https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In
 your application, you would normally use a shortcut for this and instantiate the
 component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name                | Description |
-| ------------------- | ----------- |
-| `vocab`             | The shared vocabulary. ~~Vocab~~ |
-| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
-| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
-| _keyword-only_      | |
-| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
+| Name                | Description |
+| ------------------- | ----------- |
+| `vocab`             | The shared vocabulary. ~~Vocab~~ |
+| `model`             | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
+| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+| _keyword-only_      | |
+| `name`              | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| `max_batch_items`   | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
 
 ## Transformer.\_\_call\_\_ {#call tag="method"}
@@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors
 are wrapped into the
 [FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
 `FullTransformerBatch` then splits out the per-document data, which is handled
-by this class. Instances of this class
-are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes)
-extension attribute.
+by this class. Instances of this class are typically assigned to the
+[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
 
 | Name      | Description |
 | --------- | ----------- |
@@ -447,8 +446,9 @@ overlap, and you can also omit sections of the Doc if they are not relevant.
 
 Span getters can be referenced in the `[components.transformer.model.get_spans]`
 block of the config to customize the sequences processed by the transformer. You
-can also register custom span getters using the `@spacy.registry.span_getters`
-decorator.
+can also register
+[custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
+using the `@spacy.registry.span_getters` decorator.
 
 > #### Example
 >
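A hedged sketch of such a registered span getter; the registry name `custom_sent_spans` and the sentence-based strategy are assumptions for illustration:

```python
import spacy

@spacy.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    # Process one Span per sentence, so no sequence exceeds sentence length
    def get_sent_spans(docs):
        return [list(doc.sents) for doc in docs]
    return get_sent_spans
```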
@@ -518,7 +518,7 @@ right context.
 
 ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
 
-Annotation setters are functions that that take a batch of `Doc` objects and a
+Annotation setters are functions that take a batch of `Doc` objects and a
 [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
 additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
 You can register custom annotation setters using the
|
|||
The component sets the following
|
||||
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------ |
|
||||
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||
| Name | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------ |
|
||||
| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||
|
|
|
@@ -251,13 +251,14 @@ for doc in nlp.pipe(["some text", "some other text"]):
     tokvecs = doc._.trf_data.tensors[-1]
 ```
 
-You can customize how the [`Transformer`](/api/transformer) component sets
-annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`.
-This callback will be called with the raw input and output data for the whole
-batch, along with the batch of `Doc` objects, allowing you to implement whatever
-you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
-objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
-containing the transformers data for the batch.
+You can also customize how the [`Transformer`](/api/transformer) component sets
+annotations onto the [`Doc`](/api/doc), by specifying a custom
+`annotation_setter`. This callback will be called with the raw input and output
+data for the whole batch, along with the batch of `Doc` objects, allowing you to
+implement whatever you need. The annotation setter is called with a batch of
+[`Doc`](/api/doc) objects and a
+[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
+transformers data for the batch.
 
 ```python
 def custom_annotation_setter(docs, trf_data):
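The hunk cuts off at the function definition; a hedged sketch of how such a setter might continue (the extension name `custom_attr` is an assumption for illustration):

```python
from spacy.tokens import Doc

# Hypothetical custom extension to hold the per-doc transformer output
Doc.set_extension("custom_attr", default=None)

def custom_annotation_setter(docs, trf_data):
    # Split the FullTransformerBatch into per-Doc data and store it
    doc_data = list(trf_data.doc_data)
    for doc, data in zip(docs, doc_data):
        doc._.custom_attr = data
```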
@@ -914,4 +914,4 @@ mattis pretium.
 
 ### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
 
-<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? -->
+<!-- TODO: link to WandB logger, explain that it's built-in but that you can also do other cool stuff with WandB? And then include example project (still need to decide what we want to do here) -->
@@ -493,6 +493,39 @@ you prefer.
 | `i`       | Index of the current match (`matches[i`]). ~~int~~ |
 | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. ~~ List[Tuple[int, int int]]~~ |
 
+### Creating spans from matches {#matcher-spans}
+
+Creating [`Span`](/api/span) objects from the returned matches is a very common
+use case. spaCy makes this easy by giving you access to the `start` and `end`
+token of each match, which you can use to construct a new span with an optional
+label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
+matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
+using the `match_id` as the span label.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.matcher import Matcher
+from spacy.tokens import Span
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
+doc = nlp("Barack Obama was the 44th president of the United States")
+
+# 1. Return (match_id, start, end) tuples
+matches = matcher(doc)
+for match_id, start, end in matches:
+    # Create the matched span and assign the match_id as a label
+    span = Span(doc, start, end, label=match_id)
+    print(span.text, span.label_)
+
+# 2. Return Span objects directly
+matches = matcher(doc, as_spans=True)
+for span in matches:
+    print(span.text, span.label_)
+```
+
 ### Using custom pipeline components {#matcher-pipeline}
 
 Let's say your data also contains some annoying pre-processing artifacts, like
@@ -823,15 +856,6 @@ for token in doc:
     print(token.text, token._.is_hashtag)
 ```
 
-To process a stream of social media posts, we can use
-[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
-objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
-
-```python
-docs = nlp.pipe(LOTS_OF_TWEETS)
-matches = matcher.pipe(docs)
-```
-
 ## Efficient phrase matching {#phrasematcher}
 
 If you need to match large terminology lists, you can also use the
@@ -605,6 +605,68 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Custom logging function {#custom-logging}
+
+During training, the results of each step are passed to a logger function. By
+default, these results are written to the console with the
+[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
+for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
+[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
+**dictionary** with the following keys:
+
+| Key            | Value |
+| -------------- | ----- |
+| `epoch`        | How many passes over the data have been completed. ~~int~~ |
+| `step`         | How many steps have been completed. ~~int~~ |
+| `score`        | The main score from the last evaluation, measured on the dev set. ~~float~~ |
+| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ |
+| `losses`       | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ |
+| `checkpoints`  | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
+
+You can easily implement and plug in your own logger that records the training
+results in a custom way, or sends them to an experiment management tracker of
+your choice. In this example, the function `my_custom_logger.v1` writes the
+tabular results to a file:
+
+> ```ini
+> ### config.cfg (excerpt)
+> [training.logger]
+> @loggers = "my_custom_logger.v1"
+> log_path = "my_file.tab"
+> ```
+
+```python
+### functions.py
+from typing import Tuple, Callable, Dict, Any
+import spacy
+from pathlib import Path
+
+@spacy.registry.loggers("my_custom_logger.v1")
+def custom_logger(log_path):
+    def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
+        with Path(log_path).open("w") as file_:
+            file_.write("step\\t")
+            file_.write("score\\t")
+            for pipe in nlp.pipe_names:
+                file_.write(f"loss_{pipe}\\t")
+            file_.write("\\n")
+
+        def log_step(info: Dict[str, Any]):
+            with Path(log_path).open("a") as file_:
+                file_.write(f"{info['step']}\\t")
+                file_.write(f"{info['score']}\\t")
+                for pipe in nlp.pipe_names:
+                    file_.write(f"{info['losses'][pipe]}\\t")
+                file_.write("\\n")
+
+        def finalize():
+            pass
+
+        return log_step, finalize
+
+    return setup_logger
+```
+
 #### Example: Custom batch size schedule {#custom-code-schedule}
 
 For example, let's say you've implemented your own batch size schedule to use
@@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `GoldParse`                                     | [`Example`](/api/example) |
 | `GoldCorpus`                                    | [`Corpus`](/api/corpus) |
 | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
+| `Matcher.pipe`, `PhraseMatcher.pipe`            | not needed |
 | `spacy init-model`                              | [`spacy init model`](/api/cli#init-model) |
 | `spacy debug-data`                              | [`spacy debug data`](/api/cli#debug-data) |
 | `spacy profile`                                 | [`spacy debug profile`](/api/cli#debug-profile) |