mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c38298b8fa
|
@ -264,9 +264,9 @@ def train_while_improving(
|
||||||
|
|
||||||
epoch (int): How many passes over the data have been completed.
|
epoch (int): How many passes over the data have been completed.
|
||||||
step (int): How many steps have been completed.
|
step (int): How many steps have been completed.
|
||||||
score (float): The main score form the last evaluation.
|
score (float): The main score from the last evaluation.
|
||||||
other_scores: The other scores from the last evaluation.
|
other_scores: The other scores from the last evaluation.
|
||||||
loss: The accumulated losses throughout training.
|
losses: The accumulated losses throughout training.
|
||||||
checkpoints: A list of previous results, where each result is a
|
checkpoints: A list of previous results, where each result is a
|
||||||
(score, step, epoch) tuple.
|
(score, step, epoch) tuple.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -112,6 +112,9 @@ class Warnings:
|
||||||
"word segmenters: {supported}. Defaulting to {default}.")
|
"word segmenters: {supported}. Defaulting to {default}.")
|
||||||
W104 = ("Skipping modifications for '{target}' segmenter. The current "
|
W104 = ("Skipping modifications for '{target}' segmenter. The current "
|
||||||
"segmenter is '{current}'.")
|
"segmenter is '{current}'.")
|
||||||
|
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
|
||||||
|
"need to match on a stream of documents, you can use nlp.pipe and "
|
||||||
|
"call the {matcher} on each Doc object.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -176,18 +176,10 @@ cdef class Matcher:
|
||||||
return (self._callbacks[key], self._patterns[key])
|
return (self._callbacks[key], self._patterns[key])
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
|
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn. Deprecated as of
|
||||||
|
spaCy v3.0.
|
||||||
docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
|
|
||||||
batch_size (int): Number of documents to accumulate into a working set.
|
|
||||||
return_matches (bool): Yield the match lists along with the docs, making
|
|
||||||
results (doc, matches) tuples.
|
|
||||||
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
|
|
||||||
and yield (result, context) tuples out.
|
|
||||||
If both return_matches and as_tuples are True, the output will
|
|
||||||
be a sequence of ((doc, matches), context) tuples.
|
|
||||||
YIELDS (Doc): Documents, in order.
|
|
||||||
"""
|
"""
|
||||||
|
warnings.warn(Warnings.W105.format(matcher="Matcher"), DeprecationWarning)
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
for doc, context in docs:
|
for doc, context in docs:
|
||||||
matches = self(doc)
|
matches = self(doc)
|
||||||
|
@ -203,13 +195,16 @@ cdef class Matcher:
|
||||||
else:
|
else:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def __call__(self, object doclike):
|
def __call__(self, object doclike, *, as_spans=False):
|
||||||
"""Find all token sequences matching the supplied pattern.
|
"""Find all token sequences matching the supplied pattern.
|
||||||
|
|
||||||
doclike (Doc or Span): The document to match over.
|
doclike (Doc or Span): The document to match over.
|
||||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
as_spans (bool): Return Span objects with labels instead of (match_id,
|
||||||
|
start, end) tuples.
|
||||||
|
RETURNS (list): A list of `(match_id, start, end)` tuples,
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
||||||
|
to True, a list of Span objects is returned.
|
||||||
"""
|
"""
|
||||||
if isinstance(doclike, Doc):
|
if isinstance(doclike, Doc):
|
||||||
doc = doclike
|
doc = doclike
|
||||||
|
@ -262,7 +257,10 @@ cdef class Matcher:
|
||||||
on_match = self._callbacks.get(key, None)
|
on_match = self._callbacks.get(key, None)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, final_matches)
|
on_match(self, doc, i, final_matches)
|
||||||
return final_matches
|
if as_spans:
|
||||||
|
return [Span(doc, start, end, label=key) for key, start, end in final_matches]
|
||||||
|
else:
|
||||||
|
return final_matches
|
||||||
|
|
||||||
def _normalize_key(self, key):
|
def _normalize_key(self, key):
|
||||||
if isinstance(key, basestring):
|
if isinstance(key, basestring):
|
||||||
|
|
|
@ -7,6 +7,7 @@ import warnings
|
||||||
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
|
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from ..tokens.token cimport Token
|
from ..tokens.token cimport Token
|
||||||
|
from ..tokens.span cimport Span
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
|
||||||
from ..schemas import TokenPattern
|
from ..schemas import TokenPattern
|
||||||
|
@ -216,13 +217,16 @@ cdef class PhraseMatcher:
|
||||||
result = internal_node
|
result = internal_node
|
||||||
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
|
map_set(self.mem, <MapStruct*>result, self.vocab.strings[key], NULL)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc, *, as_spans=False):
|
||||||
"""Find all sequences matching the supplied patterns on the `Doc`.
|
"""Find all sequences matching the supplied patterns on the `Doc`.
|
||||||
|
|
||||||
doc (Doc): The document to match over.
|
doc (Doc): The document to match over.
|
||||||
RETURNS (list): A list of `(key, start, end)` tuples,
|
as_spans (bool): Return Span objects with labels instead of (match_id,
|
||||||
|
start, end) tuples.
|
||||||
|
RETURNS (list): A list of `(match_id, start, end)` tuples,
|
||||||
describing the matches. A match tuple describes a span
|
describing the matches. A match tuple describes a span
|
||||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
`doc[start:end]`. The `match_id` is an integer. If as_spans is set
|
||||||
|
to True, a list of Span objects is returned.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#call
|
DOCS: https://spacy.io/api/phrasematcher#call
|
||||||
"""
|
"""
|
||||||
|
@ -239,7 +243,10 @@ cdef class PhraseMatcher:
|
||||||
on_match = self._callbacks.get(self.vocab.strings[ent_id])
|
on_match = self._callbacks.get(self.vocab.strings[ent_id])
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
on_match(self, doc, i, matches)
|
on_match(self, doc, i, matches)
|
||||||
return matches
|
if as_spans:
|
||||||
|
return [Span(doc, start, end, label=key) for key, start, end in matches]
|
||||||
|
else:
|
||||||
|
return matches
|
||||||
|
|
||||||
cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
|
cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
|
||||||
cdef MapStruct* current_node = self.c_map
|
cdef MapStruct* current_node = self.c_map
|
||||||
|
@ -285,20 +292,10 @@ cdef class PhraseMatcher:
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
|
def pipe(self, stream, batch_size=1000, return_matches=False, as_tuples=False):
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""Match a stream of documents, yielding them in turn. Deprecated as of
|
||||||
|
spaCy v3.0.
|
||||||
docs (iterable): A stream of documents.
|
|
||||||
batch_size (int): Number of documents to accumulate into a working set.
|
|
||||||
return_matches (bool): Yield the match lists along with the docs, making
|
|
||||||
results (doc, matches) tuples.
|
|
||||||
as_tuples (bool): Interpret the input stream as (doc, context) tuples,
|
|
||||||
and yield (result, context) tuples out.
|
|
||||||
If both return_matches and as_tuples are True, the output will
|
|
||||||
be a sequence of ((doc, matches), context) tuples.
|
|
||||||
YIELDS (Doc): Documents, in order.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/phrasematcher#pipe
|
|
||||||
"""
|
"""
|
||||||
|
warnings.warn(Warnings.W105.format(matcher="PhraseMatcher"), DeprecationWarning)
|
||||||
if as_tuples:
|
if as_tuples:
|
||||||
for doc, context in stream:
|
for doc, context in stream:
|
||||||
matches = self(doc)
|
matches = self(doc)
|
||||||
|
|
|
@ -2,7 +2,8 @@ import pytest
|
||||||
import re
|
import re
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
from spacy.matcher import Matcher, DependencyMatcher
|
from spacy.matcher import Matcher, DependencyMatcher
|
||||||
from spacy.tokens import Doc, Token
|
from spacy.tokens import Doc, Token, Span
|
||||||
|
|
||||||
from ..doc.test_underscore import clean_underscore # noqa: F401
|
from ..doc.test_underscore import clean_underscore # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
@ -469,3 +470,26 @@ def test_matcher_span(matcher):
|
||||||
assert len(matcher(doc)) == 2
|
assert len(matcher(doc)) == 2
|
||||||
assert len(matcher(span_js)) == 1
|
assert len(matcher(span_js)) == 1
|
||||||
assert len(matcher(span_java)) == 1
|
assert len(matcher(span_java)) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_as_spans(matcher):
|
||||||
|
"""Test the new as_spans=True API."""
|
||||||
|
text = "JavaScript is good but Java is better"
|
||||||
|
doc = Doc(matcher.vocab, words=text.split())
|
||||||
|
matches = matcher(doc, as_spans=True)
|
||||||
|
assert len(matches) == 2
|
||||||
|
assert isinstance(matches[0], Span)
|
||||||
|
assert matches[0].text == "JavaScript"
|
||||||
|
assert matches[0].label_ == "JS"
|
||||||
|
assert isinstance(matches[1], Span)
|
||||||
|
assert matches[1].text == "Java"
|
||||||
|
assert matches[1].label_ == "Java"
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_deprecated(matcher):
|
||||||
|
doc = Doc(matcher.vocab, words=["hello", "world"])
|
||||||
|
with pytest.warns(DeprecationWarning) as record:
|
||||||
|
for _ in matcher.pipe([doc]):
|
||||||
|
pass
|
||||||
|
assert record.list
|
||||||
|
assert "spaCy v3.0" in str(record.list[0].message)
|
||||||
|
|
|
@ -2,7 +2,7 @@ import pytest
|
||||||
import srsly
|
import srsly
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc, Span
|
||||||
from ..util import get_doc
|
from ..util import get_doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -287,3 +287,30 @@ def test_phrase_matcher_pickle(en_vocab):
|
||||||
# clunky way to vaguely check that callback is unpickled
|
# clunky way to vaguely check that callback is unpickled
|
||||||
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
|
(vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
|
||||||
assert isinstance(callbacks.get("TEST2"), Mock)
|
assert isinstance(callbacks.get("TEST2"), Mock)
|
||||||
|
|
||||||
|
|
||||||
|
def test_phrase_matcher_as_spans(en_vocab):
|
||||||
|
"""Test the new as_spans=True API."""
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("A", [Doc(en_vocab, words=["hello", "world"])])
|
||||||
|
matcher.add("B", [Doc(en_vocab, words=["test"])])
|
||||||
|
doc = Doc(en_vocab, words=["...", "hello", "world", "this", "is", "a", "test"])
|
||||||
|
matches = matcher(doc, as_spans=True)
|
||||||
|
assert len(matches) == 2
|
||||||
|
assert isinstance(matches[0], Span)
|
||||||
|
assert matches[0].text == "hello world"
|
||||||
|
assert matches[0].label_ == "A"
|
||||||
|
assert isinstance(matches[1], Span)
|
||||||
|
assert matches[1].text == "test"
|
||||||
|
assert matches[1].label_ == "B"
|
||||||
|
|
||||||
|
|
||||||
|
def test_phrase_matcher_deprecated(en_vocab):
|
||||||
|
matcher = PhraseMatcher(en_vocab)
|
||||||
|
matcher.add("TEST", [Doc(en_vocab, words=["helllo"])])
|
||||||
|
doc = Doc(en_vocab, words=["hello", "world"])
|
||||||
|
with pytest.warns(DeprecationWarning) as record:
|
||||||
|
for _ in matcher.pipe([doc]):
|
||||||
|
pass
|
||||||
|
assert record.list
|
||||||
|
assert "spaCy v3.0" in str(record.list[0].message)
|
||||||
|
|
|
@ -118,11 +118,11 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
|
||||||
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
|
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
|
||||||
argument that connects to the shared `tok2vec` component in the pipeline.
|
argument that connects to the shared `tok2vec` component in the pipeline.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
|
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
|
||||||
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
|
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
|
||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||||
|
|
||||||
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
|
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
|
||||||
|
|
||||||
|
@ -323,11 +323,11 @@ for details and system requirements.
|
||||||
|
|
||||||
Load and wrap a transformer model from the
|
Load and wrap a transformer model from the
|
||||||
[HuggingFace `transformers`](https://huggingface.co/transformers) library. You
|
[HuggingFace `transformers`](https://huggingface.co/transformers) library. You
|
||||||
can any transformer that has pretrained weights and a PyTorch implementation.
|
can use any transformer that has pretrained weights and a PyTorch
|
||||||
The `name` variable is passed through to the underlying library, so it can be
|
implementation. The `name` variable is passed through to the underlying library,
|
||||||
either a string or a path. If it's a string, the pretrained weights will be
|
so it can be either a string or a path. If it's a string, the pretrained weights
|
||||||
downloaded via the transformers library if they are not already available
|
will be downloaded via the transformers library if they are not already
|
||||||
locally.
|
available locally.
|
||||||
|
|
||||||
In order to support longer documents, the
|
In order to support longer documents, the
|
||||||
[TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
|
[TransformerModel](/api/architectures#TransformerModel) layer allows you to pass
|
||||||
|
|
|
@ -116,31 +116,12 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
||||||
> matches = matcher(doc)
|
> matches = matcher(doc)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
| _keyword-only_ | |
|
||||||
|
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
|
||||||
## Matcher.pipe {#pipe tag="method"}
|
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
|
||||||
|
|
||||||
Match a stream of documents, yielding them in turn.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> from spacy.matcher import Matcher
|
|
||||||
> matcher = Matcher(nlp.vocab)
|
|
||||||
> for doc in matcher.pipe(docs, batch_size=50):
|
|
||||||
> pass
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
|
|
||||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
|
||||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
|
||||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
|
||||||
| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
|
||||||
|
|
||||||
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
|
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
|
||||||
|
|
||||||
|
|
|
@ -57,10 +57,12 @@ Find all token sequences matching the supplied patterns on the `Doc`.
|
||||||
> matches = matcher(doc)
|
> matches = matcher(doc)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | ----------------------------------- |
|
| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `doc` | The document to match over. ~~Doc~~ |
|
| `doc` | The document to match over. ~~Doc~~ |
|
||||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
| _keyword-only_ | |
|
||||||
|
| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
|
||||||
|
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
|
||||||
|
|
||||||
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
|
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
|
||||||
|
|
||||||
|
@ -74,27 +76,6 @@ match_id_string = nlp.vocab.strings[match_id]
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
## PhraseMatcher.pipe {#pipe tag="method"}
|
|
||||||
|
|
||||||
Match a stream of documents, yielding them in turn.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> from spacy.matcher import PhraseMatcher
|
|
||||||
> matcher = PhraseMatcher(nlp.vocab)
|
|
||||||
> for doc in matcher.pipe(docs, batch_size=50):
|
|
||||||
> pass
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Description |
|
|
||||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
||||||
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
|
|
||||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
|
||||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
|
||||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
|
||||||
| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
|
||||||
|
|
||||||
## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
|
## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
|
||||||
|
|
||||||
Get the number of rules added to the matcher. Note that this only returns the
|
Get the number of rules added to the matcher. Note that this only returns the
|
||||||
|
|
|
@ -4,6 +4,7 @@ menu:
|
||||||
- ['spacy', 'spacy']
|
- ['spacy', 'spacy']
|
||||||
- ['displacy', 'displacy']
|
- ['displacy', 'displacy']
|
||||||
- ['registry', 'registry']
|
- ['registry', 'registry']
|
||||||
|
- ['Loggers', 'loggers']
|
||||||
- ['Batchers', 'batchers']
|
- ['Batchers', 'batchers']
|
||||||
- ['Data & Alignment', 'gold']
|
- ['Data & Alignment', 'gold']
|
||||||
- ['Utility Functions', 'util']
|
- ['Utility Functions', 'util']
|
||||||
|
@ -316,6 +317,7 @@ factories.
|
||||||
| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
|
| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). |
|
||||||
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
||||||
|
| `loggers` | Registry for functions that log [training results](/usage/training). |
|
||||||
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||||
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). |
|
||||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||||
|
@ -340,7 +342,7 @@ See the [`Transformer`](/api/transformer) API reference and
|
||||||
> def annotation_setter(docs, trf_data) -> None:
|
> def annotation_setter(docs, trf_data) -> None:
|
||||||
> # Set annotations on the docs
|
> # Set annotations on the docs
|
||||||
>
|
>
|
||||||
> return annotation_sette
|
> return annotation_setter
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Registry name | Description |
|
| Registry name | Description |
|
||||||
|
@ -348,6 +350,110 @@ See the [`Transformer`](/api/transformer) API reference and
|
||||||
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
|
| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
|
||||||
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
|
| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
|
||||||
|
|
||||||
|
## Loggers {#loggers source="spacy/gold/loggers.py" new="3"}
|
||||||
|
|
||||||
|
A logger records the training results. When a logger is created, two functions
|
||||||
|
are returned: one for logging the information for each training step, and a
|
||||||
|
second function that is called to finalize the logging when the training is
|
||||||
|
finished. To log each training step, a
|
||||||
|
[dictionary](/usage/training#custom-logging) is passed on from the
|
||||||
|
[`spacy train`](/api/cli#train) command, including information such as the training loss
|
||||||
|
and the accuracy scores on the development set.
|
||||||
|
|
||||||
|
There are two built-in logging functions: a logger printing results to the
|
||||||
|
console in tabular format (which is the default), and one that also sends the
|
||||||
|
results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of
|
||||||
|
using one of the built-in loggers listed here, you can also
|
||||||
|
[implement your own](/usage/training#custom-logging).
|
||||||
|
|
||||||
|
#### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"}
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [training.logger]
|
||||||
|
> @loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Writes the results of a training step to the console in a tabular format.
|
||||||
|
|
||||||
|
<Accordion title="Example console output" spaced>
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy train config.cfg
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
ℹ Using CPU
|
||||||
|
ℹ Loading config and nlp from: config.cfg
|
||||||
|
ℹ Pipeline: ['tok2vec', 'tagger']
|
||||||
|
ℹ Start training
|
||||||
|
ℹ Training. Initial learn rate: 0.0
|
||||||
|
|
||||||
|
E # LOSS TOK2VEC LOSS TAGGER TAG_ACC SCORE
|
||||||
|
--- ------ ------------ ----------- ------- ------
|
||||||
|
1 0 0.00 86.20 0.22 0.00
|
||||||
|
1 200 3.08 18968.78 34.00 0.34
|
||||||
|
1 400 31.81 22539.06 33.64 0.34
|
||||||
|
1 600 92.13 22794.91 43.80 0.44
|
||||||
|
1 800 183.62 21541.39 56.05 0.56
|
||||||
|
1 1000 352.49 25461.82 65.15 0.65
|
||||||
|
1 1200 422.87 23708.82 71.84 0.72
|
||||||
|
1 1400 601.92 24994.79 76.57 0.77
|
||||||
|
1 1600 662.57 22268.02 80.20 0.80
|
||||||
|
1 1800 1101.50 28413.77 82.56 0.83
|
||||||
|
1 2000 1253.43 28736.36 85.00 0.85
|
||||||
|
1 2200 1411.02 28237.53 87.42 0.87
|
||||||
|
1 2400 1605.35 28439.95 88.70 0.89
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the cumulative loss keeps increasing within one epoch, but should
|
||||||
|
start decreasing across epochs.
|
||||||
|
|
||||||
|
</Accordion>
|
||||||
|
|
||||||
|
#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
|
||||||
|
|
||||||
|
> #### Installation
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ pip install wandb
|
||||||
|
> $ wandb login
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Built-in logger that sends the results of each training step to the dashboard of
|
||||||
|
the [Weights & Biases](https://www.wandb.com/) tool. To use this logger, Weights
|
||||||
|
& Biases should be installed, and you should be logged in. The logger will send
|
||||||
|
the full config file to W&B, as well as various system information such as
|
||||||
|
memory utilization, network traffic, disk IO, GPU statistics, etc. This will
|
||||||
|
also include information such as your hostname and operating system, as well as
|
||||||
|
the location of your Python executable.
|
||||||
|
|
||||||
|
<Infobox variant="warning">
|
||||||
|
|
||||||
|
Note that by default, the full (interpolated)
|
||||||
|
[training config](/usage/training#config) is sent over to the W&B dashboard. If
|
||||||
|
you prefer to **exclude certain information** such as path names, you can list
|
||||||
|
those fields in "dot notation" in the `remove_config_values` parameter. These
|
||||||
|
fields will then be removed from the config before uploading, but will otherwise
|
||||||
|
remain in the config file stored on your local system.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [training.logger]
|
||||||
|
> @loggers = "spacy.WandbLogger.v1"
|
||||||
|
> project_name = "monitor_spacy_training"
|
||||||
|
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
|
||||||
|
| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
|
||||||
|
|
||||||
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
|
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
|
||||||
|
|
||||||
A data batcher implements a batching strategy that essentially turns a stream of
|
A data batcher implements a batching strategy that essentially turns a stream of
|
||||||
|
|
|
@ -25,8 +25,8 @@ work out-of-the-box.
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
This pipeline component lets you use transformer models in your pipeline.
|
This pipeline component lets you use transformer models in your pipeline. It
|
||||||
Supports all models that are available via the
|
supports all models that are available via the
|
||||||
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
|
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
|
||||||
Usually you will connect subsequent components to the shared transformer using
|
Usually you will connect subsequent components to the shared transformer using
|
||||||
the [TransformerListener](/api/architectures#TransformerListener) layer. This
|
the [TransformerListener](/api/architectures#TransformerListener) layer. This
|
||||||
|
@ -50,8 +50,8 @@ The default config is defined by the pipeline component factory and describes
|
||||||
how the component should be configured. You can override its settings via the
|
how the component should be configured. You can override its settings via the
|
||||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||||
[`config.cfg` for training](/usage/training#config). See the
|
[`config.cfg` for training](/usage/training#config). See the
|
||||||
[model architectures](/api/architectures) documentation for details on the
|
[model architectures](/api/architectures#transformers) documentation for details
|
||||||
architectures and their arguments and hyperparameters.
|
on the transformer architectures and their arguments and hyperparameters.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -61,11 +61,11 @@ architectures and their arguments and hyperparameters.
|
||||||
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
|
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Setting | Description |
|
| Setting | Description |
|
||||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
|
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
|
||||||
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs to set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
|
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
|
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
|
||||||
|
@ -102,14 +102,14 @@ attribute. You can also provide a callback to set additional annotations. In
|
||||||
your application, you would normally use a shortcut for this and instantiate the
|
your application, you would normally use a shortcut for this and instantiate the
|
||||||
component using its string name and [`nlp.add_pipe`](/api/language#add_pipe).
|
component using its string name and [`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||||
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and stores the annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||||
| _keyword-only_ | |
|
| _keyword-only_ | |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
|
| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
|
||||||
|
|
||||||
## Transformer.\_\_call\_\_ {#call tag="method"}
|
## Transformer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
@ -383,9 +383,8 @@ return tensors that refer to a whole padded batch of documents. These tensors
|
||||||
are wrapped into the
|
are wrapped into the
|
||||||
[FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
|
[FullTransformerBatch](/api/transformer#fulltransformerbatch) object. The
|
||||||
`FullTransformerBatch` then splits out the per-document data, which is handled
|
`FullTransformerBatch` then splits out the per-document data, which is handled
|
||||||
by this class. Instances of this class
|
by this class. Instances of this class are typically assigned to the
|
||||||
are`typically assigned to the [Doc._.trf_data`](/api/transformer#custom-attributes)
|
[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute.
|
||||||
extension attribute.
|
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
@ -447,8 +446,9 @@ overlap, and you can also omit sections of the Doc if they are not relevant.
|
||||||
|
|
||||||
Span getters can be referenced in the `[components.transformer.model.get_spans]`
|
Span getters can be referenced in the `[components.transformer.model.get_spans]`
|
||||||
block of the config to customize the sequences processed by the transformer. You
|
block of the config to customize the sequences processed by the transformer. You
|
||||||
can also register custom span getters using the `@spacy.registry.span_getters`
|
can also register
|
||||||
decorator.
|
[custom span getters](/usage/embeddings-transformers#transformers-training-custom-settings)
|
||||||
|
using the `@spacy.registry.span_getters` decorator.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -518,7 +518,7 @@ right context.
|
||||||
|
|
||||||
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
||||||
|
|
||||||
Annotation setters are functions that that take a batch of `Doc` objects and a
|
Annotation setters are functions that take a batch of `Doc` objects and a
|
||||||
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
|
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set
|
||||||
additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
|
additional annotations on the `Doc`, e.g. to set custom or built-in attributes.
|
||||||
You can register custom annotation setters using the
|
You can register custom annotation setters using the
|
||||||
|
@ -551,6 +551,6 @@ The following built-in functions are available:
|
||||||
The component sets the following
|
The component sets the following
|
||||||
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
|
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | ------------------------------------------------------------------------ |
|
| ---------------- | ------------------------------------------------------------------------ |
|
||||||
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
| `Doc._.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||||
|
|
|
@ -251,13 +251,14 @@ for doc in nlp.pipe(["some text", "some other text"]):
|
||||||
tokvecs = doc._.trf_data.tensors[-1]
|
tokvecs = doc._.trf_data.tensors[-1]
|
||||||
```
|
```
|
||||||
|
|
||||||
You can customize how the [`Transformer`](/api/transformer) component sets
|
You can also customize how the [`Transformer`](/api/transformer) component sets
|
||||||
annotations onto the [`Doc`](/api/doc), by changing the `annotation_setter`.
|
annotations onto the [`Doc`](/api/doc), by specifying a custom
|
||||||
This callback will be called with the raw input and output data for the whole
|
`annotation_setter`. This callback will be called with the raw input and output
|
||||||
batch, along with the batch of `Doc` objects, allowing you to implement whatever
|
data for the whole batch, along with the batch of `Doc` objects, allowing you to
|
||||||
you need. The annotation setter is called with a batch of [`Doc`](/api/doc)
|
implement whatever you need. The annotation setter is called with a batch of
|
||||||
objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch)
|
[`Doc`](/api/doc) objects and a
|
||||||
containing the transformers data for the batch.
|
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
|
||||||
|
transformers data for the batch.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def custom_annotation_setter(docs, trf_data):
|
def custom_annotation_setter(docs, trf_data):
|
||||||
|
|
|
@ -914,4 +914,4 @@ mattis pretium.
|
||||||
|
|
||||||
### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
|
### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
|
||||||
|
|
||||||
<!-- TODO: decide how we want this to work? Just send results plus config from spacy evaluate in a separate command/script? -->
|
<!-- TODO: link to WandB logger, explain that it's built-in but that you can also do other cool stuff with WandB? And then include example project (still need to decide what we want to do here) -->
|
||||||
|
|
|
@ -493,6 +493,39 @@ you prefer.
|
||||||
| `i` | Index of the current match (`matches[i]`). ~~int~~ |
|
| `i` | Index of the current match (`matches[i]`). ~~int~~ |
|
||||||
| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
|
| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
|
||||||
|
|
||||||
|
### Creating spans from matches {#matcher-spans}
|
||||||
|
|
||||||
|
Creating [`Span`](/api/span) objects from the returned matches is a very common
|
||||||
|
use case. spaCy makes this easy by giving you access to the `start` and `end`
|
||||||
|
token of each match, which you can use to construct a new span with an optional
|
||||||
|
label. As of spaCy v3.0, you can also set `as_spans=True` when calling the
|
||||||
|
matcher on a `Doc`, which will return a list of [`Span`](/api/span) objects
|
||||||
|
using the `match_id` as the span label.
|
||||||
|
|
||||||
|
```python
|
||||||
|
### {executable="true"}
|
||||||
|
import spacy
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.tokens import Span
|
||||||
|
|
||||||
|
nlp = spacy.blank("en")
|
||||||
|
matcher = Matcher(nlp.vocab)
|
||||||
|
matcher.add("PERSON", [[{"lower": "barack"}, {"lower": "obama"}]])
|
||||||
|
doc = nlp("Barack Obama was the 44th president of the United States")
|
||||||
|
|
||||||
|
# 1. Return (match_id, start, end) tuples
|
||||||
|
matches = matcher(doc)
|
||||||
|
for match_id, start, end in matches:
|
||||||
|
# Create the matched span and assign the match_id as a label
|
||||||
|
span = Span(doc, start, end, label=match_id)
|
||||||
|
print(span.text, span.label_)
|
||||||
|
|
||||||
|
# 2. Return Span objects directly
|
||||||
|
matches = matcher(doc, as_spans=True)
|
||||||
|
for span in matches:
|
||||||
|
print(span.text, span.label_)
|
||||||
|
```
|
||||||
|
|
||||||
### Using custom pipeline components {#matcher-pipeline}
|
### Using custom pipeline components {#matcher-pipeline}
|
||||||
|
|
||||||
Let's say your data also contains some annoying pre-processing artifacts, like
|
Let's say your data also contains some annoying pre-processing artifacts, like
|
||||||
|
@ -823,15 +856,6 @@ for token in doc:
|
||||||
print(token.text, token._.is_hashtag)
|
print(token.text, token._.is_hashtag)
|
||||||
```
|
```
|
||||||
|
|
||||||
To process a stream of social media posts, we can use
|
|
||||||
[`Language.pipe`](/api/language#pipe), which will return a stream of `Doc`
|
|
||||||
objects that we can pass to [`Matcher.pipe`](/api/matcher#pipe).
|
|
||||||
|
|
||||||
```python
|
|
||||||
docs = nlp.pipe(LOTS_OF_TWEETS)
|
|
||||||
matches = matcher.pipe(docs)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Efficient phrase matching {#phrasematcher}
|
## Efficient phrase matching {#phrasematcher}
|
||||||
|
|
||||||
If you need to match large terminology lists, you can also use the
|
If you need to match large terminology lists, you can also use the
|
||||||
|
|
|
@ -605,6 +605,68 @@ to your Python file. Before loading the config, spaCy will import the
|
||||||
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
$ python -m spacy train config.cfg --output ./output --code ./functions.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Example: Custom logging function {#custom-logging}
|
||||||
|
|
||||||
|
During training, the results of each step are passed to a logger function. By
|
||||||
|
default, these results are written to the console with the
|
||||||
|
[`ConsoleLogger`](/api/top-level#ConsoleLogger). There is also built-in support
|
||||||
|
for writing the log files to [Weights & Biases](https://www.wandb.com/) with the
|
||||||
|
[`WandbLogger`](/api/top-level#WandbLogger). The logger function receives a
|
||||||
|
**dictionary** with the following keys:
|
||||||
|
|
||||||
|
| Key | Value |
|
||||||
|
| -------------- | ---------------------------------------------------------------------------------------------- |
|
||||||
|
| `epoch` | How many passes over the data have been completed. ~~int~~ |
|
||||||
|
| `step` | How many steps have been completed. ~~int~~ |
|
||||||
|
| `score` | The main score from the last evaluation, measured on the dev set. ~~float~~ |
|
||||||
|
| `other_scores` | The other scores from the last evaluation, measured on the dev set. ~~Dict[str, Any]~~ |
|
||||||
|
| `losses` | The accumulated training losses, keyed by component name. ~~Dict[str, float]~~ |
|
||||||
|
| `checkpoints` | A list of previous results, where each result is a (score, step, epoch) tuple. ~~List[Tuple]~~ |
|
||||||
|
|
||||||
|
You can easily implement and plug in your own logger that records the training
|
||||||
|
results in a custom way, or sends them to an experiment management tracker of
|
||||||
|
your choice. In this example, the function `my_custom_logger.v1` writes the
|
||||||
|
tabular results to a file:
|
||||||
|
|
||||||
|
> ```ini
|
||||||
|
> ### config.cfg (excerpt)
|
||||||
|
> [training.logger]
|
||||||
|
> @loggers = "my_custom_logger.v1"
|
||||||
|
> log_path = "my_file.tab"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```python
|
||||||
|
### functions.py
|
||||||
|
from typing import Tuple, Callable, Dict, Any
|
||||||
|
import spacy
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
@spacy.registry.loggers("my_custom_logger.v1")
|
||||||
|
def custom_logger(log_path):
|
||||||
|
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
|
||||||
|
with Path(log_path).open("w") as file_:
|
||||||
|
file_.write("step\\t")
|
||||||
|
file_.write("score\\t")
|
||||||
|
for pipe in nlp.pipe_names:
|
||||||
|
file_.write(f"loss_{pipe}\\t")
|
||||||
|
file_.write("\\n")
|
||||||
|
|
||||||
|
def log_step(info: Dict[str, Any]):
|
||||||
|
with Path(log_path).open("a") as file_:
|
||||||
|
file_.write(f"{info['step']}\\t")
|
||||||
|
file_.write(f"{info['score']}\\t")
|
||||||
|
for pipe in nlp.pipe_names:
|
||||||
|
file_.write(f"{info['losses'][pipe]}\\t")
|
||||||
|
file_.write("\\n")
|
||||||
|
|
||||||
|
def finalize():
|
||||||
|
pass
|
||||||
|
|
||||||
|
return log_step, finalize
|
||||||
|
|
||||||
|
return setup_logger
|
||||||
|
```
|
||||||
|
|
||||||
#### Example: Custom batch size schedule {#custom-code-schedule}
|
#### Example: Custom batch size schedule {#custom-code-schedule}
|
||||||
|
|
||||||
For example, let's say you've implemented your own batch size schedule to use
|
For example, let's say you've implemented your own batch size schedule to use
|
||||||
|
|
|
@ -389,6 +389,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
|
||||||
| `GoldParse` | [`Example`](/api/example) |
|
| `GoldParse` | [`Example`](/api/example) |
|
||||||
| `GoldCorpus` | [`Corpus`](/api/corpus) |
|
| `GoldCorpus` | [`Corpus`](/api/corpus) |
|
||||||
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
|
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
|
||||||
|
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
|
||||||
| `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
|
| `spacy init-model` | [`spacy init model`](/api/cli#init-model) |
|
||||||
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
|
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
|
||||||
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
|
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user