Update docstrings, docs and types

Ines Montani 2020-07-29 11:36:42 +02:00
parent 7adffc5361
commit e0ffe36e79
53 changed files with 821 additions and 422 deletions

View File

@@ -1,7 +1,15 @@
+from typing import Union, List, Iterable, Iterator, TYPE_CHECKING
+from pathlib import Path
 import random
 from .. import util
 from .example import Example
 from ..tokens import DocBin, Doc
+from ..vocab import Vocab
+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from ..language import Language  # noqa: F401
 class Corpus:
@@ -11,20 +19,23 @@ class Corpus:
     DOCS: https://spacy.io/api/corpus
     """
-    def __init__(self, train_loc, dev_loc, limit=0):
+    def __init__(
+        self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0
+    ) -> None:
         """Create a Corpus.
         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
-        limit (int): Max. number of examples returned
-        RETURNS (Corpus): The newly created object.
+        limit (int): Max. number of examples returned.
+        DOCS: https://spacy.io/api/corpus#init
         """
         self.train_loc = train_loc
         self.dev_loc = dev_loc
         self.limit = limit
     @staticmethod
-    def walk_corpus(path):
+    def walk_corpus(path: Union[str, Path]) -> List[Path]:
         path = util.ensure_path(path)
         if not path.is_dir():
             return [path]
@@ -43,7 +54,9 @@ class Corpus:
                 locs.append(path)
         return locs
-    def _make_example(self, nlp, reference, gold_preproc):
+    def _make_example(
+        self, nlp: "Language", reference: Doc, gold_preproc: bool
+    ) -> Example:
         if gold_preproc or reference.has_unknown_spaces:
             return Example(
                 Doc(
@@ -56,7 +69,9 @@ class Corpus:
         else:
             return Example(nlp.make_doc(reference.text), reference)
-    def make_examples(self, nlp, reference_docs, max_length=0):
+    def make_examples(
+        self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
+    ) -> Iterator[Example]:
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
@@ -69,7 +84,9 @@ class Corpus:
                     elif max_length == 0 or len(ref_sent) < max_length:
                         yield self._make_example(nlp, ref_sent.as_doc(), False)
-    def make_examples_gold_preproc(self, nlp, reference_docs):
+    def make_examples_gold_preproc(
+        self, nlp: "Language", reference_docs: Iterable[Doc]
+    ) -> Iterator[Example]:
         for reference in reference_docs:
             if reference.is_sentenced:
                 ref_sents = [sent.as_doc() for sent in reference.sents]
@@ -80,7 +97,9 @@ class Corpus:
                 if len(eg.x):
                     yield eg
-    def read_docbin(self, vocab, locs):
+    def read_docbin(
+        self, vocab: Vocab, locs: Iterable[Union[str, Path]]
+    ) -> Iterator[Doc]:
         """ Yield training examples as example dicts """
         i = 0
         for loc in locs:
@@ -96,8 +115,14 @@ class Corpus:
                         if self.limit >= 1 and i >= self.limit:
                             break
-    def count_train(self, nlp):
-        """Returns count of words in train examples"""
+    def count_train(self, nlp: "Language") -> int:
+        """Returns count of words in train examples.
+        nlp (Language): The current nlp object.
+        RETURNS (int): The word count.
+        DOCS: https://spacy.io/api/corpus#count_train
+        """
         n = 0
         i = 0
         for example in self.train_dataset(nlp):
@@ -108,8 +133,25 @@ class Corpus:
         return n
     def train_dataset(
-        self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs
-    ):
+        self,
+        nlp: "Language",
+        *,
+        shuffle: bool = True,
+        gold_preproc: bool = False,
+        max_length: int = 0
+    ) -> Iterator[Example]:
+        """Yield examples from the training data.
+        nlp (Language): The current nlp object.
+        shuffle (bool): Whether to shuffle the examples.
+        gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
+        max_length (int): Maximum document length. Longer documents will be
+            split into sentences, if sentence boundaries are available. 0 for
+            no limit.
+        YIELDS (Example): The examples.
+        DOCS: https://spacy.io/api/corpus#train_dataset
+        """
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
         if gold_preproc:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
@@ -120,7 +162,17 @@ class Corpus:
             random.shuffle(examples)
         yield from examples
-    def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
+    def dev_dataset(
+        self, nlp: "Language", *, gold_preproc: bool = False
+    ) -> Iterator[Example]:
+        """Yield examples from the development data.
+        nlp (Language): The current nlp object.
+        gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
+        YIELDS (Example): The examples.
+        DOCS: https://spacy.io/api/corpus#dev_dataset
+        """
         ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
         if gold_preproc:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
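For orientation (not part of the commit), a minimal usage sketch of the typed `Corpus` API above, assuming `./train.spacy` and `./dev.spacy` exist:

```python
# Minimal sketch of the Corpus API with the new type hints; the paths are placeholders.
import spacy
from spacy.gold import Corpus

nlp = spacy.blank("en")
corpus = Corpus("./train.spacy", "./dev.spacy", limit=0)

# Both methods stream Example objects, matching the Iterator[Example] annotations.
train_examples = corpus.train_dataset(nlp, shuffle=True, gold_preproc=False, max_length=0)
dev_examples = corpus.dev_dataset(nlp, gold_preproc=False)
print("training words:", corpus.count_train(nlp))
```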

View File

@@ -21,7 +21,6 @@ class Lemmatizer:
         lookups (Lookups): The lookups object containing the (optional) tables
             "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
-        RETURNS (Lemmatizer): The newly constructed object.
         """
         self.lookups = lookups if lookups is not None else Lookups()
         self.is_base_form = is_base_form

View File

@@ -52,8 +52,6 @@ class Lookups:
     def __init__(self) -> None:
         """Initialize the Lookups object.
-        RETURNS (Lookups): The newly created object.
         DOCS: https://spacy.io/api/lookups#init
         """
         self._tables = {}
@@ -202,7 +200,6 @@ class Table(OrderedDict):
         data (dict): The dictionary.
         name (str): Optional table name for reference.
-        RETURNS (Table): The newly created object.
         DOCS: https://spacy.io/api/lookups#table.from_dict
         """
@@ -215,7 +212,6 @@ class Table(OrderedDict):
         name (str): Optional table name for reference.
         data (dict): Initial data, used to hint Bloom Filter.
-        RETURNS (Table): The newly created object.
         DOCS: https://spacy.io/api/lookups#table.init
         """

View File

@@ -36,7 +36,6 @@ cdef class DependencyMatcher:
         vocab (Vocab): The vocabulary object, which must be shared with the
             documents the matcher will operate on.
-        RETURNS (DependencyMatcher): The newly constructed object.
         """
         size = 20
         # TODO: make matcher work with validation

View File

@@ -37,7 +37,6 @@ cdef class Matcher:
         vocab (Vocab): The vocabulary object, which must be shared with the
             documents the matcher will operate on.
-        RETURNS (Matcher): The newly constructed object.
         """
         self._extra_predicates = []
         self._patterns = {}

View File

@@ -32,7 +32,6 @@ cdef class PhraseMatcher:
         vocab (Vocab): The shared vocabulary.
         attr (int / str): Token attribute to match on.
         validate (bool): Perform additional validation when patterns are added.
-        RETURNS (PhraseMatcher): The newly constructed object.
         DOCS: https://spacy.io/api/phrasematcher#init
         """

View File

@@ -86,7 +86,6 @@ class EntityRuler:
         overwrite_ents (bool): If existing entities are present, e.g. entities
             added by the model, overwrite them by matches if necessary.
         ent_id_sep (str): Separator used internally for entity IDs.
-        RETURNS (EntityRuler): The newly constructed object.
         DOCS: https://spacy.io/api/entityruler#init
         """

View File

@@ -72,7 +72,6 @@ class Scorer:
     def __init__(self, nlp=None, **cfg):
         """Initialize the Scorer.
-        RETURNS (Scorer): The newly created object.
         DOCS: https://spacy.io/api/scorer#init
         """

View File

@@ -97,7 +97,6 @@ cdef class StringStore:
         """Create the StringStore.
         strings (iterable): A sequence of unicode strings to add to the store.
-        RETURNS (StringStore): The newly constructed object.
         """
         self.mem = Pool()
         self._map = PreshMap()

View File

@@ -50,7 +50,6 @@ cdef class Tokenizer:
             recognised as tokens.
         url_match (callable): A boolean function matching strings to be
             recognised as tokens after considering prefixes and suffixes.
-        RETURNS (Tokenizer): The newly constructed object.
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)

View File

@@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     """Retokenize the document, such that the token at
     `doc[token_index]` is split into tokens with the orth 'orths'
     token_index(int): token index of the token to split.
+    orths: IDs of the verbatim text content of the tokens to create
     **attributes: Attributes to assign to each of the newly created tokens. By default,
         attributes are inherited from the original token.
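`_split` is an internal helper; a hedged sketch (not part of the diff) of the public retokenizer API that drives it:

```python
# Sketch: splitting one token into two via Doc.retokenize(), which calls _split internally.
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in NewYork")
with doc.retokenize() as retokenizer:
    # "New" attaches to "York" (subtoken index 1), "York" attaches to "in".
    heads = [(doc[3], 1), doc[2]]
    retokenizer.split(doc[3], ["New", "York"], heads=heads)
print([t.text for t in doc])  # ['I', 'live', 'in', 'New', 'York']
```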

View File

@@ -1,10 +1,12 @@
+from typing import Iterable, Iterator
 import numpy
 import zlib
 import srsly
 from thinc.api import NumpyOps
+from .doc import Doc
+from ..vocab import Vocab
 from ..compat import copy_reg
-from ..tokens import Doc
 from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
@@ -44,13 +46,18 @@ class DocBin:
     document from the DocBin.
     """
-    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
+    def __init__(
+        self,
+        attrs: Iterable[str] = ALL_ATTRS,
+        store_user_data: bool = False,
+        docs=Iterable[Doc],
+    ) -> None:
         """Create a DocBin object to hold serialized annotations.
-        attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
-            always serialized, so they're not required. Defaults to None.
+        attrs (Iterable[str]): List of attributes to serialize. 'orth' and
+            'spacy' are always serialized, so they're not required.
         store_user_data (bool): Whether to include the `Doc.user_data`.
-        RETURNS (DocBin): The newly constructed object.
+        docs (Iterable[Doc]): Docs to add.
         DOCS: https://spacy.io/api/docbin#init
         """
@@ -68,11 +75,11 @@ class DocBin:
         for doc in docs:
             self.add(doc)
-    def __len__(self):
+    def __len__(self) -> int:
         """RETURNS: The number of Doc objects added to the DocBin."""
         return len(self.tokens)
-    def add(self, doc):
+    def add(self, doc: Doc) -> None:
         """Add a Doc's annotations to the DocBin for serialization.
         doc (Doc): The Doc object to add.
@@ -100,7 +107,7 @@ class DocBin:
         if self.store_user_data:
             self.user_data.append(srsly.msgpack_dumps(doc.user_data))
-    def get_docs(self, vocab):
+    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
         """Recover Doc objects from the annotations, using the given vocab.
         vocab (Vocab): The shared vocab.
@@ -125,7 +132,7 @@ class DocBin:
             doc.user_data.update(user_data)
             yield doc
-    def merge(self, other):
+    def merge(self, other: "DocBin") -> None:
         """Extend the annotations of this DocBin with the annotations from
         another. Will raise an error if the pre-defined attrs of the two
         DocBins don't match.
@@ -144,7 +151,7 @@ class DocBin:
         if self.store_user_data:
             self.user_data.extend(other.user_data)
-    def to_bytes(self):
+    def to_bytes(self) -> bytes:
         """Serialize the DocBin's annotations to a bytestring.
         RETURNS (bytes): The serialized DocBin.
@@ -156,7 +163,6 @@ class DocBin:
         lengths = [len(tokens) for tokens in self.tokens]
         tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
         spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
         msg = {
             "version": self.version,
             "attrs": self.attrs,
@@ -171,7 +177,7 @@ class DocBin:
             msg["user_data"] = self.user_data
         return zlib.compress(srsly.msgpack_dumps(msg))
-    def from_bytes(self, bytes_data):
+    def from_bytes(self, bytes_data: bytes) -> "DocBin":
         """Deserialize the DocBin's annotations from a bytestring.
         bytes_data (bytes): The data to load from.
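For reference, a minimal round-trip sketch (not part of the diff) of the typed `DocBin` API; the attribute list is just an example:

```python
# Sketch: serialize two Docs to bytes and restore them with a shared vocab.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Hello world."), nlp("This is a test.")]
doc_bin = DocBin(attrs=["ORTH", "TAG"], store_user_data=False, docs=docs)

data = doc_bin.to_bytes()             # bytes, per the new return annotation
restored = DocBin().from_bytes(data)  # returns a DocBin
assert len(restored) == 2
new_docs = list(restored.get_docs(nlp.vocab))  # Iterator[Doc]
```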

View File

@@ -173,7 +173,6 @@ cdef class Doc:
             words. True means that the word is followed by a space, False means
             it is not. If `None`, defaults to `[True]*len(words)`
         user_data (dict or None): Optional extra data to attach to the Doc.
-        RETURNS (Doc): The newly constructed object.
         DOCS: https://spacy.io/api/doc#init
         """

View File

@@ -94,7 +94,6 @@ cdef class Span:
         kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation
             of the span.
-        RETURNS (Span): The newly constructed object.
         DOCS: https://spacy.io/api/span#init
         """

View File

@@ -58,7 +58,6 @@ cdef class Vectors:
         data (numpy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
         name (str): A name to identify the vectors table.
-        RETURNS (Vectors): The newly created object.
         DOCS: https://spacy.io/api/vectors#init
         """

View File

@@ -74,7 +74,6 @@ cdef class Vocab:
         lookups (Lookups): Container for large lookup tables and dictionaries.
         oov_prob (float): Default OOV probability.
         vectors_name (unicode): Optional name to identify the vectors table.
-        RETURNS (Vocab): The newly constructed object.
         """
         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
         if lookups in (None, True, False):

View File

@@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library
 source: spacy/ml/models
 menu:
   - ['Tok2Vec', 'tok2vec']
+  - ['Transformers', 'transformers']
   - ['Parser & NER', 'parser']
   - ['Text Classification', 'textcat']
   - ['Entity Linking', 'entitylinker']
@@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to
 [`registry`](/api/top-level#registry),
 [custom models](/usage/training#custom-models) usage etc.
-## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}}
+## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}
 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
@@ -21,12 +22,14 @@ TODO: intro and how architectures work, link to
 ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
+## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
+### spacy-transformers.TransformerModel.v1 {#TransformerModel}
 ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"}
 ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser}
+<!-- TODO: intro -->
 > #### Example Config
 >
 > ```ini

View File

@@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format.
 Create a `Corpus`. The input data can be a file or a directory of files.
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> ```
 | Name | Type | Description |
-| ----------- | ------------ | ---------------------------------------------------------------- |
+| ------- | ------------ | ---------------------------------------------------------------- |
 | `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). |
 | `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). |
-| `limit` | int | Maximum number of examples returned. |
+| `limit` | int | Maximum number of examples returned. `0` for no limit (default). |
-| **RETURNS** | `Corpus` | The newly constructed object. |
-<!-- TODO: document remaining methods / decide which to document -->
-## Corpus.walk_corpus {#walk_corpus tag="staticmethod"}
-## Corpus.make_examples {#make_examples tag="method"}
-## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"}
-## Corpus.read_docbin {#read_docbin tag="method"}
-## Corpus.count_train {#count_train tag="method"}
 ## Corpus.train_dataset {#train_dataset tag="method"}
+Yield examples from the training data.
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> train_data = corpus.train_dataset(nlp)
+> ```
+| Name | Type | Description |
+| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `nlp` | `Language` | The current `nlp` object. |
+| _keyword-only_ | | |
+| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. |
+| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. |
+| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default). |
+| **YIELDS** | `Example` | The examples. |
 ## Corpus.dev_dataset {#dev_dataset tag="method"}
+Yield examples from the development data.
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> dev_data = corpus.dev_dataset(nlp)
+> ```
+| Name | Type | Description |
+| -------------- | ---------- | ---------------------------------------------------------------------------- |
+| `nlp` | `Language` | The current `nlp` object. |
+| _keyword-only_ | | |
+| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. |
+| **YIELDS** | `Example` | The examples. |
+## Corpus.count_train {#count_train tag="method"}
+Get the word count of all training examples.
+> #### Example
+>
+> ```python
+> from spacy.gold import Corpus
+> import spacy
+>
+> corpus = Corpus("./train.spacy", "./dev.spacy")
+> nlp = spacy.blank("en")
+> word_count = corpus.count_train(nlp)
+> ```
+| Name | Type | Description |
+| ----------- | ---------- | ------------------------- |
+| `nlp` | `Language` | The current `nlp` object. |
+| **RETURNS** | int | The word count. |
+<!-- TODO: document remaining methods? / decide which to document -->

View File

@@ -88,12 +88,11 @@ Create a `Token` object from a `TokenC*` pointer.
 > ```
 | Name | Type | Description |
-| ----------- | --------- | ------------------------------------------------------------ |
+| -------- | --------- | ------------------------------------------------------------ |
 | `vocab` | `Vocab` | A reference to the shared `Vocab`. |
 | `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. |
 | `offset` | `int` | The offset of the token within the document. |
 | `doc` | `Doc` | The parent document. |
-| **RETURNS** | `Token` | The newly constructed object. |
 ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}

View File

@@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
 ## DependencyParser.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
 > #### Example

View File

@@ -31,11 +31,10 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 > ```
 | Name | Type | Description |
-| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | A storage container for lexical types. |
 | `words` | iterable | A list of strings to add to the container. |
 | `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. |
-| **RETURNS** | `Doc` | The newly constructed object. |
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}

View File

@@ -45,10 +45,10 @@ Create a `DocBin` object to hold serialized annotations.
 > ```
 | Argument | Type | Description |
-| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
+| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
 | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
-| **RETURNS** | `DocBin` | The newly constructed object. |
+| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. |
 ## DocBin.\_\_len\_\_ {#len tag="method"}

View File

@@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 ## EntityLinker.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
 method, a knowledge base should have been defined with
 [`set_kb`](/api/entitylinker#set_kb).

View File

@@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
 ## EntityRecognizer.begin_training {#begin_training tag="method"}
-Initialize the pipe for training, using data examples if available. Return an
+Initialize the pipe for training, using data examples if available. Returns an
 [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
 > #### Example

View File

@@ -37,7 +37,6 @@ both documents.
 | `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. |
 | _keyword-only_ | | |
 | `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. |
-| **RETURNS** | `Example` | The newly constructed object. |
 ## Example.from_dict {#from_dict tag="classmethod"}

View File

@@ -28,10 +28,9 @@ Create the knowledge base.
 > ```
 | Name | Type | Description |
-| ---------------------- | --------------- | ---------------------------------------- |
+| ---------------------- | ------- | ---------------------------------------- |
 | `vocab` | `Vocab` | A `Vocab` object. |
 | `entity_vector_length` | int | Length of the fixed-size entity vectors. |
-| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
@@ -255,7 +254,6 @@ but instead these objects are returned by the
 | `entity_freq` | float | The entity frequency as recorded in the KB. |
 | `alias_hash` | int | The hash of the textual mention or alias. |
 | `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
-| **RETURNS** | `Candidate` | The newly constructed object. |
 ## Candidate attributes {#candidate_attributes}

View File

@@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add
 your own processing pipeline components that take a `Doc` object, modify it and
 return it.
+## Language.\_\_init\_\_ {#init tag="method"}
+Initialize a `Language` object.
+> #### Example
+>
+> ```python
+> # Construction from subclass
+> from spacy.lang.en import English
+> nlp = English()
+>
+> # Construction from scratch
+> from spacy.vocab import Vocab
+> from spacy.language import Language
+> nlp = Language(Vocab())
+> ```
+| Name | Type | Description |
+| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
+| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
+| _keyword-only_ | | |
+| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
+| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
+| `create_tokenizer` | `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
+## Language.from_config {#from_config tag="classmethod"}
+Create a `Language` object from a loaded config. Will set up the tokenizer and
+language data, add pipeline components based on the pipeline and components
+define in the config and validate the results. If no config is provided, the
+default config of the given language is used. This is also how spaCy loads a
+model under the hood based on its [`config.cfg`](/api/data-formats#config).
+> #### Example
+>
+> ```python
+> from thinc.api import Config
+> from spacy.language import Language
+>
+> config = Config().from_disk("./config.cfg")
+> nlp = Language.from_config(config)
+> ```
+| Name | Type | Description |
+| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
+| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
+| _keyword-only_ | | |
+| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
+| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
+| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
+| **RETURNS** | `Language` | The initialized object. |
 ## Language.component {#component tag="classmethod" new="3"}
 Register a custom pipeline component under a given name. This allows
@@ -101,57 +153,6 @@ examples, see the
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
-## Language.\_\_init\_\_ {#init tag="method"}
-Initialize a `Language` object.
-> #### Example
->
-> ```python
-> from spacy.vocab import Vocab
-> from spacy.language import Language
-> nlp = Language(Vocab())
->
-> from spacy.lang.en import English
-> nlp = English()
-> ```
-| Name | Type | Description |
-| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
-| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
-| _keyword-only_ | | |
-| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
-| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
-| `create_tokenizer` | `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
-| **RETURNS** | `Language` | The newly constructed object. |
-## Language.from_config {#from_config tag="classmethod"}
-Create a `Language` object from a loaded config. Will set up the tokenizer and
-language data, add pipeline components based on the pipeline and components
-define in the config and validate the results. If no config is provided, the
-default config of the given language is used. This is also how spaCy loads a
-model under the hood based on its [`config.cfg`](/api/data-formats#config).
-> #### Example
->
-> ```python
-> from thinc.api import Config
-> from spacy.language import Language
->
-> config = Config().from_disk("./config.cfg")
-> nlp = Language.from_config(config)
-> ```
-| Name | Type | Description |
-| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
-| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
-| _keyword-only_ | | |
-| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
-| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
-| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
-| **RETURNS** | `Language` | The initialized object. |
 ## Language.\_\_call\_\_ {#call tag="method"}
 Apply the pipeline to some text. The text can span multiple sentences, and can
@@ -165,10 +166,12 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 > ```
 | Name | Type | Description |
-| ----------- | ----------- | --------------------------------------------------------------------------------- |
+| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
 | `text` | str | The text to be processed. |
+| _keyword-only_ | | |
 | `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| **RETURNS** | `Doc` | A container for accessing the annotations. |
+| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
+| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. |
 ## Language.pipe {#pipe tag="method"}
@@ -184,15 +187,57 @@ more efficient than processing texts one-by-one.
 > ```
 | Name | Type | Description |
-| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `texts` | `Iterable[str]` | A sequence of strings. |
+| _keyword-only_ | | |
 | `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
 | `batch_size` | int | The number of texts to buffer. |
 | `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
+| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. |
+| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
 | `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
 | **YIELDS** | `Doc` | Documents in the order of the original text. |
+## Language.begin_training {#begin_training tag="method"}
+Initialize the pipe for training, using data examples if available. Returns an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+> #### Example
+>
+> ```python
+> optimizer = nlp.begin_training(get_examples)
+> ```
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
+| _keyword-only_ | | |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+## Language.resume_training {#resume_training tag="method,experimental" new="3"}
+Continue training a pretrained model. Create and return an optimizer, and
+initialize "rehearsal" for any pipeline component that has a `rehearse` method.
+Rehearsal is used to prevent models from "forgetting" their initialized
+"knowledge". To perform rehearsal, collect samples of text you want the models
+to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with
+a batch of [Example](/api/example) objects.
+> #### Example
+>
+> ```python
+> optimizer = nlp.resume_training()
+> nlp.rehearse(examples, sgd=optimizer)
+> ```
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| _keyword-only_ | | |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
 ## Language.update {#update tag="method"}
 Update the models in the pipeline.
@@ -207,13 +252,35 @@ Update the models in the pipeline.
 > ```
 | Name | Type | Description |
-| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- |
+| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
 | `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
 | _keyword-only_ | | |
 | `drop` | float | The dropout rate. |
-| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
 | `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
+| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+## Language.rehearse {#rehearse tag="method,experimental"}
+Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
+current model to make predictions similar to an initial model, to try to address
+the "catastrophic forgetting" problem. This feature is experimental.
+> #### Example
+>
+> ```python
+> optimizer = nlp.resume_training()
+> losses = nlp.rehearse(examples, sgd=optimizer)
+> ```
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
 | **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
 ## Language.evaluate {#evaluate tag="method"}
@@ -228,32 +295,14 @@ Evaluate a model's pipeline components.
 > ```
 | Name | Type | Description |
-| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- |
+| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ |
 | `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
 | `verbose` | bool | Print debugging information. |
 | `batch_size` | int | The batch size to use. |
 | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
+| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
-| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. |
+| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
-## Language.begin_training {#begin_training tag="method"}
-Allocate models, pre-process training data and acquire an
-[`Optimizer`](https://thinc.ai/docs/api-optimizers).
-> #### Example
->
-> ```python
-> optimizer = nlp.begin_training(get_examples)
-> ```
-| Name | Type | Description |
-| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ |
-| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
-| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. |
-| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
-| `**cfg` | - | Config parameters (sent to all components). |
-| **RETURNS** | `Optimizer` | An optimizer. |
 ## Language.use_params {#use_params tag="contextmanager, method"}
@@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use
 | ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `factory_name` | str | Name of the registered component factory. |
 | `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
+| _keyword-only_ | | |
 | `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
 | `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
 | **RETURNS** | callable | The pipeline component. |
@@ -419,9 +469,12 @@ Replace a component in the pipeline.
 > ```
 | Name | Type | Description |
-| ----------- | -------- | --------------------------------- |
+| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `name` | str | Name of the component to replace. |
 | `component` | callable | The pipeline component to insert. |
+| _keyword-only_ | | |
+| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. |
+| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
 ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@@ -493,7 +546,8 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
 </Infobox>
 | Name | Type | Description |
-| ----------- | --------------- | ------------------------------------------------------------------------------------ |
+| -------------- | --------------- | ------------------------------------------------------------------------------------ |
+| _keyword-only_ | | |
 | `disable` | str / list | Name(s) of pipeline components to disable. |
 | `enable` | str / list | Names(s) of pipeline components that will not be disabled. |
 | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
@@ -767,8 +821,8 @@ serialization by passing in the string names via the `exclude` argument.
 The `FactoryMeta` contains the information about the component and its default
 provided by the [`@Language.component`](/api/language#component) or
 [`@Language.factory`](/api/language#factory) decorator. It's created whenever a
-component is added to the pipeline and stored on the `Language` class for each
-component instance and factory instance.
+component is defined and stored on the `Language` class for each component
+instance and factory instance.
 | Name | Type | Description |
 | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
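As a hedged illustration (not part of the commit) of the decorators mentioned above, registering a factory is what creates and stores the `FactoryMeta`; the component and setting names here are made up:

```python
# Sketch: @Language.factory records a FactoryMeta entry for "my_component".
from spacy.language import Language

@Language.factory("my_component", default_config={"some_setting": True})
def create_my_component(nlp, name, some_setting: bool):
    def my_component(doc):
        return doc  # no-op component for illustration
    return my_component
```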

View File

@@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized.
 | Name | Type | Description |
 | -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
 | `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
-| **RETURNS** | `Lemmatizer` | The newly created object. |
 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

View File

@@ -14,10 +14,9 @@ lemmatization depends on the part-of-speech tag).
 Create a `Lexeme` object.
 | Name | Type | Description |
-| ----------- | -------- | ----------------------------- |
+| ------- | ------- | -------------------------- |
 | `vocab` | `Vocab` | The parent vocabulary. |
 | `orth` | int | The orth id of the lexeme. |
-| **RETURNS** | `Lexeme` | The newly constructed object. |
 ## Lexeme.set_flag {#set_flag tag="method"}

View File

@@ -237,9 +237,8 @@ Initialize a new table.
 > ```
 | Name | Type | Description |
-| ----------- | ------- | ---------------------------------- |
+| ------ | ---- | ---------------------------------- |
 | `name` | str | Optional table name for reference. |
-| **RETURNS** | `Table` | The newly constructed object. |
 ### Table.from_dict {#table.from_dict tag="classmethod"}

View File

@@ -20,10 +20,9 @@ string where an integer is expected) or unexpected property names.
 > ```
 | Name | Type | Description |
-| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- |
+| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
 | `validate` <Tag variant="new">2.1</Tag> | bool | Validate all patterns added to this matcher. |
-| **RETURNS** | `Matcher` | The newly constructed object. |
 ## Matcher.\_\_call\_\_ {#call tag="method"}

View File

@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx
Stores a single morphological analysis. Stores a single morphological analysis.
## MorphAnalysis.\_\_init\_\_ {#init tag="method"} ## MorphAnalysis.\_\_init\_\_ {#init tag="method"}
Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of
@ -22,11 +21,9 @@ morphological features.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | ----------------------------- | | ---------- | ------------------ | --------------------------- |
| `vocab` | `Vocab` | The vocab. | | `vocab` | `Vocab` | The vocab. |
| `features` | `Union[Dict, str]` | The morphological features. | | `features` | `Union[Dict, str]` | The morphological features. |
| **RETURNS** | `MorphAnalysis` | The newly constructed object. |
## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} ## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"}
@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis.
| ----------- | ----- | ------------------------------------- | | ----------- | ----- | ------------------------------------- |
| **RETURNS** | `str` | A feature/value pair in the analysis. | | **RETURNS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} ## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"}
Iterate over the feature/value pairs in the analysis. Iterate over the feature/value pairs in the analysis.
@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis.
| ---------- | ----- | ------------------------------------- | | ---------- | ----- | ------------------------------------- |
| **YIELDS** | `str` | A feature/value pair in the analysis. | | **YIELDS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_len\_\_ {#len tag="method"} ## MorphAnalysis.\_\_len\_\_ {#len tag="method"}
Returns the number of features in the analysis. Returns the number of features in the analysis.
@ -78,7 +73,6 @@ Returns the number of features in the analysis.
| ----------- | ----- | --------------------------------------- | | ----------- | ----- | --------------------------------------- |
| **RETURNS** | `int` | The number of features in the analysis. | | **RETURNS** | `int` | The number of features in the analysis. |
## MorphAnalysis.\_\_str\_\_ {#str tag="method"} ## MorphAnalysis.\_\_str\_\_ {#str tag="method"}
Returns the morphological analysis in the UD FEATS string format. Returns the morphological analysis in the UD FEATS string format.
@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----- | ---------------------------------| | ----------- | ----- | -------------------------------- |
| **RETURNS** | `str` | The analysis in UD FEATS format. | | **RETURNS** | `str` | The analysis in UD FEATS format. |
## MorphAnalysis.get {#get tag="method"} ## MorphAnalysis.get {#get tag="method"}
Retrieve values for a feature by field. Retrieve values for a feature by field.
@ -109,11 +102,10 @@ Retrieve values for a feature by field.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------ | ----------------------------------- | | ----------- | ------ | ---------------------------------- |
| `field` | `str` | The field to retrieve. | | `field` | `str` | The field to retrieve. |
| **RETURNS** | `list` | A list of the individual features. | | **RETURNS** | `list` | A list of the individual features. |
## MorphAnalysis.to_dict {#to_dict tag="method"} ## MorphAnalysis.to_dict {#to_dict tag="method"}
Produce a dict representation of the analysis, in the same format as the tag Produce a dict representation of the analysis, in the same format as the tag
@ -128,10 +120,9 @@ map.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------ | -----------------------------------------| | ----------- | ------ | ---------------------------------------- |
| **RETURNS** | `dict` | The dict representation of the analysis. | | **RETURNS** | `dict` | The dict representation of the analysis. |
## MorphAnalysis.from_id {#from_id tag="classmethod"} ## MorphAnalysis.from_id {#from_id tag="classmethod"}
Create a morphological analysis from a given hash ID. Create a morphological analysis from a given hash ID.
@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID.
| ------- | ------- | -------------------------------- | | ------- | ------- | -------------------------------- |
| `vocab` | `Vocab` | The vocab. | | `vocab` | `Vocab` | The vocab. |
| `key` | `int` | The hash of the features string. | | `key` | `int` | The hash of the features string. |

View File

@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
## Morphologizer.begin_training {#begin_training tag="method"} ## Morphologizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example

View File

@ -4,12 +4,11 @@ tag: class
source: spacy/morphology.pyx source: spacy/morphology.pyx
--- ---
Store the possible morphological analyses for a language, and index them Store the possible morphological analyses for a language, and index them by
by hash. To save space on each token, tokens only know the hash of their hash. To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated to morphological analysis, so queries of morphological attributes are delegated to
this class. this class.
## Morphology.\_\_init\_\_ {#init tag="method"} ## Morphology.\_\_init\_\_ {#init tag="method"}
Create a Morphology object using the tag map, lemmatizer and exceptions. Create a Morphology object using the tag map, lemmatizer and exceptions.
@ -23,20 +22,17 @@ Create a Morphology object using the tag map, lemmatizer and exceptions.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | | ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
| `strings` | `StringStore` | The string store. | | `strings` | `StringStore` | The string store. |
| `tag_map` | `Dict[str, Dict]` | The tag map. | | `tag_map` | `Dict[str, Dict]` | The tag map. |
| `lemmatizer` | `Lemmatizer` | The lemmatizer. | | `lemmatizer` | `Lemmatizer` | The lemmatizer. |
| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` | | `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` |
| **RETURNS** | `Morphology` | The newly constructed object. |
## Morphology.add {#add tag="method"} ## Morphology.add {#add tag="method"}
Insert a morphological analysis in the morphology table, if not already Insert a morphological analysis in the morphology table, if not already present.
present. The morphological analysis may be provided in the UD FEATS format as a The morphological analysis may be provided in the UD FEATS format as a string or
string or in the tag map dictionary format. Returns the hash of the new in the tag map dictionary format. Returns the hash of the new analysis.
analysis.
> #### Example > #### Example
> >
@ -47,10 +43,9 @@ analysis.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------- | --------------------------- | | ---------- | ------------------ | --------------------------- |
| `features` | `Union[Dict, str]` | The morphological features. | | `features` | `Union[Dict, str]` | The morphological features. |
## Morphology.get {#get tag="method"} ## Morphology.get {#get tag="method"}
> #### Example > #### Example
@ -64,32 +59,29 @@ analysis.
Get the FEATS string for the hash of the morphological analysis. Get the FEATS string for the hash of the morphological analysis.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------ | --------------------------------------- | | ------- | ---- | --------------------------------------- |
| `morph` | int | The hash of the morphological analysis. | | `morph` | int | The hash of the morphological analysis. |
## Morphology.load_tag_map {#load_tag_map tag="method"} ## Morphology.load_tag_map {#load_tag_map tag="method"}
Replace the current tag map with the provided tag map. Replace the current tag map with the provided tag map.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | ------------ | | --------- | ----------------- | ------------ |
| `tag_map` | `Dict[str, Dict]` | The tag map. | | `tag_map` | `Dict[str, Dict]` | The tag map. |
## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} ## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
Replace the current morphological exceptions with the provided exceptions. Replace the current morphological exceptions with the provided exceptions.
| Name | Type | Description | | Name | Type | Description |
| ------------- | ------------------ | ----------------------------- | | ------------- | ----------------- | ----------------------------- |
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | | `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
## Morphology.add_special_case {#add_special_case tag="method"} ## Morphology.add_special_case {#add_special_case tag="method"}
Add a special-case rule to the morphological analyzer. Tokens whose tag and Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
orth match the rule will receive the specified properties. match the rule will receive the specified properties.
> #### Example > #### Example
> >
@ -99,26 +91,23 @@ orth match the rule will receive the specified properties.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | ---------------------------------------------- | | ---------- | ---- | ---------------------------------------------- |
| `tag_str` | str | The fine-grained tag. | | `tag_str` | str | The fine-grained tag. |
| `orth_str` | str | The token text. | | `orth_str` | str | The token text. |
| `attrs` | dict | The features to assign for this token and tag. | | `attrs` | dict | The features to assign for this token and tag. |
## Morphology.exc {#exc tag="property"} ## Morphology.exc {#exc tag="property"}
The current morphological exceptions. The current morphological exceptions.
| Name | Type | Description | | Name | Type | Description |
| ---------- | ----- | --------------------------------------------------- | | ---------- | ---- | --------------------------------------------------- |
| **YIELDS** | dict | The current dictionary of morphological exceptions. | | **YIELDS** | dict | The current dictionary of morphological exceptions. |
## Morphology.lemmatize {#lemmatize tag="method"} ## Morphology.lemmatize {#lemmatize tag="method"}
TODO TODO
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
Convert a string FEATS representation to a dictionary of features and values in Convert a string FEATS representation to a dictionary of features and values in
@ -133,11 +122,10 @@ the same format as the tag map.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | ------------------------------------------------------------- | | ----------- | ---- | ------------------------------------------------------------------ |
| `feats` | str | The morphological features in Universal Dependencies FEATS format. | | `feats` | str | The morphological features in Universal Dependencies FEATS format. |
| **RETURNS** | dict | The morphological features as a dictionary. | | **RETURNS** | dict | The morphological features as a dictionary. |
## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"}
Convert a dictionary of features and values to a string FEATS representation. Convert a dictionary of features and values to a string FEATS representation.
@ -155,7 +143,6 @@ Convert a dictionary of features and values to a string FEATS representation.
| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | | `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. |
| **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | | **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |

View File

@ -36,11 +36,10 @@ be shown.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | | --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. | | `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
| **RETURNS** | `PhraseMatcher` | The newly constructed object. |
## PhraseMatcher.\_\_call\_\_ {#call tag="method"} ## PhraseMatcher.\_\_call\_\_ {#call tag="method"}

View File

@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
## Pipe.begin_training {#begin_training tag="method"} ## Pipe.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example
@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
> >
> ```python > ```python
> pipe = nlp.add_pipe("your_custom_pipe") > pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training() > optimizer = nlp.resume_training()
> losses = pipe.rehearse(examples, sgd=optimizer) > losses = pipe.rehearse(examples, sgd=optimizer)
> ``` > ```

View File

@ -29,9 +29,8 @@ Create a new `Scorer`.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | | `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
| **RETURNS** | `Scorer` | The newly created object. |
## Scorer.score {#score tag="method"} ## Scorer.score {#score tag="method"}

View File

@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
## SentenceRecognizer.begin_training {#begin_training tag="method"} ## SentenceRecognizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example
@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
> >
> ```python > ```python
> senter = nlp.add_pipe("senter") > senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training() > optimizer = nlp.resume_training()
> losses = senter.rehearse(examples, sgd=optimizer) > losses = senter.rehearse(examples, sgd=optimizer)
> ``` > ```

View File

@ -19,14 +19,13 @@ Create a Span object from the slice `doc[start : end]`.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | | -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | `Doc` | The parent document. |
| `start` | int | The index of the first token of the span. | | `start` | int | The index of the first token of the span. |
| `end` | int | The index of the first token after the span. | | `end` | int | The index of the first token after the span. |
| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | | `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. |
| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | | `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
| **RETURNS** | `Span` | The newly constructed object. |
## Span.\_\_getitem\_\_ {#getitem tag="method"} ## Span.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -20,9 +20,8 @@ Create the `StringStore`.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------- | ------------------------------------------ | | --------- | -------- | ------------------------------------------ |
| `strings` | iterable | A sequence of strings to add to the store. | | `strings` | iterable | A sequence of strings to add to the store. |
| **RETURNS** | `StringStore` | The newly constructed object. |
## StringStore.\_\_len\_\_ {#len tag="method"} ## StringStore.\_\_len\_\_ {#len tag="method"}

View File

@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
## Tagger.begin_training {#begin_training tag="method"} ## Tagger.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example
@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
> >
> ```python > ```python
> tagger = nlp.add_pipe("tagger") > tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training() > optimizer = nlp.resume_training()
> losses = tagger.rehearse(examples, sgd=optimizer) > losses = tagger.rehearse(examples, sgd=optimizer)
> ``` > ```

View File

@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
## TextCategorizer.begin_training {#begin_training tag="method"} ## TextCategorizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example
@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
> >
> ```python > ```python
> textcat = nlp.add_pipe("textcat") > textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training() > optimizer = nlp.resume_training()
> losses = textcat.rehearse(examples, sgd=optimizer) > losses = textcat.rehearse(examples, sgd=optimizer)
> ``` > ```

View File

@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
## Tok2Vec.begin_training {#begin_training tag="method"} ## Tok2Vec.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. [`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example > #### Example

View File

@ -18,11 +18,10 @@ Construct a `Token` object.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------- | | -------- | ------- | ------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | `Vocab` | A storage container for lexical types. |
| `doc` | `Doc` | The parent document. | | `doc` | `Doc` | The parent document. |
| `offset` | int | The index of the token within the document. | | `offset` | int | The index of the token within the document. |
| **RETURNS** | `Token` | The newly constructed object. |
## Token.\_\_len\_\_ {#len tag="method"} ## Token.\_\_len\_\_ {#len tag="method"}
@ -394,7 +393,7 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | `Doc` | The parent document. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. | | `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. | | `text` | str | Verbatim text content. |

View File

@ -35,7 +35,7 @@ the
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | | ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. | | `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
@ -43,7 +43,6 @@ the
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | | `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"} ## Tokenizer.\_\_call\_\_ {#call tag="method"}

View File

@ -0,0 +1,107 @@
---
title: Transformer
teaser: Pipeline component for multi-task learning with transformer models
tag: class
source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
new: 3
api_base_class: /api/pipe
api_string_name: transformer
---
> #### Installation
>
> ```bash
> $ pip install spacy-transformers
> ```
<Infobox title="Important note" variant="warning">
This component is available via the extension package
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It
exposes the component via entry points, so if you have the package installed,
using `factory = "transformer"` in your
[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will
work out-of-the-box.
</Infobox>
This pipeline component lets you use transformer models in your pipeline. The
component assigns the output of the transformer to the Doc's extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
token, the spaCy token receives the sum of their values. To access the values,
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For
more details, see the [usage documentation](/usage/transformers).
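
As a rough usage sketch (assuming an installed transformer-based pipeline; the package name below is hypothetical):

```python
import spacy

# Hypothetical pipeline package that includes an initialized "transformer" component
nlp = spacy.load("en_core_trf_lg")
doc = nlp("Apple shares rose on the news.")

# Raw transformer output, aligned to the spaCy tokenization
trf_data = doc._.trf_data
# Per-token vectors summed from the aligned word-piece rows
print(doc.tensor.shape)
```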
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.
> #### Example
>
> ```python
> from spacy_transformers import Transformer, DEFAULT_CONFIG
>
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```
| Setting | Type | Description | Default |
| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- |
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
| `annotation_setter` | Callable | <!-- TODO: --> | [`null_annotation_setter`](/api/transformer#null_annotation_setter) |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
```
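
The settings above can also be overridden individually via the `config` argument on [`nlp.add_pipe`](/api/language#add_pipe). A minimal sketch, assuming `spacy-transformers` is installed so the factory is available; the value shown is arbitrary:

```python
import spacy

nlp = spacy.blank("en")
# Override a single setting from the table above
trf = nlp.add_pipe("transformer", config={"max_batch_items": 2048})
```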
## Transformer.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via add_pipe with default model
> trf = nlp.add_pipe("transformer")
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_transformer"}}
> trf = nlp.add_pipe("transformer", config=config)
>
> # Construction from class
> from spacy_transformers import Transformer
> trf = Transformer(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `annotation_setter` | `Callable` | <!-- TODO: --> |
| _keyword-only_ | | |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
<!-- TODO: document rest -->
## TransformerData {#transformerdata tag="dataclass"}
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
## Custom attributes {#custom-attributes}
The component sets the following
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
| Name | Type | Description |
| -------------- | ----------------- | -------------- |
| `Doc.trf_data` | `TransformerData` | <!-- TODO: --> |

View File

@ -37,7 +37,6 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. | | `keys` | iterable | A sequence of keys aligned with the data. |
| `name` | str | A name to identify the vectors table. | | `name` | str | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"} ## Vectors.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -31,7 +31,6 @@ Create the vocabulary.
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. |
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. | | `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"} ## Vocab.\_\_len\_\_ {#len tag="method"}

View File

@ -3,4 +3,154 @@ title: Transformers
teaser: Using transformer models like BERT in spaCy teaser: Using transformer models like BERT in spaCy
--- ---
TODO: ... spaCy v3.0 lets you use almost **any statistical model** to power your pipeline.
You can use models implemented in a variety of frameworks, including TensorFlow,
PyTorch and MXNet. To keep things sane, spaCy expects models from these
frameworks to be wrapped with a common interface, using our machine learning
library [Thinc](https://thinc.ai). A transformer model is just a statistical
model, so the
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package
actually has very little work to do: we just have to provide a few functions
that do the required plumbing. We also provide a pipeline component,
[`Transformer`](/api/transformer), that lets you do multi-task learning and lets
you save the transformer outputs for later use.
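
As an illustration of that common interface, wrapping an arbitrary PyTorch module as a Thinc model could look roughly like this (the layer and its sizes are arbitrary):

```python
from thinc.api import PyTorchWrapper
from torch import nn

# Any PyTorch module can be wrapped so it exposes Thinc's Model interface
model = PyTorchWrapper(nn.Linear(768, 128))
```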
<Project id="en_core_bert">
Try out a BERT-based model pipeline using this project template: swap in your
data, edit the settings and hyperparameters and train, evaluate, package and
visualize your model.
</Project>
<!-- TODO: the text below has been copied from the spacy-transformers repo and needs to be updated and adjusted
### Training usage
The recommended workflow for training is to use spaCy's
[config system](/usage/training#config), usually via the
[`spacy train`](/api/cli#train) command. The config system lets you describe a
tree of objects by referring to creation functions, including functions you
register yourself. Here's a config snippet for the `Transformer` component,
along with matching Python code.
```ini
[nlp]
lang = "en"
pipeline = ["transformer"]
[components.transformer]
factory = "transformer"
extra_annotation_setter = null
max_batch_size = 32
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "get_doc_spans.v1"
```
```python
from spacy_transformers import Transformer
trf = Transformer(
    nlp.vocab,
    TransformerModel(
        "bert-base-cased",
        get_spans=get_doc_spans,
        tokenizer_config={"use_fast": True},
    ),
    annotation_setter=null_annotation_setter,
    max_batch_size=32,
)
```
The `components.transformer` block adds the `transformer` component to the
pipeline, and the `components.transformer.model` block describes the creation of
a Thinc [`Model`](https://thinc.ai/docs/api-model) object that will be passed
into the component. The block names a function registered in the
`@architectures` registry. This function will be looked up and called using the
provided arguments. You're not limited to just that function --- you can write
your own or use someone else's. The only limitation is that it must return an
object of type `Model[List[Doc], FullTransformerBatch]`: that is, a Thinc model
that takes a list of `Doc` objects, and returns a `FullTransformerBatch` object
with the transformer data.
The same idea applies to task models that power the downstream components. Most
of spaCy's built-in model creation functions support a `tok2vec` argument, which
should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This is
where we'll plug in our transformer model, using the `Tok2VecTransformer` layer,
which sneakily delegates to the `Transformer` pipeline component.
```ini
[nlp]
lang = "en"
pipeline = ["ner"]
[components.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0
[nlp.pipeline.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```
The `Tok2VecListener` layer expects a `pooling` layer, which needs to be of type
`Model[Ragged, Floats2d]`. This layer determines how the vector for each spaCy
token will be computed from the zero or more source rows the token is aligned
against. Here we use the `reduce_mean` layer, which averages the wordpiece rows.
We could instead use `reduce_last`, `reduce_max`, or a custom function you write
yourself.
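
For example, the built-in reductions are exposed as Thinc layer constructors (a minimal sketch; each returns a `Model[Ragged, Floats2d]`):

```python
from thinc.api import reduce_mean, reduce_max

# Each constructor returns a Model[Ragged, Floats2d] that collapses the zero
# or more wordpiece rows aligned to a token into a single vector.
mean_pooling = reduce_mean()
max_pooling = reduce_max()
```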
You can have multiple components all listening to the same transformer model,
and all passing gradients back to it. By default, all of the gradients will be
equally weighted. You can control this with the `grad_factor` setting, which
lets you reweight the gradients from the different listeners. For instance,
setting `grad_factor = 0` would disable gradients from one of the listeners,
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
custom learning rate for each component. Instead of a constant, you can also
provide a schedule, allowing you to freeze the shared parameters at the start of
training.
### Runtime usage
Transformer models can be used as drop-in replacements for other types of neural
networks, so your spaCy pipeline can include them in a way that's completely
invisible to the user. Users will download, load and use the model in the
standard way, like any other spaCy pipeline.
Instead of using the transformers as subnetworks directly, you can also use them
via the [`Transformer`](/api/transformer) pipeline component. This sets the
[`Doc._.trf_data`](/api/transformer#custom-attributes) extension attribute,
which gives you access to the transformer outputs at runtime. You can also
customize how the
`Transformer` object sets annotations onto the `Doc`, by customizing the
`Transformer.annotation_setter` object. This callback will be called with the
raw input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need.
```python
import spacy
nlp = spacy.load("en_core_trf_lg")
for doc in nlp.pipe(["some text", "some other text"]):
    # doc._.trf_data.tensors holds the transformer output tensors for this doc
    tokvecs = doc._.trf_data.tensors[-1]
```
The `nlp` object in this example is just like any other spaCy pipeline.
-->

View File

@ -32,17 +32,34 @@ with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
on them. on them.
| Removed | Replacement | | Removed | Replacement |
| ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) | | `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) | | `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) | | `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | | `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | | keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | | `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer), | | `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) |
## Migrating from v2.x {#migrating} ## Migrating from v2.x {#migrating}
### Downloading and loading models {#migrating-downloading-models}
Model symlinks and shortcuts like `en` are now officially deprecated. There are
[many different models](/models) with different capabilities and not just one
"English model". In order to download and load a model, you should always use
its full name, for instance `en_core_web_sm`.
```diff
- python -m spacy download en
+ python -m spacy download en_core_web_sm
```
```diff
- nlp = spacy.load("en")
+ nlp = spacy.load("en_core_web_sm")
```
### Custom pipeline components and factories {#migrating-pipeline-components} ### Custom pipeline components and factories {#migrating-pipeline-components}
Custom pipeline components now have to be registered explicitly using the Custom pipeline components now have to be registered explicitly using the
@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model.
<!-- TODO: write --> <!-- TODO: write -->
#### Training via the Python API {#migrating-training-python}
<!-- TODO: this should explain the GoldParse -> Example stuff -->
#### Packaging models {#migrating-training-packaging} #### Packaging models {#migrating-training-packaging}
The [`spacy package`](/api/cli#package) command now automatically builds the The [`spacy package`](/api/cli#package) command now automatically builds the

View File

@ -81,6 +81,7 @@
"items": [ "items": [
{ "text": "Tokenizer", "url": "/api/tokenizer" }, { "text": "Tokenizer", "url": "/api/tokenizer" },
{ "text": "Tok2Vec", "url": "/api/tok2vec" }, { "text": "Tok2Vec", "url": "/api/tok2vec" },
{ "text": "Transformer", "url": "/api/transformer" },
{ "text": "Lemmatizer", "url": "/api/lemmatizer" }, { "text": "Lemmatizer", "url": "/api/lemmatizer" },
{ "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "Morphologizer", "url": "/api/morphologizer" },
{ "text": "Tagger", "url": "/api/tagger" }, { "text": "Tagger", "url": "/api/tagger" },

View File

@ -33,11 +33,12 @@ const Link = ({
const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest) const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest)
const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest) const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest)
const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest) const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest)
const sourceWithText = (isSource || isApi) && isString(children) const withIcon = isApi || isArch || isSource
const sourceWithText = withIcon && isString(children)
const linkClassNames = classNames(classes.root, className, { const linkClassNames = classNames(classes.root, className, {
[classes.hidden]: hidden, [classes.hidden]: hidden,
[classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText, [classes.nowrap]: (withIcon && !sourceWithText) || isArch,
[classes.withIcon]: isApi || isSource || isArch, [classes.withIcon]: withIcon,
}) })
const Wrapper = ws ? Whitespace : Fragment const Wrapper = ws ? Whitespace : Fragment
const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null

View File

@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text'
* @returns {string} - URL to the file on GitHub. * @returns {string} - URL to the file on GitHub.
*/ */
export function github(filepath, branch = 'master') { export function github(filepath, branch = 'master') {
if (filepath && filepath.startsWith('github.com')) return `https://${filepath}`
const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : '' const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : ''
return `https://github.com/${repo}${path}` return `https://github.com/${repo}${path}`
} }