Update docstrings, docs and types

This commit is contained in:
Ines Montani 2020-07-29 11:36:42 +02:00
parent 7adffc5361
commit e0ffe36e79
53 changed files with 821 additions and 422 deletions

View File

@ -1,7 +1,15 @@
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING
from pathlib import Path
import random
from .. import util
from .example import Example
from ..tokens import DocBin, Doc
from ..vocab import Vocab
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from ..language import Language # noqa: F401
class Corpus:
@ -11,20 +19,23 @@ class Corpus:
DOCS: https://spacy.io/api/corpus
"""
def __init__(self, train_loc, dev_loc, limit=0):
def __init__(
self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0
) -> None:
"""Create a Corpus.
train (str / Path): File or directory of training data.
dev (str / Path): File or directory of development data.
limit (int): Max. number of examples returned
RETURNS (Corpus): The newly created object.
limit (int): Max. number of examples returned.
DOCS: https://spacy.io/api/corpus#init
"""
self.train_loc = train_loc
self.dev_loc = dev_loc
self.limit = limit
@staticmethod
def walk_corpus(path):
def walk_corpus(path: Union[str, Path]) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir():
return [path]
@ -43,7 +54,9 @@ class Corpus:
locs.append(path)
return locs
def _make_example(self, nlp, reference, gold_preproc):
def _make_example(
self, nlp: "Language", reference: Doc, gold_preproc: bool
) -> Example:
if gold_preproc or reference.has_unknown_spaces:
return Example(
Doc(
@ -56,7 +69,9 @@ class Corpus:
else:
return Example(nlp.make_doc(reference.text), reference)
def make_examples(self, nlp, reference_docs, max_length=0):
def make_examples(
self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
) -> Iterator[Example]:
for reference in reference_docs:
if len(reference) == 0:
continue
@ -69,7 +84,9 @@ class Corpus:
elif max_length == 0 or len(ref_sent) < max_length:
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(self, nlp, reference_docs):
def make_examples_gold_preproc(
self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if reference.is_sentenced:
ref_sents = [sent.as_doc() for sent in reference.sents]
@ -80,7 +97,9 @@ class Corpus:
if len(eg.x):
yield eg
def read_docbin(self, vocab, locs):
def read_docbin(
self, vocab: Vocab, locs: Iterable[Union[str, Path]]
) -> Iterator[Doc]:
"""Yield Doc objects loaded from the given .spacy files."""
i = 0
for loc in locs:
@ -96,8 +115,14 @@ class Corpus:
if self.limit >= 1 and i >= self.limit:
break
def count_train(self, nlp):
"""Returns count of words in train examples"""
def count_train(self, nlp: "Language") -> int:
"""Returns count of words in train examples.
nlp (Language): The current nlp object.
RETURNS (int): The word count.
DOCS: https://spacy.io/api/corpus#count_train
"""
n = 0
i = 0
for example in self.train_dataset(nlp):
@ -108,8 +133,25 @@ class Corpus:
return n
def train_dataset(
self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs
):
self,
nlp: "Language",
*,
shuffle: bool = True,
gold_preproc: bool = False,
max_length: int = 0
) -> Iterator[Example]:
"""Yield examples from the training data.
nlp (Language): The current nlp object.
shuffle (bool): Whether to shuffle the examples.
gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
max_length (int): Maximum document length. Longer documents will be
split into sentences, if sentence boundaries are available. 0 for
no limit.
YIELDS (Example): The examples.
DOCS: https://spacy.io/api/corpus#train_dataset
"""
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
if gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
@ -120,7 +162,17 @@ class Corpus:
random.shuffle(examples)
yield from examples
def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
def dev_dataset(
self, nlp: "Language", *, gold_preproc: bool = False
) -> Iterator[Example]:
"""Yield examples from the development data.
nlp (Language): The current nlp object.
gold_preproc (bool): Whether to train on gold-standard sentences and tokens.
YIELDS (Example): The examples.
DOCS: https://spacy.io/api/corpus#dev_dataset
"""
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
if gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
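Taken together, a minimal usage sketch of the typed API above might look as follows (the `.spacy` paths are placeholders, not files that ship with the library):

```python
import spacy
from spacy.gold import Corpus

nlp = spacy.blank("en")
# Hypothetical paths: any .spacy file or directory of .spacy files works.
corpus = Corpus("./train.spacy", "./dev.spacy", limit=0)

train_examples = corpus.train_dataset(nlp, shuffle=True, gold_preproc=False, max_length=0)
dev_examples = corpus.dev_dataset(nlp, gold_preproc=False)
n_words = corpus.count_train(nlp)
```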

View File

@ -21,7 +21,6 @@ class Lemmatizer:
lookups (Lookups): The lookups object containing the (optional) tables
"lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup".
RETURNS (Lemmatizer): The newly constructed object.
"""
self.lookups = lookups if lookups is not None else Lookups()
self.is_base_form = is_base_form

View File

@ -52,8 +52,6 @@ class Lookups:
def __init__(self) -> None:
"""Initialize the Lookups object.
RETURNS (Lookups): The newly created object.
DOCS: https://spacy.io/api/lookups#init
"""
self._tables = {}
@ -202,7 +200,6 @@ class Table(OrderedDict):
data (dict): The dictionary.
name (str): Optional table name for reference.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict
"""
@ -215,7 +212,6 @@ class Table(OrderedDict):
name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.init
"""

View File

@ -36,7 +36,6 @@ cdef class DependencyMatcher:
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (DependencyMatcher): The newly constructed object.
"""
size = 20
# TODO: make matcher work with validation

View File

@ -37,7 +37,6 @@ cdef class Matcher:
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (Matcher): The newly constructed object.
"""
self._extra_predicates = []
self._patterns = {}

View File

@ -32,7 +32,6 @@ cdef class PhraseMatcher:
vocab (Vocab): The shared vocabulary.
attr (int / str): Token attribute to match on.
validate (bool): Perform additional validation when patterns are added.
RETURNS (PhraseMatcher): The newly constructed object.
DOCS: https://spacy.io/api/phrasematcher#init
"""

View File

@ -86,7 +86,6 @@ class EntityRuler:
overwrite_ents (bool): If existing entities are present, e.g. entities
added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.
RETURNS (EntityRuler): The newly constructed object.
DOCS: https://spacy.io/api/entityruler#init
"""

View File

@ -72,7 +72,6 @@ class Scorer:
def __init__(self, nlp=None, **cfg):
"""Initialize the Scorer.
RETURNS (Scorer): The newly created object.
DOCS: https://spacy.io/api/scorer#init
"""

View File

@ -97,7 +97,6 @@ cdef class StringStore:
"""Create the StringStore.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()

View File

@ -50,7 +50,6 @@ cdef class Tokenizer:
recognised as tokens.
url_match (callable): A boolean function matching strings to be
recognised as tokens after considering prefixes and suffixes.
RETURNS (Tokenizer): The newly constructed object.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)

View File

@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
"""Retokenize the document, such that the token at
`doc[token_index]` is split into tokens with the orth 'orths'
token_index (int): Token index of the token to split.
orths: IDs of the verbatim text content of the tokens to create
**attributes: Attributes to assign to each of the newly created tokens. By default,
attributes are inherited from the original token.

View File

@ -1,10 +1,12 @@
from typing import Iterable, Iterator
import numpy
import zlib
import srsly
from thinc.api import NumpyOps
from .doc import Doc
from ..vocab import Vocab
from ..compat import copy_reg
from ..tokens import Doc
from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
@ -44,13 +46,18 @@ class DocBin:
document from the DocBin.
"""
def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
def __init__(
self,
attrs: Iterable[str] = ALL_ATTRS,
store_user_data: bool = False,
docs: Iterable[Doc] = tuple(),
) -> None:
"""Create a DocBin object to hold serialized annotations.
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
always serialized, so they're not required. Defaults to None.
attrs (Iterable[str]): List of attributes to serialize. 'orth' and
'spacy' are always serialized, so they're not required.
store_user_data (bool): Whether to include the `Doc.user_data`.
RETURNS (DocBin): The newly constructed object.
docs (Iterable[Doc]): Docs to add.
DOCS: https://spacy.io/api/docbin#init
"""
@ -68,11 +75,11 @@ class DocBin:
for doc in docs:
self.add(doc)
def __len__(self):
def __len__(self) -> int:
"""RETURNS: The number of Doc objects added to the DocBin."""
return len(self.tokens)
def add(self, doc):
def add(self, doc: Doc) -> None:
"""Add a Doc's annotations to the DocBin for serialization.
doc (Doc): The Doc object to add.
@ -100,7 +107,7 @@ class DocBin:
if self.store_user_data:
self.user_data.append(srsly.msgpack_dumps(doc.user_data))
def get_docs(self, vocab):
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
"""Recover Doc objects from the annotations, using the given vocab.
vocab (Vocab): The shared vocab.
@ -125,7 +132,7 @@ class DocBin:
doc.user_data.update(user_data)
yield doc
def merge(self, other):
def merge(self, other: "DocBin") -> None:
"""Extend the annotations of this DocBin with the annotations from
another. Will raise an error if the pre-defined attrs of the two
DocBins don't match.
@ -144,7 +151,7 @@ class DocBin:
if self.store_user_data:
self.user_data.extend(other.user_data)
def to_bytes(self):
def to_bytes(self) -> bytes:
"""Serialize the DocBin's annotations to a bytestring.
RETURNS (bytes): The serialized DocBin.
@ -156,7 +163,6 @@ class DocBin:
lengths = [len(tokens) for tokens in self.tokens]
tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = {
"version": self.version,
"attrs": self.attrs,
@ -171,7 +177,7 @@ class DocBin:
msg["user_data"] = self.user_data
return zlib.compress(srsly.msgpack_dumps(msg))
def from_bytes(self, bytes_data):
def from_bytes(self, bytes_data: bytes) -> "DocBin":
"""Deserialize the DocBin's annotations from a bytestring.
bytes_data (bytes): The data to load from.

View File

@ -173,7 +173,6 @@ cdef class Doc:
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
DOCS: https://spacy.io/api/doc#init
"""

View File

@ -94,7 +94,6 @@ cdef class Span:
kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation
of the span.
RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/span#init
"""

View File

@ -58,7 +58,6 @@ cdef class Vectors:
data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init
"""

View File

@ -74,7 +74,6 @@ cdef class Vocab:
lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability.
vectors_name (unicode): Optional name to identify the vectors table.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
if lookups in (None, True, False):

View File

@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
menu:
- ['Tok2Vec', 'tok2vec']
- ['Transformers', 'transformers']
- ['Parser & NER', 'parser']
- ['Text Classification', 'textcat']
- ['Entity Linking', 'entitylinker']
@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to
[`registry`](/api/top-level#registry),
[custom models](/usage/training#custom-models) usage etc.
## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}}
## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
@ -21,12 +22,14 @@ TODO: intro and how architectures work, link to
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
### spacy-transformers.TransformerModel.v1 {#TransformerModel}
## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"}
### spacy.TransitionBasedParser.v1 {#TransitionBasedParser}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini

View File

@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format.
Create a `Corpus`. The input data can be a file or a directory of files.
| Name | Type | Description |
| ----------- | ------------ | ---------------------------------------------------------------- |
| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). |
| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). |
| `limit` | int | Maximum number of examples returned. |
| **RETURNS** | `Corpus` | The newly constructed object. |
> #### Example
>
> ```python
> from spacy.gold import Corpus
>
> corpus = Corpus("./train.spacy", "./dev.spacy")
> ```
<!-- TODO: document remaining methods / decide which to document -->
## Corpus.walk_corpus {#walk_corpus tag="staticmethod"}
## Corpus.make_examples {#make_examples tag="method"}
## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"}
## Corpus.read_docbin {#read_docbin tag="method"}
## Corpus.count_train {#count_train tag="method"}
| Name | Type | Description |
| ------- | ------------ | ---------------------------------------------------------------- |
| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). |
| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). |
| `limit` | int | Maximum number of examples returned. `0` for no limit (default). |
## Corpus.train_dataset {#train_dataset tag="method"}
Yield examples from the training data.
> #### Example
>
> ```python
> from spacy.gold import Corpus
> import spacy
>
> corpus = Corpus("./train.spacy", "./dev.spacy")
> nlp = spacy.blank("en")
> train_data = corpus.train_dataset(nlp)
> ```
| Name | Type | Description |
| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `nlp` | `Language` | The current `nlp` object. |
| _keyword-only_ | | |
| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. |
| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. |
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default).  |
| **YIELDS** | `Example` | The examples. |
## Corpus.dev_dataset {#dev_dataset tag="method"}
Yield examples from the development data.
> #### Example
>
> ```python
> from spacy.gold import Corpus
> import spacy
>
> corpus = Corpus("./train.spacy", "./dev.spacy")
> nlp = spacy.blank("en")
> dev_data = corpus.dev_dataset(nlp)
> ```
| Name | Type | Description |
| -------------- | ---------- | ---------------------------------------------------------------------------- |
| `nlp` | `Language` | The current `nlp` object. |
| _keyword-only_ | | |
| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. |
| **YIELDS** | `Example` | The examples. |
## Corpus.count_train {#count_train tag="method"}
Get the word count of all training examples.
> #### Example
>
> ```python
> from spacy.gold import Corpus
> import spacy
>
> corpus = Corpus("./train.spacy", "./dev.spacy")
> nlp = spacy.blank("en")
> word_count = corpus.count_train(nlp)
> ```
| Name | Type | Description |
| ----------- | ---------- | ------------------------- |
| `nlp` | `Language` | The current `nlp` object. |
| **RETURNS** | int | The word count. |
<!-- TODO: document remaining methods? / decide which to document -->

View File

@ -87,13 +87,12 @@ Create a `Token` object from a `TokenC*` pointer.
> token = Token.cinit(&doc.c[3], doc, 3)
> ```
| Name | Type | Description |
| ----------- | --------- | ------------------------------------------------------------ |
| `vocab` | `Vocab` | A reference to the shared `Vocab`. |
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. |
| `offset` | `int` | The offset of the token within the document. |
| `doc` | `Doc` | The parent document. |
| **RETURNS** | `Token` | The newly constructed object. |
| Name | Type | Description |
| -------- | --------- | ------------------------------------------------------------ |
| `vocab` | `Vocab` | A reference to the shared `Vocab`. |
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. |
| `offset` | `int` | The offset of the token within the document. |
| `doc` | `Doc` | The parent document. |
## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}

View File

@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
## DependencyParser.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example

View File

@ -30,12 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `words` | iterable | A list of strings to add to the container. |
| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. |
| **RETURNS** | `Doc` | The newly constructed object. |
| Name | Type | Description |
| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `words` | iterable | A list of strings to add to the container. |
| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. |
## Doc.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations.
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ```
| Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. |
| Argument | Type | Description |
| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. |
## DocBin.\_\_len\_\_ {#len tag="method"}

View File

@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
## EntityLinker.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this
method, a knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).

View File

@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
## EntityRecognizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example

View File

@ -37,7 +37,6 @@ both documents.
| `reference` | `Doc` | The document containing gold-standard annotations. Cannot be `None`. |
| _keyword-only_ | | |
| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. |
| **RETURNS** | `Example` | The newly constructed object. |
## Example.from_dict {#from_dict tag="classmethod"}

View File

@ -27,11 +27,10 @@ Create the knowledge base.
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> ```
| Name | Type | Description |
| ---------------------- | --------------- | ---------------------------------------- |
| `vocab` | `Vocab` | A `Vocab` object. |
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
| Name | Type | Description |
| ---------------------- | ------- | ---------------------------------------- |
| `vocab` | `Vocab` | A `Vocab` object. |
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
@ -255,7 +254,6 @@ but instead these objects are returned by the
| `entity_freq` | float | The entity frequency as recorded in the KB. |
| `alias_hash` | int | The hash of the textual mention or alias. |
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
| **RETURNS** | `Candidate` | The newly constructed object. |
## Candidate attributes {#candidate_attributes}

View File

@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add
your own processing pipeline components that take a `Doc` object, modify it and
return it.
## Language.\_\_init\_\_ {#init tag="method"}
Initialize a `Language` object.
> #### Example
>
> ```python
> # Construction from subclass
> from spacy.lang.en import English
> nlp = English()
>
> # Construction from scratch
> from spacy.vocab import Vocab
> from spacy.language import Language
> nlp = Language(Vocab())
> ```
| Name | Type | Description |
| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
| _keyword-only_ | | |
| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
## Language.from_config {#from_config tag="classmethod"}
Create a `Language` object from a loaded config. Will set up the tokenizer and
language data, add pipeline components based on the pipeline and components
defined in the config and validate the results. If no config is provided, the
default config of the given language is used. This is also how spaCy loads a
model under the hood based on its [`config.cfg`](/api/data-formats#config).
> #### Example
>
> ```python
> from thinc.api import Config
> from spacy.language import Language
>
> config = Config().from_disk("./config.cfg")
> nlp = Language.from_config(config)
> ```
| Name | Type | Description |
| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
| _keyword-only_ | |
| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
| **RETURNS** | `Language` | The initialized object. |
## Language.component {#component tag="classmethod" new="3"}
Register a custom pipeline component under a given name. This allows
@ -101,57 +153,6 @@ examples, see the
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
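As a hedged illustration of the decorator usage described above (the name `"my_component"` is a placeholder):

> ```python
> from spacy.language import Language
>
> @Language.component("my_component")
> def my_component(doc):
>     # Stateless components receive the Doc, modify it in place and return it
>     return doc
> ```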
## Language.\_\_init\_\_ {#init tag="method"}
Initialize a `Language` object.
> #### Example
>
> ```python
> from spacy.vocab import Vocab
> from spacy.language import Language
> nlp = Language(Vocab())
>
> from spacy.lang.en import English
> nlp = English()
> ```
| Name | Type | Description |
| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
| _keyword-only_ | | |
| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
| **RETURNS** | `Language` | The newly constructed object. |
## Language.from_config {#from_config tag="classmethod"}
Create a `Language` object from a loaded config. Will set up the tokenizer and
language data, add pipeline components based on the pipeline and components
defined in the config and validate the results. If no config is provided, the
default config of the given language is used. This is also how spaCy loads a
model under the hood based on its [`config.cfg`](/api/data-formats#config).
> #### Example
>
> ```python
> from thinc.api import Config
> from spacy.language import Language
>
> config = Config().from_disk("./config.cfg")
> nlp = Language.from_config(config)
> ```
| Name | Type | Description |
| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
| _keyword-only_ | |
| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
| **RETURNS** | `Language` | The initialized object. |
## Language.\_\_call\_\_ {#call tag="method"}
Apply the pipeline to some text. The text can span multiple sentences, and can
@ -164,11 +165,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
| Name | Type | Description |
| ----------- | ----------- | --------------------------------------------------------------------------------- |
| `text` | str | The text to be processed. |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Doc` | A container for accessing the annotations. |
| Name | Type | Description |
| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
| `text` | str | The text to be processed. |
| _keyword-only_ | | |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. |
## Language.pipe {#pipe tag="method"}
@ -183,15 +186,57 @@ more efficient than processing texts one-by-one.
> assert doc.is_parsed
> ```
| Name | Type | Description |
| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts` | `Iterable[str]` | A sequence of strings. |
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
| `batch_size` | int | The number of texts to buffer. |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
| **YIELDS** | `Doc` | Documents in the order of the original text. |
| Name | Type | Description |
| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts` | `Iterable[str]` | A sequence of strings. |
| _keyword-only_ | | |
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
| `batch_size` | int | The number of texts to buffer. |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
| **YIELDS** | `Doc` | Documents in the order of the original text. |
## Language.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
>
> ```python
> optimizer = nlp.begin_training(get_examples)
> ```
| Name | Type | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
| _keyword-only_ | | |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
Continue training a pretrained model. Create and return an optimizer, and
initialize "rehearsal" for any pipeline component that has a `rehearse` method.
Rehearsal is used to prevent models from "forgetting" their initialized
"knowledge". To perform rehearsal, collect samples of text you want the models
to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with
a batch of [Example](/api/example) objects.
> #### Example
>
> ```python
> optimizer = nlp.resume_training()
> nlp.rehearse(examples, sgd=optimizer)
> ```
| Name | Type | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
## Language.update {#update tag="method"}
@ -206,15 +251,37 @@ Update the models in the pipeline.
> nlp.update([example], sgd=optimizer)
> ```
| Name | Type | Description |
| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. |
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
| Name | Type | Description |
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## Language.rehearse {#rehearse tag="method,experimental"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
the "catastrophic forgetting" problem. This feature is experimental.
> #### Example
>
> ```python
> optimizer = nlp.resume_training()
> losses = nlp.rehearse(examples, sgd=optimizer)
> ```
| Name | Type | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## Language.evaluate {#evaluate tag="method"}
@ -227,33 +294,15 @@ Evaluate a model's pipeline components.
> print(scores)
> ```
| Name | Type | Description |
| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| `verbose` | bool | Print debugging information. |
| `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. |
## Language.begin_training {#begin_training tag="method"}
Allocate models, pre-process training data and acquire an
[`Optimizer`](https://thinc.ai/docs/api-optimizers).
> #### Example
>
> ```python
> optimizer = nlp.begin_training(get_examples)
> ```
| Name | Type | Description |
| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ |
| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. |
| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. |
| `component_cfg` <Tag variant="new">2.1</Tag> | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. |
| `**cfg` | - | Config parameters (sent to all components). |
| **RETURNS** | `Optimizer` | An optimizer. |
| Name | Type | Description |
| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `verbose` | bool | Print debugging information. |
| `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
## Language.use_params {#use_params tag="contextmanager, method"}
@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use
| ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | str | Name of the registered component factory. |
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
| _keyword-only_ | | |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
| **RETURNS** | callable | The pipeline component. |
@ -418,10 +468,13 @@ Replace a component in the pipeline.
> nlp.replace_pipe("parser", my_custom_parser)
> ```
| Name | Type | Description |
| ----------- | -------- | --------------------------------- |
| `name` | str | Name of the component to replace. |
| `component` | callable | The pipeline component to insert. |
| Name | Type | Description |
| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the component to replace. |
| `component` | callable | The pipeline component to insert. |
| _keyword-only_ | | |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. |
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@ -492,11 +545,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
</Infobox>
| Name | Type | Description |
| ----------- | --------------- | ------------------------------------------------------------------------------------ |
| `disable` | str / list | Name(s) of pipeline components to disable. |
| `enable` | str / list | Names(s) of pipeline components that will not be disabled. |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
| Name | Type | Description |
| -------------- | --------------- | ------------------------------------------------------------------------------------ |
| _keyword-only_ | | |
| `disable` | str / list | Name(s) of pipeline components to disable. |
| `enable` | str / list | Name(s) of pipeline components that will not be disabled. |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
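A short usage sketch of the keyword-only arguments (assuming the pipeline actually contains an `ner` component):

> ```python
> # Temporarily disable everything except the entity recognizer
> with nlp.select_pipes(enable="ner"):
>     doc = nlp("Alphabet Inc. is based in Mountain View.")
> ```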
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@ -767,8 +821,8 @@ serialization by passing in the string names via the `exclude` argument.
The `FactoryMeta` contains the information about the component and its default
provided by the [`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory) decorator. It's created whenever a
component is added to the pipeline and stored on the `Language` class for each
component instance and factory instance.
component is defined and stored on the `Language` class for each component
instance and factory instance.
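As a hedged sketch, the meta for a built-in factory could be inspected like this (assuming the `"ner"` factory is registered):

> ```python
> from spacy.language import Language
>
> factory_meta = Language.get_factory_meta("ner")
> print(factory_meta.assigns, factory_meta.default_config)
> ```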
| Name | Type | Description |
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

View File

@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized.
| Name | Type | Description |
| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `lookups` <Tag variant="new">2.2</Tag> | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
| **RETURNS** | `Lemmatizer` | The newly created object. |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}

View File

@ -13,11 +13,10 @@ lemmatization depends on the part-of-speech tag).
Create a `Lexeme` object.
| Name | Type | Description |
| ----------- | -------- | ----------------------------- |
| `vocab` | `Vocab` | The parent vocabulary. |
| `orth` | int | The orth id of the lexeme. |
| **RETURNS** | `Lexeme` | The newly constructed object. |
| Name | Type | Description |
| ------- | ------- | -------------------------- |
| `vocab` | `Vocab` | The parent vocabulary. |
| `orth` | int | The orth id of the lexeme. |
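A brief sketch of constructing one directly; in practice lexemes are usually retrieved via the vocab (the `nlp` object is assumed to exist):

> ```python
> from spacy.lexeme import Lexeme
>
> orth = nlp.vocab.strings.add("hello")
> lexeme = Lexeme(nlp.vocab, orth)
> assert lexeme.text == "hello"
> ```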
## Lexeme.set_flag {#set_flag tag="method"}

View File

@ -236,10 +236,9 @@ Initialize a new table.
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
| `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
| Name | Type | Description |
| ------ | ---- | ---------------------------------- |
| `name` | str | Optional table name for reference. |
### Table.from_dict {#table.from_dict tag="classmethod"}

View File

@ -19,11 +19,10 @@ string where an integer is expected) or unexpected property names.
> matcher = Matcher(nlp.vocab)
> ```
| Name | Type | Description |
| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate all patterns added to this matcher. |
| **RETURNS** | `Matcher` | The newly constructed object. |
| Name | Type | Description |
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate all patterns added to this matcher. |
## Matcher.\_\_call\_\_ {#call tag="method"}

View File

@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx
Stores a single morphological analysis.
## MorphAnalysis.\_\_init\_\_ {#init tag="method"}
Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of
@ -21,12 +20,10 @@ morphological features.
> m = MorphAnalysis(nlp.vocab, feats)
> ```
| Name | Type | Description |
| ----------- | ------------------ | ----------------------------- |
| `vocab` | `Vocab` | The vocab. |
| `features` | `Union[Dict, str]` | The morphological features. |
| **RETURNS** | `MorphAnalysis` | The newly constructed object. |
| Name | Type | Description |
| ---------- | ------------------ | --------------------------- |
| `vocab` | `Vocab` | The vocab. |
| `features` | `Union[Dict, str]` | The morphological features. |
## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"}
@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis.
| ----------- | ----- | ------------------------------------- |
| **RETURNS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"}
Iterate over the feature/value pairs in the analysis.
@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis.
| ---------- | ----- | ------------------------------------- |
| **YIELDS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_len\_\_ {#len tag="method"}
Returns the number of features in the analysis.
@ -78,7 +73,6 @@ Returns the number of features in the analysis.
| ----------- | ----- | --------------------------------------- |
| **RETURNS** | `int` | The number of features in the analysis. |
## MorphAnalysis.\_\_str\_\_ {#str tag="method"}
Returns the morphological analysis in the UD FEATS string format.
@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format.
> ```
| Name | Type | Description |
| ----------- | ----- | ---------------------------------|
| ----------- | ----- | -------------------------------- |
| **RETURNS** | `str` | The analysis in UD FEATS format. |
## MorphAnalysis.get {#get tag="method"}
Retrieve values for a feature by field.
@ -108,11 +101,10 @@ Retrieve values for a feature by field.
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
| Name | Type | Description |
| ----------- | ------ | ----------------------------------- |
| `field` | `str` | The field to retrieve. |
| **RETURNS** | `list` | A list of the individual features. |
| Name | Type | Description |
| ----------- | ------ | ---------------------------------- |
| `field` | `str` | The field to retrieve. |
| **RETURNS** | `list` | A list of the individual features. |
## MorphAnalysis.to_dict {#to_dict tag="method"}
@ -128,10 +120,9 @@ map.
> ```
| Name | Type | Description |
| ----------- | ------ | -----------------------------------------|
| ----------- | ------ | ---------------------------------------- |
| **RETURNS** | `dict` | The dict representation of the analysis. |
## MorphAnalysis.from_id {#from_id tag="classmethod"}
Create a morphological analysis from a given hash ID.
@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID.
| ------- | ------- | -------------------------------- |
| `vocab` | `Vocab` | The vocab. |
| `key` | `int` | The hash of the features string. |

View File

@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
## Morphologizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example

View File

@ -4,12 +4,11 @@ tag: class
source: spacy/morphology.pyx
---
Store the possible morphological analyses for a language, and index them
by hash. To save space on each token, tokens only know the hash of their
Store the possible morphological analyses for a language, and index them by
hash. To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated to
this class.
## Morphology.\_\_init\_\_ {#init tag="method"}
Create a Morphology object using the tag map, lemmatizer and exceptions.
@ -22,21 +21,18 @@ Create a Morphology object using the tag map, lemmatizer and exceptions.
> morphology = Morphology(strings, tag_map, lemmatizer)
> ```
| Name | Type | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- |
| `strings` | `StringStore` | The string store. |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
| `lemmatizer`| `Lemmatizer` | The lemmatizer. |
| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` |
| **RETURNS** | `Morphology` | The newly constructed object. |
| Name | Type | Description |
| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
| `strings` | `StringStore` | The string store. |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
| `lemmatizer` | `Lemmatizer` | The lemmatizer. |
| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` |
## Morphology.add {#add tag="method"}
Insert a morphological analysis in the morphology table, if not already
present. The morphological analysis may be provided in the UD FEATS format as a
string or in the tag map dictionary format. Returns the hash of the new
analysis.
Insert a morphological analysis in the morphology table, if not already present.
The morphological analysis may be provided in the UD FEATS format as a string or
in the tag map dictionary format. Returns the hash of the new analysis.
> #### Example
>
@ -46,10 +42,9 @@ analysis.
> assert hash == nlp.vocab.strings[feats]
> ```
| Name | Type | Description |
| ----------- | ------------------- | --------------------------- |
| `features` | `Union[Dict, str]` | The morphological features. |
| Name | Type | Description |
| ---------- | ------------------ | --------------------------- |
| `features` | `Union[Dict, str]` | The morphological features. |
## Morphology.get {#get tag="method"}
@ -63,33 +58,30 @@ analysis.
Get the FEATS string for the hash of the morphological analysis.
| Name | Type | Description |
| ----------- | ------ | --------------------------------------- |
| `morph` | int | The hash of the morphological analysis. |
| Name | Type | Description |
| ------- | ---- | --------------------------------------- |
| `morph` | int | The hash of the morphological analysis. |
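A short sketch pairing `add` and `get` (the FEATS string is a placeholder):

> ```python
> feats = "Feat1=Val1|Feat2=Val2"
> hash = nlp.vocab.morphology.add(feats)
> assert nlp.vocab.morphology.get(hash) == feats
> ```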
## Morphology.load_tag_map {#load_tag_map tag="method"}
Replace the current tag map with the provided tag map.
| Name | Type | Description |
| ----------- | ------------------ | ------------ |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
| Name | Type | Description |
| --------- | ----------------- | ------------ |
| `tag_map` | `Dict[str, Dict]` | The tag map. |
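A minimal sketch with a one-entry tag map (the mapping shown is a made-up placeholder):

> ```python
> tag_map = {"NN": {"POS": "NOUN"}}
> nlp.vocab.morphology.load_tag_map(tag_map)
> ```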
## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
Replace the current morphological exceptions with the provided exceptions.
| Name | Type | Description |
| ------------- | ------------------ | ----------------------------- |
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
| Name | Type | Description |
| ------------- | ----------------- | ----------------------------- |
| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. |
## Morphology.add_special_case {#add_special_case tag="method"}
Add a special-case rule to the morphological analyzer. Tokens whose tag and
orth match the rule will receive the specified properties.
Add a special-case rule to the morphological analyzer. Tokens whose tag and orth
match the rule will receive the specified properties.
> #### Example
>
@ -98,27 +90,24 @@ orth match the rule will receive the specified properties.
> morphology.add_special_case("DT", "the", attrs)
> ```
| Name | Type | Description |
| ----------- | ---- | ---------------------------------------------- |
| `tag_str` | str | The fine-grained tag. |
| `orth_str` | str | The token text. |
| `attrs` | dict | The features to assign for this token and tag. |
| Name | Type | Description |
| ---------- | ---- | ---------------------------------------------- |
| `tag_str` | str | The fine-grained tag. |
| `orth_str` | str | The token text. |
| `attrs` | dict | The features to assign for this token and tag. |
## Morphology.exc {#exc tag="property"}
The current morphological exceptions.
| Name | Type | Description |
| ---------- | ----- | --------------------------------------------------- |
| **YIELDS** | dict | The current dictionary of morphological exceptions. |
| Name | Type | Description |
| ---------- | ---- | --------------------------------------------------- |
| **YIELDS** | dict | The current dictionary of morphological exceptions. |
## Morphology.lemmatize {#lemmatize tag="method"}
TODO
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
Convert a string FEATS representation to a dictionary of features and values in
@ -132,11 +121,10 @@ the same format as the tag map.
> assert d == {"Feat1": "Val1", "Feat2": "Val2"}
> ```
| Name | Type | Description |
| ----------- | ---- | ------------------------------------------------------------- |
| Name | Type | Description |
| ----------- | ---- | ------------------------------------------------------------------ |
| `feats` | str | The morphological features in Universal Dependencies FEATS format. |
| **RETURNS** | dict | The morphological features as a dictionary. |
| **RETURNS** | dict | The morphological features as a dictionary. |
## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"}
@ -150,12 +138,11 @@ Convert a dictionary of features and values to a string FEATS representation.
> assert f == "Feat1=Val1|Feat2=Val2"
> ```
| Name | Type | Description |
| Name | Type | Description |
| ------------ | ----------------- | --------------------------------------------------------------------- |
| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. |
| **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. |
## Attributes {#attributes}
| Name | Type | Description |

View File

@ -35,12 +35,11 @@ be shown.
> matcher = PhraseMatcher(nlp.vocab)
> ```
| Name | Type | Description |
| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
| **RETURNS** | `PhraseMatcher` | The newly constructed object. |
| Name | Type | Description |
| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
| `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
## PhraseMatcher.\_\_call\_\_ {#call tag="method"}

View File

@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
## Pipe.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> optimizer = nlp.begin_training()
> optimizer = nlp.resume_training()
> losses = pipe.rehearse(examples, sgd=optimizer)
> ```

View File

@ -28,10 +28,9 @@ Create a new `Scorer`.
> scorer = Scorer(nlp)
> ```
| Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
| **RETURNS** | `Scorer` | The newly created object. |
| Name | Type | Description |
| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
## Scorer.score {#score tag="method"}

View File

@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
## SentenceRecognizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
>
> ```python
> senter = nlp.add_pipe("senter")
> optimizer = nlp.begin_training()
> optimizer = nlp.resume_training()
> losses = senter.rehearse(examples, sgd=optimizer)
> ```

View File

@ -18,15 +18,14 @@ Create a Span object from the slice `doc[start : end]`.
> assert [t.text for t in span] == ["it", "back", "!"]
> ```
| Name | Type | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `start` | int | The index of the first token of the span. |
| `end` | int | The index of the first token after the span. |
| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. |
| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
| **RETURNS** | `Span` | The newly constructed object. |
| Name | Type | Description |
| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `start` | int | The index of the first token of the span. |
| `end` | int | The index of the first token after the span. |
| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. |
| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
## Span.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -19,10 +19,9 @@ Create the `StringStore`.
> stringstore = StringStore(["apple", "orange"])
> ```
| Name | Type | Description |
| ----------- | ------------- | ------------------------------------------ |
| `strings` | iterable | A sequence of strings to add to the store. |
| **RETURNS** | `StringStore` | The newly constructed object. |
| Name | Type | Description |
| --------- | -------- | ------------------------------------------ |
| `strings` | iterable | A sequence of strings to add to the store. |
## StringStore.\_\_len\_\_ {#len tag="method"}

View File

@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
## Tagger.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
>
> ```python
> tagger = nlp.add_pipe("tagger")
> optimizer = nlp.begin_training()
> optimizer = nlp.resume_training()
> losses = tagger.rehearse(examples, sgd=optimizer)
> ```

View File

@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
## TextCategorizer.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example
@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental.
>
> ```python
> textcat = nlp.add_pipe("textcat")
> optimizer = nlp.begin_training()
> optimizer = nlp.resume_training()
> losses = textcat.rehearse(examples, sgd=optimizer)
> ```

View File

@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
## Tok2Vec.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. Return an
Initialize the pipe for training, using data examples if available. Returns an
[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
> #### Example

View File

@ -17,12 +17,11 @@ Construct a `Token` object.
> assert token.text == "Give"
> ```
| Name | Type | Description |
| ----------- | ------- | ------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `doc` | `Doc` | The parent document. |
| `offset` | int | The index of the token within the document. |
| **RETURNS** | `Token` | The newly constructed object. |
| Name | Type | Description |
| -------- | ------- | ------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `doc` | `Doc` | The parent document. |
| `offset` | int | The index of the token within the document. |
## Token.\_\_len\_\_ {#len tag="method"}
@ -393,73 +392,73 @@ The L2 norm of the token's vector representation.
## Attributes {#attributes}
| Name | Type | Description |
| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. |
| `text_with_ws` | str | Text content, with trailing space character if present. |
| `whitespace_` | str | Trailing space character if present. |
| `orth` | int | ID of the verbatim text content. |
| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. |
| `head` | `Token` | The syntactic parent, or "governor", of this token. |
| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. |
| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. |
| `i` | int | The index of the token within the parent document. |
| `ent_type` | int | Named entity type. |
| `ent_type_` | str | Named entity type. |
| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | str | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `lemma` | int | Base form of the token, with no inflectional suffixes. |
| `lemma_` | str | Base form of the token, with no inflectional suffixes. |
| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
| `lower` | int | Lowercase form of the token. |
| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. |
| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. |
| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. |
| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. |
| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. |
| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. |
| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. |
| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. |
| `is_punct` | bool | Is the token punctuation? |
| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? |
| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? |
| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. |
| `is_bracket` | bool | Is the token a bracket? |
| `is_quote` | bool | Is the token a quotation mark? |
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? |
| `like_url` | bool | Does the token resemble a URL? |
| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
| `like_email` | bool | Does the token resemble an email address? |
| `is_oov` | bool | Does the token have a word vector? |
| `is_stop` | bool | Is the token part of a "stop list"? |
| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `tag` | int | Fine-grained part-of-speech. |
| `tag_` | str | Fine-grained part-of-speech. |
| `morph` | `MorphAnalysis` | Morphological analysis. |
| `morph_` | str | Morphological analysis in UD FEATS format. |
| `dep` | int | Syntactic dependency relation. |
| `dep_` | str | Syntactic dependency relation. |
| `lang` | int | Language of the parent document's vocabulary. |
| `lang_` | str | Language of the parent document's vocabulary. |
| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
| `idx` | int | The character offset of the token within the parent document. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `cluster` | int | Brown cluster ID. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
| Name | Type | Description |
| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. |
| `sent` <Tag variant="new">2.0.12</Tag> | `Span` | The sentence span that this token is a part of. |
| `text` | str | Verbatim text content. |
| `text_with_ws` | str | Text content, with trailing space character if present. |
| `whitespace_` | str | Trailing space character if present. |
| `orth` | int | ID of the verbatim text content. |
| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `tensor` <Tag variant="new">2.1.7</Tag>      | `ndarray`       | The token's slice of the parent `Doc`'s tensor.                                                                                                                                                                                                                   |
| `head` | `Token` | The syntactic parent, or "governor", of this token. |
| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. |
| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. |
| `i` | int | The index of the token within the parent document. |
| `ent_type` | int | Named entity type. |
| `ent_type_` | str | Named entity type. |
| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | str | Knowledge base ID that refers to the named entity this token is a part of, if any. |
| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
| `lemma` | int | Base form of the token, with no inflectional suffixes. |
| `lemma_` | str | Base form of the token, with no inflectional suffixes. |
| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
| `lower` | int | Lowercase form of the token. |
| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
| `shape`                                      | int             | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`.      |
| `shape_`                                     | str             | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`.      |
| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. |
| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. |
| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. |
| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. |
| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. |
| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. |
| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. |
| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. |
| `is_punct` | bool | Is the token punctuation? |
| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? |
| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? |
| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. |
| `is_bracket` | bool | Is the token a bracket? |
| `is_quote` | bool | Is the token a quotation mark? |
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? |
| `like_url` | bool | Does the token resemble a URL? |
| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
| `like_email` | bool | Does the token resemble an email address? |
| `is_oov`                                     | bool            | Is the token out-of-vocabulary, i.e. does it not have a word vector?                                                                                                                                                                                              |
| `is_stop` | bool | Is the token part of a "stop list"? |
| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
| `tag` | int | Fine-grained part-of-speech. |
| `tag_` | str | Fine-grained part-of-speech. |
| `morph` | `MorphAnalysis` | Morphological analysis. |
| `morph_` | str | Morphological analysis in UD FEATS format. |
| `dep` | int | Syntactic dependency relation. |
| `dep_` | str | Syntactic dependency relation. |
| `lang` | int | Language of the parent document's vocabulary. |
| `lang_` | str | Language of the parent document's vocabulary. |
| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
| `idx` | int | The character offset of the token within the parent document. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
| `cluster` | int | Brown cluster ID. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |

View File

@ -34,16 +34,15 @@ the
> tokenizer = nlp.tokenizer
> ```
| Name | Type | Description |
| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
| **RETURNS** | `Tokenizer` | The newly constructed object. |
| Name | Type | Description |
| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}

View File

@ -0,0 +1,107 @@
---
title: Transformer
teaser: Pipeline component for multi-task learning with transformer models
tag: class
source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
new: 3
api_base_class: /api/pipe
api_string_name: transformer
---
> #### Installation
>
> ```bash
> $ pip install spacy-transformers
> ```
<Infobox title="Important note" variant="warning">
This component is available via the extension package
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It
exposes the component via entry points, so if you have the package installed,
using `factory = "transformer"` in your
[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will
work out-of-the-box.
</Infobox>
This pipeline component lets you use transformer models in your pipeline. The
component assigns the output of the transformer to the Doc's extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
token, the spaCy token receives the sum of their values. To access the values,
you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For
more details, see the [usage documentation](/usage/transformers).
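
For example, once the component has run you can read the raw output off the custom attribute. This is only a sketch: it assumes `nlp` is a loaded pipeline that includes a `transformer` component, and the layout of the data object is documented further down this page.

```python
import spacy

# The pipeline name is a placeholder; any pipeline with a "transformer"
# component will work
nlp = spacy.load("en_core_trf_lg")
doc = nlp("This is a sentence.")
trf_data = doc._.trf_data
# The last tensor holds the hidden states that are summed into Doc.tensor
last_hidden = trf_data.tensors[-1]
```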
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). See the
[model architectures](/api/architectures) documentation for details on the
architectures and their arguments and hyperparameters.
> #### Example
>
> ```python
> from spacy_transformers import Transformer, DEFAULT_CONFIG
>
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```
| Setting | Type | Description | Default |
| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- |
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
| `annotation_setter` | Callable | <!-- TODO: --> | [`null_annotation_setter`](/api/transformer#null_annotation_setter) |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
```
## Transformer.\_\_init\_\_ {#init tag="method"}
> #### Example
>
> ```python
> # Construction via add_pipe with default model
> trf = nlp.add_pipe("transformer")
>
> # Construction via add_pipe with custom model
> config = {"model": {"@architectures": "my_transformer"}}
> trf = nlp.add_pipe("transformer", config=config)
>
> # Construction from class
> from spacy_transformers import Transformer
> trf = Transformer(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `annotation_setter` | `Callable` | <!-- TODO: --> |
| _keyword-only_ | | |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
<!-- TODO: document rest -->
## TransformerData {#transformerdata tag="dataclass"}
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
## Custom attributes {#custom-attributes}
The component sets the following
[custom extension attributes](/usage/processing-pipelines#custom-components-attributes):
| Name | Type | Description |
| -------------- | ----------------- | -------------- |
| `Doc.trf_data` | `TransformerData` | <!-- TODO: --> |

View File

@ -37,7 +37,6 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. |
| `name` | str | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}

View File

@ -31,7 +31,6 @@ Create the vocabulary.
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. |
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"}

View File

@ -3,4 +3,154 @@ title: Transformers
teaser: Using transformer models like BERT in spaCy
---
TODO: ...
spaCy v3.0 lets you use almost **any statistical model** to power your pipeline.
You can use models implemented in a variety of frameworks, including TensorFlow,
PyTorch and MXNet. To keep things sane, spaCy expects models from these
frameworks to be wrapped with a common interface, using our machine learning
library [Thinc](https://thinc.ai). A transformer model is just a statistical
model, so the
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package
actually has very little work to do: we just have to provide a few functions
that do the required plumbing. We also provide a pipeline component,
[`Transformer`](/api/transformer), that lets you do multi-task learning and lets
you save the transformer outputs for later use.
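
As a rough illustration of what that wrapping looks like, here's a sketch using Thinc's PyTorch wrapper; the layer and its dimensions are just stand-ins, not part of spaCy's transformer integration:

```python
from thinc.api import PyTorchWrapper
import torch

# Any PyTorch module can be wrapped as a Thinc Model and composed with
# other layers in a spaCy pipeline
torch_layer = torch.nn.Linear(768, 128)
model = PyTorchWrapper(torch_layer)
```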
<Project id="en_core_bert">
Try out a BERT-based model pipeline using this project template: swap in your
data, edit the settings and hyperparameters and train, evaluate, package and
visualize your model.
</Project>
<!-- TODO: the text below has been copied from the spacy-transformers repo and needs to be updated and adjusted
### Training usage
The recommended workflow for training is to use spaCy's
[config system](/usage/training#config), usually via the
[`spacy train`](/api/cli#train) command. The config system lets you describe a
tree of objects by referring to creation functions, including functions you
register yourself. Here's a config snippet for the `Transformer` component,
along with matching Python code.
```ini
[nlp]
lang = "en"
pipeline = ["transformer"]
[components.transformer]
factory = "transformer"
extra_annotation_setter = null
max_batch_size = 32
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "bert-base-cased"
tokenizer_config = {"use_fast": true}
[components.transformer.model.get_spans]
@span_getters = "get_doc_spans.v1"
```
```python
from spacy_transformers import Transformer
trf = Transformer(
    nlp.vocab,
    TransformerModel(
        "bert-base-cased",
        get_spans=get_doc_spans,
        tokenizer_config={"use_fast": True},
    ),
    annotation_setter=null_annotation_setter,
    max_batch_size=32,
)
```
The `components.transformer` block adds the `transformer` component to the
pipeline, and the `components.transformer.model` block describes the creation of
a Thinc [`Model`](https://thinc.ai/docs/api-model) object that will be passed
into the component. The block names a function registered in the
`@architectures` registry. This function will be looked up and called using the
provided arguments. You're not limited to just that function --- you can write
your own or use someone else's. The only limitation is that it must return an
object of type `Model[List[Doc], FullTransformerBatch]`: that is, a Thinc model
that takes a list of `Doc` objects, and returns a `FullTransformerBatch` object
with the transformer data.
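
If you do write your own, a registration sketch could look like the following; the registry name and the function are hypothetical, and the body is left open:

```python
from thinc.api import Model
import spacy


@spacy.registry.architectures("my_transformer.v1")
def create_my_transformer() -> Model:
    # Must produce a Model[List[Doc], FullTransformerBatch], for instance by
    # wrapping or composing spacy-transformers' TransformerModel
    ...
```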
The same idea applies to task models that power the downstream components. Most
of spaCy's built-in model creation functions support a `tok2vec` argument, which
should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This is
where we'll plug in our transformer model, using the `Tok2VecTransformer` layer,
which sneakily delegates to the `Transformer` pipeline component.
```ini
[nlp]
lang = "en"
pipeline = ["ner"]
[components.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy-transformers.Tok2VecListener.v1"
grad_factor = 1.0
[nlp.pipeline.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
```
The `Tok2VecListener` layer expects a `pooling` layer, which needs to be of type
`Model[Ragged, Floats2d]`. This layer determines how the vector for each spaCy
token will be computed from the zero or more source rows the token is aligned
against. Here we use the `reduce_mean` layer, which averages the wordpiece rows.
We could instead use `reduce_last`, `reduce_max`, or a custom function you write
yourself.
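
For example, a drop-in alternative pooling layer, sketched with Thinc's built-in reducers:

```python
from thinc.api import reduce_max

# Any Model[Ragged, Floats2d] works here; reduce_max keeps the strongest
# wordpiece activation per spaCy token instead of averaging them
pooling = reduce_max()
```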
You can have multiple components all listening to the same transformer model,
and all passing gradients back to it. By default, all of the gradients will be
equally weighted. You can control this with the `grad_factor` setting, which
lets you reweight the gradients from the different listeners. For instance,
setting `grad_factor = 0` would disable gradients from one of the listeners,
while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
custom learning rate for each component. Instead of a constant, you can also
provide a schedule, allowing you to freeze the shared parameters at the start of
training.
### Runtime usage
Transformer models can be used as drop-in replacements for other types of neural
networks, so your spaCy pipeline can include them in a way that's completely
invisible to the user. Users will download, load and use the model in the
standard way, like any other spaCy pipeline.
Instead of using the transformers as subnetworks directly, you can also use them
via the [`Transformer`](/api/transformer) pipeline component. This sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformers outputs at runtime via the
`doc._.trf_data` extension attribute. You can also customize how the
`Transformer` object sets annotations onto the `Doc`, by customizing the
`Transformer.annotation_setter` object. This callback will be called with the
raw input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need.
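
Before the standard runtime example below, here's a rough sketch of such a custom callback; the callback signature and the extension attribute are assumptions based on the description above, not a fixed API:

```python
from spacy.tokens import Doc

# Hypothetical custom attribute to store the raw output on
Doc.set_extension("my_trf_output", default=None)


def my_annotation_setter(docs, trf_data):
    # Called with the batch of Doc objects and the raw transformer data
    for doc in docs:
        doc._.my_trf_output = trf_data
```

You could then pass this function as the `annotation_setter` when constructing the component.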
```python
import spacy
nlp = spacy.load("en_core_trf_lg")
for doc in nlp.pipe(["some text", "some other text"]):
    # doc._.trf_data holds the transformer output for this doc; the last
    # tensor contains the final hidden states
    tokvecs = doc._.trf_data.tensors[-1]
```
The `nlp` object in this example is just like any other spaCy pipeline
-->

View File

@ -31,18 +31,35 @@ raise errors. Many of them were also mostly internals. If you've been working
with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
on them.
| Removed | Replacement |
| ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer), |
| Removed | Replacement |
| ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
| `SentenceSegmenter` hook, `SimilarityHook`                                                                                | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer)  |
## Migrating from v2.x {#migrating}
### Downloading and loading models {#migrating-downloading-models}
Model symlinks and shortcuts like `en` are now officially deprecated. There are
[many different models](/models) with different capabilities and not just one
"English model". In order to download and load a model, you should always use
its full name, for instance `en_core_web_sm`.
```diff
- python -m spacy download en
+ python -m spacy download en_core_web_sm
```
```diff
- nlp = spacy.load("en")
+ nlp = spacy.load("en_core_web_sm")
```
### Custom pipeline components and factories {#migrating-pipeline-components}
Custom pipeline components now have to be registered explicitly using the
@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model.
<!-- TODO: write -->
#### Training via the Python API {#migrating-training-python}
<!-- TODO: this should explain the GoldParse -> Example stuff -->
#### Packaging models {#migrating-training-packaging}
The [`spacy package`](/api/cli#package) command now automatically builds the

View File

@ -81,6 +81,7 @@
"items": [
{ "text": "Tokenizer", "url": "/api/tokenizer" },
{ "text": "Tok2Vec", "url": "/api/tok2vec" },
{ "text": "Transformer", "url": "/api/transformer" },
{ "text": "Lemmatizer", "url": "/api/lemmatizer" },
{ "text": "Morphologizer", "url": "/api/morphologizer" },
{ "text": "Tagger", "url": "/api/tagger" },

View File

@ -33,11 +33,12 @@ const Link = ({
const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest)
const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest)
const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest)
const sourceWithText = (isSource || isApi) && isString(children)
const withIcon = isApi || isArch || isSource
const sourceWithText = withIcon && isString(children)
const linkClassNames = classNames(classes.root, className, {
[classes.hidden]: hidden,
[classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText,
[classes.withIcon]: isApi || isSource || isArch,
[classes.nowrap]: (withIcon && !sourceWithText) || isArch,
[classes.withIcon]: withIcon,
})
const Wrapper = ws ? Whitespace : Fragment
const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null

View File

@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text'
* @returns {string} - URL to the file on GitHub.
*/
export function github(filepath, branch = 'master') {
if (filepath && filepath.startsWith('github.com')) return `https://${filepath}`
const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : ''
return `https://github.com/${repo}${path}`
}