diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 427c00caa..d23f70bee 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,7 +1,15 @@ +from typing import Union, List, Iterable, Iterator, TYPE_CHECKING +from pathlib import Path import random + from .. import util from .example import Example from ..tokens import DocBin, Doc +from ..vocab import Vocab + +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from ..language import Language # noqa: F401 class Corpus: @@ -11,20 +19,23 @@ class Corpus: DOCS: https://spacy.io/api/corpus """ - def __init__(self, train_loc, dev_loc, limit=0): + def __init__( + self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0 + ) -> None: """Create a Corpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. - limit (int): Max. number of examples returned - RETURNS (Corpus): The newly created object. + limit (int): Max. number of examples returned. 
+ + DOCS: https://spacy.io/api/corpus#init """ self.train_loc = train_loc self.dev_loc = dev_loc self.limit = limit @staticmethod - def walk_corpus(path): + def walk_corpus(path: Union[str, Path]) -> List[Path]: path = util.ensure_path(path) if not path.is_dir(): return [path] @@ -43,7 +54,9 @@ class Corpus: locs.append(path) return locs - def _make_example(self, nlp, reference, gold_preproc): + def _make_example( + self, nlp: "Language", reference: Doc, gold_preproc: bool + ) -> Example: if gold_preproc or reference.has_unknown_spaces: return Example( Doc( @@ -56,7 +69,9 @@ class Corpus: else: return Example(nlp.make_doc(reference.text), reference) - def make_examples(self, nlp, reference_docs, max_length=0): + def make_examples( + self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0 + ) -> Iterator[Example]: for reference in reference_docs: if len(reference) == 0: continue @@ -69,7 +84,9 @@ class Corpus: elif max_length == 0 or len(ref_sent) < max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) - def make_examples_gold_preproc(self, nlp, reference_docs): + def make_examples_gold_preproc( + self, nlp: "Language", reference_docs: Iterable[Doc] + ) -> Iterator[Example]: for reference in reference_docs: if reference.is_sentenced: ref_sents = [sent.as_doc() for sent in reference.sents] @@ -80,7 +97,9 @@ class Corpus: if len(eg.x): yield eg - def read_docbin(self, vocab, locs): + def read_docbin( + self, vocab: Vocab, locs: Iterable[Union[str, Path]] + ) -> Iterator[Doc]: """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -96,8 +115,14 @@ class Corpus: if self.limit >= 1 and i >= self.limit: break - def count_train(self, nlp): - """Returns count of words in train examples""" + def count_train(self, nlp: "Language") -> int: + """Returns count of words in train examples. + + nlp (Language): The current nlp object. + RETURNS (int): The word count. 
+ + DOCS: https://spacy.io/api/corpus#count_train + """ n = 0 i = 0 for example in self.train_dataset(nlp): @@ -108,8 +133,25 @@ class Corpus: return n def train_dataset( - self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs - ): + self, + nlp: "Language", + *, + shuffle: bool = True, + gold_preproc: bool = False, + max_length: int = 0 + ) -> Iterator[Example]: + """Yield examples from the training data. + + nlp (Language): The current nlp object. + shuffle (bool): Whether to shuffle the examples. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + max_length (int): Maximum document length. Longer documents will be + split into sentences, if sentence boundaries are available. 0 for + no limit. + YIELDS (Example): The examples. + + DOCS: https://spacy.io/api/corpus#train_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) @@ -120,7 +162,17 @@ class Corpus: random.shuffle(examples) yield from examples - def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs): + def dev_dataset( + self, nlp: "Language", *, gold_preproc: bool = False + ) -> Iterator[Example]: + """Yield examples from the development data. + + nlp (Language): The current nlp object. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + YIELDS (Example): The examples. + + DOCS: https://spacy.io/api/corpus#dev_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1cfb681f4..adba79686 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,6 @@ class Lemmatizer: lookups (Lookups): The lookups object containing the (optional) tables "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". 
- RETURNS (Lemmatizer): The newly constructed object. """ self.lookups = lookups if lookups is not None else Lookups() self.is_base_form = is_base_form diff --git a/spacy/lookups.py b/spacy/lookups.py index bf71ba877..7862b9805 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -52,8 +52,6 @@ class Lookups: def __init__(self) -> None: """Initialize the Lookups object. - RETURNS (Lookups): The newly created object. - DOCS: https://spacy.io/api/lookups#init """ self._tables = {} @@ -202,7 +200,6 @@ class Table(OrderedDict): data (dict): The dictionary. name (str): Optional table name for reference. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict """ @@ -215,7 +212,6 @@ class Table(OrderedDict): name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.init """ diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ddeeedd06..716af9909 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -36,7 +36,6 @@ cdef class DependencyMatcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (DependencyMatcher): The newly constructed object. """ size = 20 # TODO: make matcher work with validation diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6c8ee4204..706cfdd68 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -37,7 +37,6 @@ cdef class Matcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (Matcher): The newly constructed object. 
""" self._extra_predicates = [] self._patterns = {} diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index a2141dc02..060c4d37f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -32,7 +32,6 @@ cdef class PhraseMatcher: vocab (Vocab): The shared vocabulary. attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. - RETURNS (PhraseMatcher): The newly constructed object. DOCS: https://spacy.io/api/phrasematcher#init """ diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 96a5d3d67..d6ce86e78 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,6 @@ class EntityRuler: overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. - RETURNS (EntityRuler): The newly constructed object. DOCS: https://spacy.io/api/entityruler#init """ diff --git a/spacy/scorer.py b/spacy/scorer.py index 2bbf453e7..702c74521 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -72,7 +72,6 @@ class Scorer: def __init__(self, nlp=None, **cfg): """Initialize the Scorer. - RETURNS (Scorer): The newly created object. DOCS: https://spacy.io/api/scorer#init """ diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9e584ce8a..136eda9ff 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -97,7 +97,6 @@ cdef class StringStore: """Create the StringStore. strings (iterable): A sequence of unicode strings to add to the store. - RETURNS (StringStore): The newly constructed object. """ self.mem = Pool() self._map = PreshMap() diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 793bb5a25..858a93ce5 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -50,7 +50,6 @@ cdef class Tokenizer: recognised as tokens. 
url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. - RETURNS (Tokenizer): The newly constructed object. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 3943767a0..b89ce3bdd 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): """Retokenize the document, such that the token at `doc[token_index]` is split into tokens with the orth 'orths' token_index(int): token index of the token to split. + orths: IDs of the verbatim text content of the tokens to create **attributes: Attributes to assign to each of the newly created tokens. By default, attributes are inherited from the original token. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 96245a0e1..0a5fd0c59 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,10 +1,12 @@ +from typing import Iterable, Iterator import numpy import zlib import srsly from thinc.api import NumpyOps +from .doc import Doc +from ..vocab import Vocab from ..compat import copy_reg -from ..tokens import Doc from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors @@ -44,13 +46,18 @@ class DocBin: document from the DocBin. """ - def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): + def __init__( + self, + attrs: Iterable[str] = ALL_ATTRS, + store_user_data: bool = False, + docs=Iterable[Doc], + ) -> None: """Create a DocBin object to hold serialized annotations. - attrs (list): List of attributes to serialize. 'orth' and 'spacy' are - always serialized, so they're not required. Defaults to None. + attrs (Iterable[str]): List of attributes to serialize. 'orth' and + 'spacy' are always serialized, so they're not required. store_user_data (bool): Whether to include the `Doc.user_data`. 
- RETURNS (DocBin): The newly constructed object. + docs (Iterable[Doc]): Docs to add. DOCS: https://spacy.io/api/docbin#init """ @@ -68,11 +75,11 @@ class DocBin: for doc in docs: self.add(doc) - def __len__(self): + def __len__(self) -> int: """RETURNS: The number of Doc objects added to the DocBin.""" return len(self.tokens) - def add(self, doc): + def add(self, doc: Doc) -> None: """Add a Doc's annotations to the DocBin for serialization. doc (Doc): The Doc object to add. @@ -100,7 +107,7 @@ class DocBin: if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) - def get_docs(self, vocab): + def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. vocab (Vocab): The shared vocab. @@ -125,7 +132,7 @@ class DocBin: doc.user_data.update(user_data) yield doc - def merge(self, other): + def merge(self, other: "DocBin") -> None: """Extend the annotations of this DocBin with the annotations from another. Will raise an error if the pre-defined attrs of the two DocBins don't match. @@ -144,7 +151,7 @@ class DocBin: if self.store_user_data: self.user_data.extend(other.user_data) - def to_bytes(self): + def to_bytes(self) -> bytes: """Serialize the DocBin's annotations to a bytestring. RETURNS (bytes): The serialized DocBin. @@ -156,7 +163,6 @@ class DocBin: lengths = [len(tokens) for tokens in self.tokens] tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) - msg = { "version": self.version, "attrs": self.attrs, @@ -171,7 +177,7 @@ class DocBin: msg["user_data"] = self.user_data return zlib.compress(srsly.msgpack_dumps(msg)) - def from_bytes(self, bytes_data): + def from_bytes(self, bytes_data: bytes) -> "DocBin": """Deserialize the DocBin's annotations from a bytestring. bytes_data (bytes): The data to load from. 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index adc7059e5..0ba5abb52 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -173,7 +173,6 @@ cdef class Doc: words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. DOCS: https://spacy.io/api/doc#init """ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 203308749..5b55d8e88 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -94,7 +94,6 @@ cdef class Span: kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. - RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/span#init """ diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 0cc7409a7..bcea87e67 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -58,7 +58,6 @@ cdef class Vectors: data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. - RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init """ diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2115789e6..f41ad2356 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -74,7 +74,6 @@ cdef class Vocab: lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. - RETURNS (Vocab): The newly constructed object. 
""" lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index abc2b7bfa..a87c2a1e8 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library source: spacy/ml/models menu: - ['Tok2Vec', 'tok2vec'] + - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] - ['Text Classification', 'textcat'] - ['Entity Linking', 'entitylinker'] @@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to [`registry`](/api/top-level#registry), [custom models](/usage/training#custom-models) usage etc. -## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}} +## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"} ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} @@ -21,12 +22,14 @@ TODO: intro and how architectures work, link to ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} +## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} + +### spacy-transformers.TransformerModel.v1 {#TransformerModel} + ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser} - - > #### Example Config > > ```ini diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 3256849c3..38e19129d 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format. Create a `Corpus`. The input data can be a file or a directory of files. -| Name | Type | Description | -| ----------- | ------------ | ---------------------------------------------------------------- | -| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). 
| -| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | -| `limit` | int | Maximum number of examples returned. | -| **RETURNS** | `Corpus` | The newly constructed object. | +> #### Example +> +> ```python +> from spacy.gold import Corpus +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> ``` - - -## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} - -## Corpus.make_examples {#make_examples tag="method"} - -## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} - -## Corpus.read_docbin {#read_docbin tag="method"} - -## Corpus.count_train {#count_train tag="method"} +| Name | Type | Description | +| ------- | ------------ | ---------------------------------------------------------------- | +| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | +| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | +| `limit` | int | Maximum number of examples returned. `0` for no limit (default). | ## Corpus.train_dataset {#train_dataset tag="method"} +Yield examples from the training data. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> train_data = corpus.train_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default).  
| +| **YIELDS** | `Example` | The examples. | + ## Corpus.dev_dataset {#dev_dataset tag="method"} + +Yield examples from the development data. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> dev_data = corpus.dev_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ---------------------------------------------------------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| **YIELDS** | `Example` | The examples. | + +## Corpus.count_train {#count_train tag="method"} + +Get the word count of all training examples. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> word_count = corpus.count_train(nlp) +> ``` + +| Name | Type | Description | +| ----------- | ---------- | ------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| **RETURNS** | int | The word count. | + + diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 9dea04284..6e54fb112 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -87,13 +87,12 @@ Create a `Token` object from a `TokenC*` pointer. > token = Token.cinit(&doc.c[3], doc, 3) > ``` -| Name | Type | Description | -| ----------- | --------- | ------------------------------------------------------------ | -| `vocab` | `Vocab` | A reference to the shared `Vocab`. | -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | -| `offset` | `int` | The offset of the token within the document. | -| `doc` | `Doc` | The parent document. | -| **RETURNS** | `Token` | The newly constructed object. 
| +| Name | Type | Description | +| -------- | --------- | ------------------------------------------------------------ | +| `vocab` | `Vocab` | A reference to the shared `Vocab`. | +| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | +| `offset` | `int` | The offset of the token within the document. | +| `doc` | `Doc` | The parent document. | ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 425b669ce..f6ed7492d 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and ## DependencyParser.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index b5871f2ab..69608c958 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,12 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `words` | iterable | A list of strings to add to the container. | -| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | -| **RETURNS** | `Doc` | The newly constructed object. 
| +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `words` | iterable | A list of strings to add to the container. | +| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 07f95f91d..65d1153d1 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations. > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) > ``` -| Argument | Type | Description | -| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | -| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | -| **RETURNS** | `DocBin` | The newly constructed object. 
| +| Argument | Type | Description | +| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | +| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | +| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. | ## DocBin.\_\len\_\_ {#len tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b2b1eec32..c29f0326c 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 63404e087..b1d40a9c3 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/entityrecognizer#call) and ## EntityRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 0d06c79a1..e6299fc31 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -37,7 +37,6 @@ both documents. | `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | | _keyword-only_ | | | | `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | -| **RETURNS** | `Example` | The newly constructed object. | ## Example.from_dict {#from_dict tag="classmethod"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index f088815fd..7b2c4edf4 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -27,11 +27,10 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ---------------------- | --------------- | ---------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. | -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. | +| Name | Type | Description | +| ---------------------- | ------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} @@ -255,7 +254,6 @@ but instead these objects are returned by the | `entity_freq` | float | The entity frequency as recorded in the KB. 
| | `alias_hash` | int | The hash of the textual mention or alias. | | `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | -| **RETURNS** | `Candidate` | The newly constructed object. | ## Candidate attributes {#candidate_attributes} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d685c014b..0f7797d7f 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add your own processing pipeline components that take a `Doc` object, modify it and return it. +## Language.\_\_init\_\_ {#init tag="method"} + +Initialize a `Language` object. + +> #### Example +> +> ```python +> # Construction from subclass +> from spacy.lang.en import English +> nlp = English() +> +> # Construction from scratch +> from spacy.vocab import Vocab +> from spacy.language import Language +> nlp = Language(Vocab()) +> ``` + +| Name | Type | Description | +| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | +| _keyword-only_ | | | +| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | +| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | +| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | + +## Language.from_config {#from_config tag="classmethod"} + +Create a `Language` object from a loaded config. Will set up the tokenizer and +language data, add pipeline components based on the pipeline and components +defined in the config and validate the results. If no config is provided, the +default config of the given language is used. 
This is also how spaCy loads a +model under the hood based on its [`config.cfg`](/api/data-formats#config). + +> #### Example +> +> ```python +> from thinc.api import Config +> from spacy.language import Language +> +> config = Config().from_disk("./config.cfg") +> nlp = Language.from_config(config) +> ``` + +| Name | Type | Description | +| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | +| _keyword-only_ | | +| `disable` | `Iterable[str]` | List of pipeline component names to disable. | +| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | +| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | +| **RETURNS** | `Language` | The initialized object. | + ## Language.component {#component tag="classmethod" new="3"} Register a custom pipeline component under a given name. This allows @@ -101,57 +153,6 @@ examples, see the | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | -## Language.\_\_init\_\_ {#init tag="method"} - -Initialize a `Language` object. 
- -> #### Example -> -> ```python -> from spacy.vocab import Vocab -> from spacy.language import Language -> nlp = Language(Vocab()) -> -> from spacy.lang.en import English -> nlp = English() -> ``` - -| Name | Type | Description | -| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | -| _keyword-only_ | | | -| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | -| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | -| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | -| **RETURNS** | `Language` | The newly constructed object. | - -## Language.from_config {#from_config tag="classmethod"} - -Create a `Language` object from a loaded config. Will set up the tokenizer and -language data, add pipeline components based on the pipeline and components -define in the config and validate the results. If no config is provided, the -default config of the given language is used. This is also how spaCy loads a -model under the hood based on its [`config.cfg`](/api/data-formats#config). - -> #### Example -> -> ```python -> from thinc.api import Config -> from spacy.language import Language -> -> config = Config().from_disk("./config.cfg") -> nlp = Language.from_config(config) -> ``` - -| Name | Type | Description | -| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. 
| -| _keyword-only_ | | -| `disable` | `Iterable[str]` | List of pipeline component names to disable. | -| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | -| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** | `Language` | The initialized object. | - ## Language.\_\_call\_\_ {#call tag="method"} Apply the pipeline to some text. The text can span multiple sentences, and can @@ -164,11 +165,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ----------- | --------------------------------------------------------------------------------- | -| `text` | str | The text to be processed. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. | +| Name | Type | Description | +| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | +| `text` | str | The text to be processed. | +| _keyword-only_ | | | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. | ## Language.pipe {#pipe tag="method"} @@ -183,15 +186,57 @@ more efficient than processing texts one-by-one. 
> assert doc.is_parsed > ``` -| Name | Type | Description | -| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | `Iterable[str]` | A sequence of strings. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. | +| Name | Type | Description | +| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | `Iterable[str]` | A sequence of strings. | +| _keyword-only_ | | | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. 
| +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | +| **YIELDS** | `Doc` | Documents in the order of the original text. | + +## Language.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. Returns an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. + +> #### Example +> +> ```python +> optimizer = nlp.begin_training(get_examples) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Language.resume_training {#resume_training tag="method,experimental" new="3"} + +Continue training a pretrained model. Create and return an optimizer, and +initialize "rehearsal" for any pipeline component that has a `rehearse` method. +Rehearsal is used to prevent models from "forgetting" their initialized +"knowledge". To perform rehearsal, collect samples of text you want the models +to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with +a batch of [Example](/api/example) objects. 
+ +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | ## Language.update {#update tag="method"} @@ -206,15 +251,37 @@ Update the models in the pipeline. > nlp.update([example], sgd=optimizer) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | -| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Type | Description | +| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. 
| +| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | + +## Language.rehearse {#rehearse tag="method,experimental"} + +Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the +current model to make predictions similar to an initial model, to try to address +the "catastrophic forgetting" problem. This feature is experimental. + +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> losses = nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Language.evaluate {#evaluate tag="method"} @@ -227,33 +294,15 @@ Evaluate a model's pipeline components. > print(scores) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. 
If not passed in, a new one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. | - -## Language.begin_training {#begin_training tag="method"} - -Allocate models, pre-process training data and acquire an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). - -> #### Example -> -> ```python -> optimizer = nlp.begin_training(get_examples) -> ``` - -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ | -| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | -| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `**cfg` | - | Config parameters (sent to all components). | -| **RETURNS** | `Optimizer` | An optimizer. | +| Name | Type | Description | +| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `verbose` | bool | Print debugging information. | +| `batch_size` | int | The batch size to use. | +| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. 
| +| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | ## Language.use_params {#use_params tag="contextmanager, method"} @@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use | ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory_name` | str | Name of the registered component factory. | | `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | +| _keyword-only_ | | | | `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | | `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | **RETURNS** | callable | The pipeline component. | @@ -418,10 +468,13 @@ Replace a component in the pipeline. > nlp.replace_pipe("parser", my_custom_parser) > ``` -| Name | Type | Description | -| ----------- | -------- | --------------------------------- | -| `name` | str | Name of the component to replace. | -| `component` | callable | The pipeline component to insert. | +| Name | Type | Description | +| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | Name of the component to replace. | +| `component` | callable | The pipeline component to insert. | +| _keyword-only_ | | | +| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. 
| +| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -492,11 +545,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------------------ | -| `disable` | str / list | Name(s) of pipeline components to disable. | -| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------------------ | +| _keyword-only_ | | | +| `disable` | str / list | Name(s) of pipeline components to disable. | +| `enable` | str / list | Name(s) of pipeline components that will not be disabled. | +| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -767,8 +821,8 @@ serialization by passing in the string names via the `exclude` argument. The `FactoryMeta` contains the information about the component and its default provided by the [`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory) decorator. It's created whenever a -component is added to the pipeline and stored on the `Language` class for each -component instance and factory instance. +component is defined and stored on the `Language` class for each component +instance and factory instance. 
| Name | Type | Description | | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 237bfa468..73f8aa71f 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized. | Name | Type | Description | | -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | -| **RETURNS** | `Lemmatizer` | The newly created object. | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index b39664a55..625a26412 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -13,11 +13,10 @@ lemmatization depends on the part-of-speech tag). Create a `Lexeme` object. -| Name | Type | Description | -| ----------- | -------- | ----------------------------- | -| `vocab` | `Vocab` | The parent vocabulary. | -| `orth` | int | The orth id of the lexeme. | -| **RETURNS** | `Lexeme` | The newly constructed object. | +| Name | Type | Description | +| ------- | ------- | -------------------------- | +| `vocab` | `Vocab` | The parent vocabulary. | +| `orth` | int | The orth id of the lexeme. 
| ## Lexeme.set_flag {#set_flag tag="method"} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index b91d92646..099b5306e 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -236,10 +236,9 @@ Initialize a new table. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ----------- | ------- | ---------------------------------- | -| `name` | str | Optional table name for reference. | -| **RETURNS** | `Table` | The newly constructed object. | +| Name | Type | Description | +| ------ | ---- | ---------------------------------- | +| `name` | str | Optional table name for reference. | ### Table.from_dict {#table.from_dict tag="classmethod"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c59a58c81..925c9ad2e 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -19,11 +19,10 @@ string where an integer is expected) or unexpected property names. > matcher = Matcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `validate` 2.1 | bool | Validate all patterns added to this matcher. | -| **RETURNS** | `Matcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `validate` 2.1 | bool | Validate all patterns added to this matcher. 
| ## Matcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md index 5c2356ad9..4df9a3f7f 100644 --- a/website/docs/api/morphanalysis.md +++ b/website/docs/api/morphanalysis.md @@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx Stores a single morphological analysis. - ## MorphAnalysis.\_\_init\_\_ {#init tag="method"} Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of @@ -16,17 +15,15 @@ morphological features. > > ```python > from spacy.tokens import MorphAnalysis -> +> > feats = "Feat1=Val1|Feat2=Val2" > m = MorphAnalysis(nlp.vocab, feats) > ``` -| Name | Type | Description | -| ----------- | ------------------ | ----------------------------- | -| `vocab` | `Vocab` | The vocab. | -| `features` | `Union[Dict, str]` | The morphological features. | -| **RETURNS** | `MorphAnalysis` | The newly constructed object. | - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `vocab` | `Vocab` | The vocab. | +| `features` | `Union[Dict, str]` | The morphological features. | ## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} @@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis. | ----------- | ----- | ------------------------------------- | | **RETURNS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} Iterate over the feature/value pairs in the analysis. @@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis. | ---------- | ----- | ------------------------------------- | | **YIELDS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_len\_\_ {#len tag="method"} Returns the number of features in the analysis. @@ -78,7 +73,6 @@ Returns the number of features in the analysis. | ----------- | ----- | --------------------------------------- | | **RETURNS** | `int` | The number of features in the analysis. 
| - ## MorphAnalysis.\_\_str\_\_ {#str tag="method"} Returns the morphological analysis in the UD FEATS string format. @@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format. > ``` | Name | Type | Description | -| ----------- | ----- | ---------------------------------| +| ----------- | ----- | -------------------------------- | | **RETURNS** | `str` | The analysis in UD FEATS format. | - ## MorphAnalysis.get {#get tag="method"} Retrieve values for a feature by field. @@ -108,11 +101,10 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Type | Description | -| ----------- | ------ | ----------------------------------- | -| `field` | `str` | The field to retrieve. | -| **RETURNS** | `list` | A list of the individual features. | - +| Name | Type | Description | +| ----------- | ------ | ---------------------------------- | +| `field` | `str` | The field to retrieve. | +| **RETURNS** | `list` | A list of the individual features. | ## MorphAnalysis.to_dict {#to_dict tag="method"} @@ -128,10 +120,9 @@ map. > ``` | Name | Type | Description | -| ----------- | ------ | -----------------------------------------| +| ----------- | ------ | ---------------------------------------- | | **RETURNS** | `dict` | The dict representation of the analysis. | - ## MorphAnalysis.from_id {#from_id tag="classmethod"} Create a morphological analysis from a given hash ID. @@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID. | ------- | ------- | -------------------------------- | | `vocab` | `Vocab` | The vocab. | | `key` | `int` | The hash of the features string. | - - diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 8ac300de3..a153bd51c 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/morphologizer#call) and ## Morphologizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index ad279bff7..8fb89c15f 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -4,12 +4,11 @@ tag: class source: spacy/morphology.pyx --- -Store the possible morphological analyses for a language, and index them -by hash. To save space on each token, tokens only know the hash of their +Store the possible morphological analyses for a language, and index them by +hash. To save space on each token, tokens only know the hash of their morphological analysis, so queries of morphological attributes are delegated to this class. - ## Morphology.\_\_init\_\_ {#init tag="method"} Create a Morphology object using the tag map, lemmatizer and exceptions. @@ -22,21 +21,18 @@ Create a Morphology object using the tag map, lemmatizer and exceptions. > morphology = Morphology(strings, tag_map, lemmatizer) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `strings` | `StringStore` | The string store. | -| `tag_map` | `Dict[str, Dict]` | The tag map. | -| `lemmatizer`| `Lemmatizer` | The lemmatizer. | -| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | -| **RETURNS** | `Morphology` | The newly constructed object. 
| - +| Name | Type | Description | +| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- | +| `strings` | `StringStore` | The string store. | +| `tag_map` | `Dict[str, Dict]` | The tag map. | +| `lemmatizer` | `Lemmatizer` | The lemmatizer. | +| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1", "Feat2": "Val2", ...}}}` | ## Morphology.add {#add tag="method"} -Insert a morphological analysis in the morphology table, if not already -present. The morphological analysis may be provided in the UD FEATS format as a -string or in the tag map dictionary format. Returns the hash of the new -analysis. +Insert a morphological analysis in the morphology table, if not already present. +The morphological analysis may be provided in the UD FEATS format as a string or +in the tag map dictionary format. Returns the hash of the new analysis. > #### Example > @@ -46,10 +42,9 @@ analysis. > assert hash == nlp.vocab.strings[feats] > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------- | -| `features` | `Union[Dict, str]` | The morphological features. | - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `features` | `Union[Dict, str]` | The morphological features. | ## Morphology.get {#get tag="method"} @@ -63,33 +58,30 @@ analysis. Get the FEATS string for the hash of the morphological analysis. -| Name | Type | Description | -| ----------- | ------ | --------------------------------------- | -| `morph` | int | The hash of the morphological analysis. | - +| Name | Type | Description | +| ------- | ---- | --------------------------------------- | +| `morph` | int | The hash of the morphological analysis. | ## Morphology.load_tag_map {#load_tag_map tag="method"} Replace the current tag map with the provided tag map. 
-| Name | Type | Description | -| ----------- | ------------------ | ------------ | -| `tag_map` | `Dict[str, Dict]` | The tag map. | - +| Name | Type | Description | +| --------- | ----------------- | ------------ | +| `tag_map` | `Dict[str, Dict]` | The tag map. | ## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} Replace the current morphological exceptions with the provided exceptions. -| Name | Type | Description | -| ------------- | ------------------ | ----------------------------- | -| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | - +| Name | Type | Description | +| ------------- | ----------------- | ----------------------------- | +| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | ## Morphology.add_special_case {#add_special_case tag="method"} -Add a special-case rule to the morphological analyzer. Tokens whose tag and -orth match the rule will receive the specified properties. +Add a special-case rule to the morphological analyzer. Tokens whose tag and orth +match the rule will receive the specified properties. > #### Example > @@ -98,27 +90,24 @@ orth match the rule will receive the specified properties. > morphology.add_special_case("DT", "the", attrs) > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------- | -| `tag_str` | str | The fine-grained tag. | -| `orth_str` | str | The token text. | -| `attrs` | dict | The features to assign for this token and tag. | - +| Name | Type | Description | +| ---------- | ---- | ---------------------------------------------- | +| `tag_str` | str | The fine-grained tag. | +| `orth_str` | str | The token text. | +| `attrs` | dict | The features to assign for this token and tag. | ## Morphology.exc {#exc tag="property"} The current morphological exceptions. 
-| Name | Type | Description | -| ---------- | ----- | --------------------------------------------------- | -| **YIELDS** | dict | The current dictionary of morphological exceptions. | - +| Name | Type | Description | +| ---------- | ---- | --------------------------------------------------- | +| **YIELDS** | dict | The current dictionary of morphological exceptions. | ## Morphology.lemmatize {#lemmatize tag="method"} TODO - ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} Convert a string FEATS representation to a dictionary of features and values in @@ -132,11 +121,10 @@ the same format as the tag map. > assert d == {"Feat1": "Val1", "Feat2": "Val2"} > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------------------- | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------------------ | | `feats` | str | The morphological features in Universal Dependencies FEATS format. | -| **RETURNS** | dict | The morphological features as a dictionary. | - +| **RETURNS** | dict | The morphological features as a dictionary. | ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} @@ -150,12 +138,11 @@ Convert a dictionary of features and values to a string FEATS representation. > assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Type | Description | +| Name | Type | Description | | ------------ | ----------------- | --------------------------------------------------------------------- | | `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | | **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | - ## Attributes {#attributes} | Name | Type | Description | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 991016094..866aca096 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -35,12 +35,11 @@ be shown. 
> matcher = PhraseMatcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | -| `validate` 2.1 | bool | Validate patterns added to the matcher. | -| **RETURNS** | `PhraseMatcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | +| `validate` 2.1 | bool | Validate patterns added to the matcher. | ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index c03a1b4da..a2d055d88 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and ## Pipe.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental. 
> > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = pipe.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 8daefd241..f50a13099 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -28,10 +28,9 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | -| **RETURNS** | `Scorer` | The newly created object. | +| Name | Type | Description | +| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. 
| ## Scorer.score {#score tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 2c0944b1f..f7d2ac00f 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the ## SentenceRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = senter.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 668013e76..9237b5538 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,15 +18,14 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | -| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. 
| +| Name | Type | Description | +| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | +| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | ## Span.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index c00c59832..b66d755ed 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -19,10 +19,9 @@ Create the `StringStore`. > stringstore = StringStore(["apple", "orange"]) > ``` -| Name | Type | Description | -| ----------- | ------------- | ------------------------------------------ | -| `strings` | iterable | A sequence of strings to add to the store. | -| **RETURNS** | `StringStore` | The newly constructed object. | +| Name | Type | Description | +| --------- | -------- | ------------------------------------------ | +| `strings` | iterable | A sequence of strings to add to the store. | ## StringStore.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 351492aa9..cc7401016 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and ## Tagger.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. 
Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = tagger.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index c4327dca7..c0dd07c1e 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and ## TextCategorizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = textcat.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 29f91afe6..11167c428 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. ## Tok2Vec.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 1cb833089..ca6b57a5b 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -17,12 +17,11 @@ Construct a `Token` object. 
> assert token.text == "Give" > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `doc` | `Doc` | The parent document. | -| `offset` | int | The index of the token within the document. | -| **RETURNS** | `Token` | The newly constructed object. | +| Name | Type | Description | +| -------- | ------- | ------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `doc` | `Doc` | The parent document. | +| `offset` | int | The index of the token within the document. | ## Token.\_\_len\_\_ {#len tag="method"} @@ -393,73 +392,73 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | str | Verbatim text content. | -| `text_with_ws` | str | Text content, with trailing space character if present. | -| `whitespace_` | str | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. 
| -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | str | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | str | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. 
| -| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? 
| -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | str | Fine-grained part-of-speech. | -| `morph` | `MorphAnalysis` | Morphological analysis. | -| `morph_` | str | Morphological analysis in UD FEATS format. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | str | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | str | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| +| Name | Type | Description | +| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The token's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | str | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. 
| +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| `shape` | int | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | +| `shape_` | str | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. 
| +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector)? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. 
| +| `morph` | `MorphAnalysis` | Morphological analysis. | +| `morph_` | str | Morphological analysis in UD FEATS format. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | str | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 47e5aa9b3..02023cf9f 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,16 +34,15 @@ the > tokenizer = nlp.tokenizer > ``` -| Name | Type | Description | -| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. 
| -| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. 
| ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md new file mode 100644 index 000000000..aab02fe68 --- /dev/null +++ b/website/docs/api/transformer.md @@ -0,0 +1,107 @@ +--- +title: Transformer +teaser: Pipeline component for multi-task learning with transformer models +tag: class +source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py +new: 3 +api_base_class: /api/pipe +api_string_name: transformer +--- + +> #### Installation +> +> ```bash +> $ pip install spacy-transformers +> ``` + + + +This component is available via the extension package +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It +exposes the component via entry points, so if you have the package installed, +using `factory = "transformer"` in your +[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will +work out-of-the-box. + + + +This pipeline component lets you use transformer models in your pipeline. The +component assigns the output of the transformer to the Doc's extension +attributes. We also calculate an alignment between the word-piece tokens and the +spaCy tokenization, so that we can use the last hidden states to set the +`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy +token, the spaCy token receives the sum of their values. To access the values, +you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For +more details, see the [usage documentation](/usage/transformers). + +## Config and implementation {#config} + +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). 
See the +[model architectures](/api/architectures) documentation for details on the +architectures and their arguments and hyperparameters. + +> #### Example +> +> ```python +> from spacy_transformers import Transformer, DEFAULT_CONFIG +> +> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) +> ``` + +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | | [`null_annotation_setter`](/api/transformer#null_annotation_setter) | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | + +```python +https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py +``` + +## Transformer.\_\_init\_\_ {#init tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> trf = nlp.add_pipe("transformer") +> +> # Construction via add_pipe with custom model +> config = {"model": {"@architectures": "my_transformer"}} +> trf = nlp.add_pipe("transformer", config=config) +> +> # Construction from class +> from spacy_transformers import Transformer +> trf = Transformer(nlp.vocab, model) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#add_pipe). + +| Name | Type | Description | +| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. 
| +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `annotation_setter` | `Callable` | | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | + + + +## TransformerData {#transformerdata tag="dataclass"} + +## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} + +## Custom attributes {#custom-attributes} + +The component sets the following +[custom extension attributes](/usage/processing-pipelines#custom-components-attributes): + +| Name | Type | Description | +| ---------------- | ----------------- | -------------- | +| `Doc._.trf_data` | `TransformerData` | | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index a0f7ef88b..bfb49e9a2 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -37,7 +37,6 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. | | `name` | str | A name to identify the vectors table. | -| **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index af9feb82c..c68af2047 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -31,7 +31,6 @@ Create the vocabulary. | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` 2.2 | str | A name to identify the vectors table. | -| **RETURNS** | `Vocab` | The newly constructed object. 
| ## Vocab.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index c54165e72..d5ce4e891 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -3,4 +3,154 @@ title: Transformers teaser: Using transformer models like BERT in spaCy --- -TODO: ... +spaCy v3.0 lets you use almost **any statistical model** to power your pipeline. +You can use models implemented in a variety of frameworks, including TensorFlow, +PyTorch and MXNet. To keep things sane, spaCy expects models from these +frameworks to be wrapped with a common interface, using our machine learning +library [Thinc](https://thinc.ai). A transformer model is just a statistical +model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: we just have to provide a few functions +that do the required plumbing. We also provide a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. + + + +Try out a BERT-based model pipeline using this project template: swap in your +data, edit the settings and hyperparameters and train, evaluate, package and +visualize your model. + + + + diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 24803e953..1f13b6328 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -31,18 +31,35 @@ raise errors. Many of them were also mostly internals. If you've been working with more recent versions of spaCy v2.x, it's **unlikely** that your code relied on them. 
-| Removed | Replacement | -| ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) | -| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) | -| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) | -| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | -| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | -| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | -| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer), | +| Removed | Replacement | +| ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) | +| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) | +| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) | +| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | +| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | +| `n_threads` argument on 
[`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | +| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) | ## Migrating from v2.x {#migrating} +### Downloading and loading models {#migrating-downloading-models} + +Model symlinks and shortcuts like `en` are now officially deprecated. There are +[many different models](/models) with different capabilities and not just one +"English model". In order to download and load a model, you should always use +its full name – for instance, `en_core_web_sm`. + +```diff +- python -m spacy download en ++ python -m spacy download en_core_web_sm +``` + +```diff +- nlp = spacy.load("en") ++ nlp = spacy.load("en_core_web_sm") +``` + ### Custom pipeline components and factories {#migrating-pipeline-components} Custom pipeline components now have to be registered explicitly using the @@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model. 
+#### Training via the Python API {#migrating-training-python} + + + #### Packaging models {#migrating-training-packaging} The [`spacy package`](/api/cli#package) command now automatically builds the diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 96e1ea8d6..0795eecc9 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -81,6 +81,7 @@ "items": [ { "text": "Tokenizer", "url": "/api/tokenizer" }, { "text": "Tok2Vec", "url": "/api/tok2vec" }, + { "text": "Transformer", "url": "/api/transformer" }, { "text": "Lemmatizer", "url": "/api/lemmatizer" }, { "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "Tagger", "url": "/api/tagger" }, diff --git a/website/src/components/link.js b/website/src/components/link.js index a2ab46476..de4edba27 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -33,11 +33,12 @@ const Link = ({ const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest) const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest) const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest) - const sourceWithText = (isSource || isApi) && isString(children) + const withIcon = isApi || isArch || isSource + const sourceWithText = withIcon && isString(children) const linkClassNames = classNames(classes.root, className, { [classes.hidden]: hidden, - [classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText, - [classes.withIcon]: isApi || isSource || isArch, + [classes.nowrap]: (withIcon && !sourceWithText) || isArch, + [classes.withIcon]: withIcon, }) const Wrapper = ws ? Whitespace : Fragment const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 
'code' : null diff --git a/website/src/components/util.js b/website/src/components/util.js index 1935a8085..844f2c133 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text' * @returns {string} - URL to the file on GitHub. */ export function github(filepath, branch = 'master') { + if (filepath && filepath.startsWith('github.com')) return `https://${filepath}` const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : '' return `https://github.com/${repo}${path}` }