diff --git a/spacy/about.py b/spacy/about.py
index 03de62539..eb4d2128c 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a5"
+__version__ = "3.0.0a6"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 7202ccacf..ce0eb27a0 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -35,7 +35,7 @@ def pretrain_cli(
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
- epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+ epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
):
diff --git a/spacy/gold/batchers.py b/spacy/gold/batchers.py
index 57c6b4b3a..ec1f35815 100644
--- a/spacy/gold/batchers.py
+++ b/spacy/gold/batchers.py
@@ -1,4 +1,4 @@
-from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable
from typing import Optional, Any
from functools import partial
import itertools
@@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
+ """Create a batcher that uses the `batch_by_padded_size` strategy.
+
+ The padded size is defined as the maximum length of sequences within the
+ batch multiplied by the number of sequences in the batch.
+
+ size (int or Iterable[int]): The largest padded size to batch sequences into.
+ Can be a single integer, or a sequence, allowing for variable batch sizes.
+ buffer (int): The number of sequences to accumulate before sorting by length.
+ A larger buffer will result in more even sizing, but if the buffer is
+ very large, the iteration order will be less random, which can result
+ in suboptimal training.
+ discard_oversize (bool): Whether to discard sequences that are by themselves
+ longer than the largest padded batch size.
+ get_length (Callable or None): Function to get the length of a sequence item.
+ The `len` function is used by default.
+ """
# Avoid displacing optional values from the underlying function.
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
@@ -38,6 +54,16 @@ def configure_minibatch_by_words(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
+ """Create a batcher that uses the "minibatch by words" strategy.
+
+ size (int or Iterable[int]): The target number of words per batch.
+ Can be a single integer, or a sequence, allowing for variable batch sizes.
+ tolerance (float): The fraction of the target size by which a batch may
+ exceed it, e.g. 0.2 allows batches up to 20% larger than `size`.
+ discard_oversize (bool): Whether to discard sequences that by themselves
+ exceed the tolerated size.
+ get_length (Callable or None): Function to get the length of a sequence
+ item. The `len` function is used by default.
+ """
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
@@ -48,22 +74,43 @@ def configure_minibatch_by_words(
def configure_minibatch(
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
+ """Create a batcher that creates batches of the specified size.
+
+ size (int or Iterable[int]): The target number of items per batch.
+ Can be a single integer, or a sequence, allowing for variable batch sizes.
+ get_length (Callable or None): Function to get the length of a sequence item.
+ The `len` function is used by default.
+ """
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
def minibatch_by_padded_size(
- docs: Iterator["Doc"],
+ seqs: Iterable[ItemT],
size: Sizing,
buffer: int = 256,
discard_oversize: bool = False,
get_length: Callable = len,
-) -> Iterator[Iterator["Doc"]]:
+) -> Iterable[List[ItemT]]:
+ """Minibatch a sequence by the size of padded batches that would result,
+ with sequences binned by length within a window.
+
+ The padded size is defined as the maximum length of sequences within the
+ batch multiplied by the number of sequences in the batch.
+
+ seqs (Iterable[Sequence]): The sequences to minibatch.
+ size (int or Iterable[int]): The largest padded size to batch sequences into.
+ buffer (int): The number of sequences to accumulate before sorting by length.
+ A larger buffer will result in more even sizing, but if the buffer is
+ very large, the iteration order will be less random, which can result
+ in suboptimal training.
+ discard_oversize (bool): Whether to discard sequences that are by themselves
+ longer than the largest padded batch size.
+ get_length (Callable or None): Function to get the length of a sequence item.
+ The `len` function is used by default.
+ """
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
- for outer_batch in minibatch(docs, size=buffer):
+ for outer_batch in minibatch(seqs, size=buffer):
outer_batch = list(outer_batch)
target_size = next(size_)
for indices in _batch_by_length(outer_batch, target_size, get_length):
@@ -76,12 +123,24 @@ def minibatch_by_padded_size(
def minibatch_by_words(
- docs, size, tolerance=0.2, discard_oversize=False, get_length=len
-):
+ seqs: Iterable[ItemT],
+ size: Sizing,
+ tolerance=0.2,
+ discard_oversize=False,
+ get_length=len,
+) -> Iterable[List[ItemT]]:
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True.
- The argument 'docs' can be a list of strings, Docs or Examples.
+
+ seqs (Iterable[Sequence]): The sequences to minibatch.
+ size (int or Iterable[int]): The target number of words per batch.
+ Can be a single integer, or a sequence, allowing for variable batch sizes.
+ tolerance (float): The fraction of the target size by which a batch may
+ exceed it, e.g. 0.2 allows batches up to 20% larger than `size`.
+ discard_oversize (bool): Whether to discard sequences that by themselves
+ exceed the tolerated size.
+ get_length (Callable or None): Function to get the length of a sequence
+ item. The `len` function is used by default.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
@@ -95,20 +154,20 @@ def minibatch_by_words(
overflow = []
batch_size = 0
overflow_size = 0
- for doc in docs:
- n_words = get_length(doc)
+ for seq in seqs:
+ n_words = get_length(seq)
# if the current example exceeds the maximum batch size, it is returned separately
# but only if discard_oversize=False.
if n_words > target_size + tol_size:
if not discard_oversize:
- yield [doc]
+ yield [seq]
# add the example to the current batch if there's no overflow yet and it still fits
elif overflow_size == 0 and (batch_size + n_words) <= target_size:
- batch.append(doc)
+ batch.append(seq)
batch_size += n_words
# add the example to the overflow buffer if it fits in the tolerance margin
elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
- overflow.append(doc)
+ overflow.append(seq)
overflow_size += n_words
# yield the previous batch and start a new one. The new one gets the overflow examples.
else:
@@ -122,11 +181,11 @@ def minibatch_by_words(
overflow_size = 0
# this example still fits
if (batch_size + n_words) <= target_size:
- batch.append(doc)
+ batch.append(seq)
batch_size += n_words
# this example fits in overflow
elif (batch_size + n_words) <= (target_size + tol_size):
- overflow.append(doc)
+ overflow.append(seq)
overflow_size += n_words
# this example does not fit with the previous overflow: start another new batch
else:
@@ -134,7 +193,7 @@ def minibatch_by_words(
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
- batch = [doc]
+ batch = [seq]
batch_size = n_words
batch.extend(overflow)
if batch:
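As a quick illustration of the renamed `seqs` argument, here is a minimal usage sketch that is not part of the diff; the example texts and the whitespace-based length function are assumptions made up for illustration:

```python
# Minimal sketch: batching plain strings by word count with the updated
# minibatch_by_words signature. Texts and get_length are illustrative only.
from spacy.gold.batchers import minibatch_by_words

texts = [
    "a short text",
    "another text that is a little bit longer",
    "tiny",
    "one more medium length example text",
]
for batch in minibatch_by_words(
    texts,
    size=8,          # target number of words per batch
    tolerance=0.2,   # allow batches to exceed the target by up to 20%
    get_length=lambda text: len(text.split()),  # length measure for plain strings
):
    print(sum(len(t.split()) for t in batch), batch)
```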
diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py
index b8bef39b9..be389f117 100644
--- a/spacy/lang/en/lemmatizer.py
+++ b/spacy/lang/en/lemmatizer.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
from ...pipeline import Lemmatizer
from ...tokens import Token
diff --git a/spacy/language.py b/spacy/language.py
index 96661915a..85aac15ef 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
-from .lookups import load_lookups
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
@@ -1439,10 +1438,7 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
- nlp = lang_cls(
- vocab=vocab,
- create_tokenizer=create_tokenizer,
- )
+ nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 716af9909..e0a54e6f1 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
- return self._normalize_key(key) in self._patterns
+ return self.has_key(key)
- def validateInput(self, pattern, key):
+ def validate_input(self, pattern, key):
idx = 0
- visitedNodes = {}
+ visited_nodes = {}
for relation in pattern:
if "PATTERN" not in relation or "SPEC" not in relation:
raise ValueError(Errors.E098.format(key=key))
@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
and "NBOR_NAME" not in relation["SPEC"]
):
raise ValueError(Errors.E099.format(key=key))
- visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+ visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
else:
if not(
"NODE_NAME" in relation["SPEC"]
@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
):
raise ValueError(Errors.E100.format(key=key))
if (
- relation["SPEC"]["NODE_NAME"] in visitedNodes
- or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+ relation["SPEC"]["NODE_NAME"] in visited_nodes
+ or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
):
raise ValueError(Errors.E101.format(key=key))
- visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
- visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+ visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+ visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1
def add(self, key, patterns, *_patterns, on_match=None):
+ """Add a new matcher rule to the matcher.
+
+ key (str): The match ID.
+ patterns (list): The patterns to add for the given key.
+ on_match (callable): Optional callback executed on match.
+ """
if patterns is None or hasattr(patterns, "__call__"): # old API
on_match = patterns
patterns = _patterns
for pattern in patterns:
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
- self.validateInput(pattern,key)
+ self.validate_input(pattern,key)
key = self._normalize_key(key)
_patterns = []
for pattern in patterns:
@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
- key = self._normalize_key(key)
- return key in self._patterns
+ return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
return (self._callbacks[key], self._patterns[key])
def __call__(self, Doc doc):
+ """Find all token sequences matching the supplied pattern.
+
+ doc (Doc): The document to match over.
+ RETURNS (list): A list of `(key, trees)` tuples describing the matches,
+ where each tree is a list of token indices for the matched nodes.
+ """
matched_key_trees = []
matches = self.token_matcher(doc)
for key in list(self._patterns.keys()):
@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
on_match(self, doc, i, matched_key_trees)
return matched_key_trees
- def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
+ def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
cdef bool isValid;
if(patternLength == len(id_to_position.keys())):
isValid = True
for node in range(patternLength):
if(node in tree):
for idx, (relop,nbor) in enumerate(tree[node]):
- computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+ computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
isNbor = False
for computed_nbor in computed_nbors:
- if(computed_nbor.i == visitedNodes[nbor]):
+ if(computed_nbor.i == visited_nodes[nbor]):
isNbor = True
isValid = isValid & isNbor
if(isValid):
- matched_trees.append(visitedNodes)
+ matched_trees.append(visited_nodes)
return
allPatternNodes = numpy.asarray(id_to_position[patternLength])
for patternNode in allPatternNodes:
- self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
+ self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)
# Given a node and an edge operator, to return the list of nodes
# from the doc that belong to node+operator. This is used to store
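For reference, a hedged sketch of how a pattern passes through the renamed `validate_input` when it is added; the pattern follows the `SPEC`/`PATTERN` format documented in `website/docs/api/dependencymatcher.md` further down, and the match key and token attributes are made up:

```python
# Hedged sketch (not part of the diff): adding a two-node dependency pattern.
# DependencyMatcher.add runs validate_input on each pattern before compiling it.
from spacy.lang.en import English
from spacy.matcher import DependencyMatcher

nlp = English()
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    {
        "SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
        "PATTERN": {"DEP": "nsubj"},
    },
]
matcher.add("FOUNDED", [pattern])
# Matching itself needs a Doc with dependency annotations (Token.dep set),
# e.g. from a pretrained pipeline; matcher(doc) then returns the matched trees.
```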
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index a0f3f1655..16ab73735 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -70,7 +70,7 @@ cdef class Matcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
- return self._normalize_key(key) in self._patterns
+ return self.has_key(key)
def add(self, key, patterns, *, on_match=None, greedy: str=None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID
@@ -162,8 +162,7 @@ cdef class Matcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
- key = self._normalize_key(key)
- return key in self._patterns
+ return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@@ -179,7 +178,7 @@ cdef class Matcher:
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn.
- docs (iterable): A stream of documents.
+ docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
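To make the `pipe` signature above concrete, a minimal sketch that is not part of the diff; the pattern, match key and texts are assumptions for illustration:

```python
# Minimal sketch: streaming docs (or spans) through Matcher.pipe and getting
# (doc, matches) tuples back via return_matches=True.
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("COMMA", [[{"ORTH": ","}]])
docs = nlp.pipe(["Hello, world!", "No commas here"])
for doc, matches in matcher.pipe(docs, return_matches=True):
    print(doc.text, matches)
```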
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index f85b5626a..801229af5 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
- "multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
@@ -51,17 +50,52 @@ def make_parser(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
- multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
+ """Create a transition-based DependencyParser component. The dependency parser
+ jointly learns sentence segmentation and labelled dependency parsing, and can
+ optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+ The parser uses a variant of the non-monotonic arc-eager transition-system
+ described by Honnibal and Johnson (2014), with the addition of a "break"
+ transition to perform the sentence segmentation. Nivre's pseudo-projective
+ dependency transformation is used to allow the parser to predict
+ non-projective parses.
+
+ The parser is trained using an imitation learning objective. The parser follows
+ the actions predicted by the current weights, and at each state, determines
+ which actions are compatible with the optimal parse that could be reached
+ from the current state. The weights are updated such that the scores assigned
+ to the set of optimal actions are increased, while scores assigned to other
+ actions are decreased. Note that more than one action may be optimal for
+ a given state.
+
+ model (Model): The model for the transition-based parser. The model needs
+ to have a specific substructure of named components --- see the
+ spacy.ml.tb_framework.TransitionModel for details.
+ moves (List[str]): A list of transition names. Inferred from the data if not
+ provided.
+ update_with_oracle_cut_size (int):
+ During training, cut long sequences into shorter segments by creating
+ intermediate states based on the gold-standard history. The model is
+ not very sensitive to this parameter, so you usually won't need to change
+ it. 100 is a good default.
+ learn_tokens (bool): Whether to learn to merge subtokens that are split
+ relative to the gold standard. Experimental.
+ min_action_freq (int): The minimum frequency of labelled actions to retain.
+ Rarer labelled actions have their label backed-off to "dep". While this
+ primarily affects the label accuracy, it can also affect the attachment
+ structure, as the labels are used to represent the pseudo-projectivity
+ transformation.
+ """
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
- multitasks=multitasks,
+ multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
)
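Since `multitasks` is no longer exposed through the factory, a config for the parser component only covers the remaining settings. A hedged sketch, assuming the default factory name `"parser"` and an illustrative label:

```python
# Hedged sketch: creating the parser with the remaining configurable settings;
# "multitasks" is now fixed to [] inside make_parser.
from spacy.lang.en import English

nlp = English()
parser = nlp.create_pipe(
    "parser", config={"learn_tokens": False, "min_action_freq": 30}
)
parser.add_label("nsubj")
```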
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 840070c23..080273f57 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -62,6 +62,16 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
):
+ """Construct an EntityLinker component.
+
+ model (Model[List[Doc], Floats2d]): A model that learns document vector
+ representations. Given a batch of Doc objects, it should return a single
+ array, with one row per item in the batch.
+ kb (KnowledgeBase): The knowledge-base to link entities to.
+ labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+ incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
+ incl_context (bool): Whether or not to include the local context in the model.
+ """
return EntityLinker(
nlp.vocab,
model,
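Aside: the `kb` argument of `make_entity_linker` expects a populated `KnowledgeBase`. As a rough, hedged sketch of what that looks like, with entity IDs, frequencies, vectors and aliases all invented for illustration:

```python
# Hedged sketch (not part of this diff): the kind of KnowledgeBase that the
# kb argument of make_entity_linker expects. All values are illustrative.
from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
```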
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 06c9f9a25..efc494181 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
- labels_morph (dict): TODO:
- labels_pos (dict): TODO:
+ labels_morph (dict): Mapping of morph + POS tags to morph labels.
+ labels_pos (dict): Mapping of morph + POS tags to POS tags.
DOCS: https://spacy.io/api/morphologizer#init
"""
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index d13152a4f..a3bc3d920 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
- "multitasks": [],
- "learn_tokens": False,
- "min_action_freq": 30,
"model": DEFAULT_NER_MODEL,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@@ -50,19 +47,40 @@ def make_ner(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
- multitasks: Iterable,
- learn_tokens: bool,
- min_action_freq: int
):
+ """Create a transition-based EntityRecognizer component. The entity recognizer
+ identifies non-overlapping labelled spans of tokens.
+
+ The transition-based algorithm used encodes certain assumptions that are
+ effective for "traditional" named entity recognition tasks, but may not be
+ a good fit for every span identification problem. Specifically, the loss
+ function optimizes for whole entity accuracy, so if your inter-annotator
+ agreement on boundary tokens is low, the component will likely perform poorly
+ on your problem. The transition-based algorithm also assumes that the most
+ decisive information about your entities will be close to their initial tokens.
+ If your entities are long and characterised by tokens in their middle, the
+ component will likely do poorly on your task.
+
+ model (Model): The model for the transition-based parser. The model needs
+ to have a specific substructure of named components --- see the
+ spacy.ml.tb_framework.TransitionModel for details.
+ moves (List[str]): A list of transition names. Inferred from the data if not
+ provided.
+ update_with_oracle_cut_size (int):
+ During training, cut long sequences into shorter segments by creating
+ intermediate states based on the gold-standard history. The model is
+ not very sensitive to this parameter, so you usually won't need to change
+ it. 100 is a good default.
+ """
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
- multitasks=multitasks,
- learn_tokens=learn_tokens,
- min_action_freq=min_action_freq
+ multitasks=[],
+ min_action_freq=1,
+ learn_tokens=False,
)
@@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
TransitionSystem = BiluoPushDown
def add_multitask_objective(self, mt_component):
+ """Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+ """Setup multi-task objective components. Experimental and internal."""
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels))
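Downstream, this means the `ner` factory no longer accepts `learn_tokens`, `min_action_freq` or `multitasks` in its config, which is why the test updates below drop those keys. A minimal sketch mirroring the updated tests (the label is illustrative):

```python
# Minimal sketch mirroring the updated tests: an empty config is now enough,
# since learn_tokens and min_action_freq are fixed inside make_ner.
from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner", config={})
ner.add_label("GPE")
```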
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index aa0399b33..9be562b61 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -1,8 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
+from typing import List
import numpy
import srsly
-
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d
import warnings
from ..tokens.doc cimport Doc
@@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
-def make_tagger(nlp: Language, name: str, model: Model):
+def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
+ """Construct a part-of-speech tagger component.
+
+ model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
+ the tag probabilities. The output vectors should match the number of tags
+ in size, and be normalized as probabilities (all scores between 0 and 1,
+ with the rows summing to 1).
+ """
return Tagger(nlp.vocab, model, name)
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 06b72f8c7..d632825bd 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -1,5 +1,6 @@
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
import numpy
from .pipe import Pipe
@@ -69,8 +70,22 @@ subword_features = true
default_score_weights={"cats_score": 1.0},
)
def make_textcat(
- nlp: Language, name: str, model: Model, labels: Iterable[str]
+ nlp: Language,
+ name: str,
+ model: Model[List[Doc], List[Floats2d]],
+ labels: Iterable[str],
) -> "TextCategorizer":
+ """Create a TextCategorizer compoment. The text categorizer predicts categories
+ over a whole document. It can learn one or more labels, and the labels can
+ be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
+ (i.e. zero or more labels may be true per doc). The multi-label setting is
+ controlled by the model instance that's provided.
+
+ model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
+ scores for each category.
+ labels (list): A list of categories to learn. If empty, the model infers the
+ categories from the data.
+ """
return TextCategorizer(nlp.vocab, model, name, labels=labels)
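A hedged usage sketch for the factory described above; adding the labels after creation with `add_label` (rather than passing them via the config) is an assumption about typical usage, not something this diff changes, and the label names are made up:

```python
# Hedged sketch: creating the textcat component with its default model and
# adding labels after creation. Label names are illustrative.
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
```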
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index db6843e8f..c9f0a99e9 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
class Tok2Vec(Pipe):
+ """Apply a "token-to-vector" model and set its outputs in the doc.tensor
+ attribute. This is mostly useful to share a single subnetwork between multiple
+ components, e.g. to have one embedding and CNN network shared between a
+ parser, tagger and NER.
+
+ In order to use the `Tok2Vec` predictions, subsequent components should use
+ the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
+ layer will read data from the `doc.tensor` attribute during prediction.
+ During training, the `Tok2Vec` component will save its prediction and backprop
+ callback for each batch, so that the subsequent components can backpropagate
+ to the shared weights. This implementation is used because it allows us to
+ avoid relying on object identity within the models to achieve the parameter
+ sharing.
+ """
+
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
"""Initialize a tok2vec component.
vocab (Vocab): The shared vocabulary.
- model (thinc.api.Model): The Thinc Model powering the pipeline component.
+ model (thinc.api.Model[List[Doc], List[Floats2d]]):
+ The Thinc Model powering the pipeline component. It should take
+ a list of Doc objects as input, and output a list of 2d float arrays.
name (str): The component instance name.
DOCS: https://spacy.io/api/tok2vec#init
@@ -48,9 +65,18 @@ class Tok2Vec(Pipe):
self.cfg = {}
def add_listener(self, listener: "Tok2VecListener") -> None:
+ """Add a listener for a downstream component. Usually internals."""
self.listeners.append(listener)
def find_listeners(self, model: Model) -> None:
+ """Walk over a model, looking for layers that are Tok2vecListener
+ subclasses that have an upstream_name that matches this component.
+ Listeners can also set their upstream_name attribute to the wildcard
+ string '*' to match any `Tok2Vec`.
+
+ You're unlikely to ever need multiple `Tok2Vec` components, so it's
+ fine to leave your listeners' upstream_name set to '*'.
+ """
for node in model.walk():
if isinstance(node, Tok2VecListener) and node.upstream_name in (
"*",
@@ -59,7 +85,8 @@ class Tok2Vec(Pipe):
self.add_listener(node)
def __call__(self, doc: Doc) -> Doc:
- """Add context-sensitive embeddings to the Doc.tensor attribute.
+ """Add context-sensitive embeddings to the Doc.tensor attribute, allowing
+ them to be used as features by downstream components.
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
@@ -205,11 +232,27 @@ class Tok2Vec(Pipe):
class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
+
+ The Tok2VecListener layer is used as a sublayer within a component such
+ as a parser, NER or text categorizer. Usually you'll have multiple listeners
+ connecting to a single upstream Tok2Vec component that's earlier in the
+ pipeline. The Tok2VecListener layers act as proxies, passing the predictions
+ from the Tok2Vec component into downstream components, and communicating
+ gradients back upstream.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name: str, width: int) -> None:
+ """
+ upstream_name (str): A string to identify the 'upstream' Tok2Vec component
+ to communicate with. The upstream name should either be the wildcard
+ string '*', or the name of the `Tok2Vec` component. You'll almost
+ never have multiple upstream Tok2Vec components, so the wildcard
+ string will almost always be fine.
+ width (int):
+ The width of the vectors produced by the upstream tok2vec component.
+ """
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
self.upstream_name = upstream_name
self._batch_id = None
@@ -217,15 +260,25 @@ class Tok2VecListener(Model):
self._backprop = None
@classmethod
- def get_batch_id(cls, inputs) -> int:
+ def get_batch_id(cls, inputs: List[Doc]) -> int:
+ """Calculate a content-sensitive hash of the batch of documents, to check
+ whether the next batch of documents is unexpected.
+ """
return sum(sum(token.orth for token in doc) for doc in inputs)
def receive(self, batch_id: int, outputs, backprop) -> None:
+ """Store a batch of training predictions and a backprop callback. The
+ predictions and callback are produced by the upstream Tok2Vec component,
+ and later will be used when the listener's component's model is called.
+ """
self._batch_id = batch_id
self._outputs = outputs
self._backprop = backprop
def verify_inputs(self, inputs) -> bool:
+ """Check that the batch of Doc objects matches the ones we have a
+ prediction for.
+ """
if self._batch_id is None and self._outputs is None:
raise ValueError(Errors.E954)
else:
@@ -237,6 +290,7 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
+ """Supply the outputs from the upstream Tok2Vec component."""
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
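To make the listener handshake above concrete, a small hedged sketch of the constructor and the content-sensitive batch id; the width, wildcard upstream name and example texts are arbitrary choices, not values from the diff:

```python
# Hedged sketch: constructing a standalone listener and computing the batch id
# used to pair upstream predictions with incoming batches. Values are arbitrary.
from spacy.lang.en import English
from spacy.pipeline.tok2vec import Tok2VecListener

nlp = English()
listener = Tok2VecListener(upstream_name="*", width=96)
docs = [nlp("hello world"), nlp("another doc")]
print(Tok2VecListener.get_batch_id(docs))  # sum of token.orth values over the batch
```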
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 4a81d39d0..d77881ad0 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -426,7 +426,7 @@ class Scorer:
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
- positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+ positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:
diff --git a/spacy/tests/morphology/test_morph_pickle.py b/spacy/tests/morphology/test_morph_pickle.py
index 0758a6c01..d9b0e3476 100644
--- a/spacy/tests/morphology/test_morph_pickle.py
+++ b/spacy/tests/morphology/test_morph_pickle.py
@@ -15,5 +15,7 @@ def morphology():
def test_morphology_pickle_roundtrip(morphology):
b = pickle.dumps(morphology)
reloaded_morphology = pickle.loads(b)
- assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
- assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
+ feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
+ assert feat == "Feat1=Val1|Feat2=Val2"
+ feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
+ assert feat == "Feat3=Val3|Feat4=Val4"
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index dbeb0a9cb..0ffe74273 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -144,10 +144,7 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner1 = nlp1.create_pipe("ner", config=config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@@ -166,10 +163,7 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
@@ -224,10 +218,7 @@ def test_overwrite_token():
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner2 = nlp.create_pipe("ner", config=config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 644fa0f01..8a70fdeeb 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -1,8 +1,7 @@
import pytest
-
from spacy import util, registry
from spacy.lang.en import English
-from spacy.lookups import Lookups, load_lookups
+from spacy.lookups import Lookups
from ..util import make_tempdir
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
index 5f27a0afa..1af4a5121 100644
--- a/spacy/tests/pipeline/test_tagger.py
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -1,10 +1,8 @@
import pytest
-
from spacy import util
from spacy.gold import Example
from spacy.lang.en import English
from spacy.language import Language
-from spacy.symbols import POS, NOUN
from ..util import make_tempdir
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 363a16a11..17add7391 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
- scores = nlp.evaluate(
- train_examples, scorer_cfg={"positive_label": "POSITIVE"}
- )
+ scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_micro_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index b642ca229..5c93ea3c8 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -1,11 +1,9 @@
import pytest
import random
-
from spacy import util
from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
-from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lookups import Lookups
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index 0ac895546..d6a4600e3 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -6,8 +6,7 @@ from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
-from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB
+from spacy.symbols import ORTH, LEMMA, POS
def test_issue1061():
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 83afb11f3..4988575ea 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
nlp = Language()
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner = nlp.create_pipe("ner", config=config)
example = Example.from_dict(
Doc(ner.vocab, words=["word"]),
diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py
index e42779ad7..de554a5ec 100644
--- a/spacy/tests/regression/test_issue3501-4000.py
+++ b/spacy/tests/regression/test_issue3501-4000.py
@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
- attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
+ attrs = {
+ "POS": ["PROPN", "PROPN"],
+ "LEMMA": ["New", "York"],
+ "DEP": ["pobj", "compound"],
+ }
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py
index e1d03eaf5..423015106 100644
--- a/spacy/tests/regression/test_issue4001-4500.py
+++ b/spacy/tests/regression/test_issue4001-4500.py
@@ -138,10 +138,7 @@ def test_issue4042_bug2():
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner2 = nlp1.create_pipe("ner", config=config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
@@ -303,10 +300,7 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
- config = {
- "learn_tokens": False,
- "min_action_freq": 30,
- }
+ config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training([])
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 0b3b4a9fc..96d4e1ca4 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -185,20 +185,16 @@ def test_issue4725_1():
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
config = {
- "learn_tokens": False,
- "min_action_freq": 342,
"update_with_oracle_cut_size": 111,
}
ner = nlp.create_pipe("ner", config=config)
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
- assert ner.cfg["min_action_freq"] == 342
assert ner.cfg["update_with_oracle_cut_size"] == 111
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
- assert ner2.cfg["min_action_freq"] == 342
assert ner2.cfg["update_with_oracle_cut_size"] == 111
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 6865cd1e5..ebc804235 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid():
config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
+
+
+def test_language_custom_tokenizer():
+ """Test that a fully custom tokenizer can be plugged in via the registry."""
+ name = "test_language_custom_tokenizer"
+
+ class CustomTokenizer:
+ """Dummy "tokenizer" that splits on spaces and adds prefix to each word."""
+
+ def __init__(self, nlp, prefix):
+ self.vocab = nlp.vocab
+ self.prefix = prefix
+
+ def __call__(self, text):
+ words = [f"{self.prefix}{word}" for word in text.split(" ")]
+ return Doc(self.vocab, words=words)
+
+ @registry.tokenizers(name)
+ def custom_create_tokenizer(prefix: str = "_"):
+ def create_tokenizer(nlp):
+ return CustomTokenizer(nlp, prefix=prefix)
+
+ return create_tokenizer
+
+ config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
+ nlp = English.from_config(config)
+ doc = nlp("hello world")
+ assert [t.text for t in doc] == ["_hello", "_world"]
+ doc = list(nlp.pipe(["hello world"]))[0]
+ assert [t.text for t in doc] == ["_hello", "_world"]
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 8d28a78c3..8b07102ce 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -3,7 +3,7 @@ title: Model Architectures
teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
menu:
- - ['Tok2Vec', 'tok2vec']
+ - ['Tok2Vec', 'tok2vec-arch']
- ['Transformers', 'transformers']
- ['Parser & NER', 'parser']
- ['Tagging', 'tagger']
@@ -70,6 +70,47 @@ blog post for background.
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |
+### spacy.Tok2VecListener.v1 {#Tok2VecListener}
+
+> #### Example config
+>
+> ```ini
+> [components.tok2vec]
+> factory = "tok2vec"
+>
+> [components.tok2vec.model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> width = 342
+>
+> [components.tagger]
+> factory = "tagger"
+>
+> [components.tagger.model]
+> @architectures = "spacy.Tagger.v1"
+>
+> [components.tagger.model.tok2vec]
+> @architectures = "spacy.Tok2VecListener.v1"
+> width = ${components.tok2vec.model:width}
+> ```
+
+A listener is used as a sublayer within a component such as a
+[`DependencyParser`](/api/dependencyparser),
+[`EntityRecognizer`](/api/entityrecognizer) or
+[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple
+listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component
+that's earlier in the pipeline. The listener layers act as **proxies**, passing
+the predictions from the `Tok2Vec` component into downstream components, and
+communicating gradients back upstream.
+
+Instead of defining its own `Tok2Vec` instance, a model architecture like
+[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
+argument that connects to the shared `tok2vec` component in the pipeline.
+
+| Name | Type | Description |
+| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
+| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
+
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
@@ -195,7 +236,7 @@ and residual connections.
> depth = 4
> ```
-Encode context using bidirectonal LSTM layers. Requires
+Encode context using bidirectional LSTM layers. Requires
[PyTorch](https://pytorch.org).
| Name | Type | Description |
@@ -237,8 +278,6 @@ architectures into your training config.
### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
-
-
> #### Example Config
>
> ```ini
@@ -250,10 +289,41 @@ architectures into your training config.
> @layers = "reduce_mean.v1"
> ```
-| Name | Type | Description |
-| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- |
-| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
-| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
+Create a `TransformerListener` layer, which will connect to a
+[`Transformer`](/api/transformer) component earlier in the pipeline. The layer
+takes a list of [`Doc`](/api/doc) objects as input, and produces a list of
+2-dimensional arrays as output, with each array having one row per token. Most
+spaCy models expect a sublayer with this signature, making it easy to connect
+them to a transformer model via this sublayer. Transformer models usually
+operate over wordpieces, which usually don't align one-to-one against spaCy
+tokens. The layer therefore requires a reduction operation in order to calculate
+a single token vector given zero or more wordpiece vectors.
+
+| Name | Type | Description |
+| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
+
+### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
+
+> #### Example Config
+>
+> ```ini
+> # TODO:
+> ```
+
+Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
+**not** allow multiple components to share the transformer weights, and does
+**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
+object, but it's a **simpler solution** if you only need the transformer within
+one component.
+
+| Name | Type | Description |
+| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects for the transformer to process. [See here](/api/transformer#span_getters) for built-in options and examples. |
+| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
## Parser & NER architectures {#parser}
@@ -417,20 +487,18 @@ network has an internal CNN Tok2Vec layer and uses attention.
> nO = null
> ```
-| Name | Type | Description |
-| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
-| `width` | int | Output dimension of the feature encoding step. |
-| `embed_size` | int | Input dimension of the feature encoding step. |
-| `conv_depth` | int | Depth of the Tok2Vec layer. |
-| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `dropout` | float | The dropout rate. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
+| `width` | int | Output dimension of the feature encoding step. |
+| `embed_size` | int | Input dimension of the feature encoding step. |
+| `conv_depth` | int | Depth of the Tok2Vec layer. |
+| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
+| `dropout` | float | The dropout rate. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@@ -457,14 +525,12 @@ A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
-| Name | Type | Description |
-| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| --------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@@ -482,17 +548,17 @@ If the `nO` dimension is not set, the TextCategorizer component will set it when
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
-| Name | Type | Description |
-| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
+| `no_output_layer` | bool | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
+
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
-plausible [`Candidate` objects](/api/kb/#candidate_init).
-
-The default `CandidateGenerator` simply uses the text of a mention to find its
-potential aliases in the Knowledgebase. Note that this function is
-case-dependent.
+plausible [`Candidate` objects](/api/kb/#candidate_init). The default
+`CandidateGenerator` simply uses the text of a mention to find its potential
+aliases in the `KnowledgeBase`. Note that this function is case-dependent.
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 377b2456f..c4a774cd0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
## Pretrain {#pretrain new="2.1" tag="experimental"}
-
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.
@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r` | option | TODO: |
-| `--epoch-resume`, `-er` | option | TODO: |
+| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index c0a87756d..af7cb26de 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.
> #### What does the @ mean?
>
@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the
-
-
### nlp {#config-nlp tag="section"}
> #### Example
@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).
-
-
| Name | Type | Description | Default |
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
| `max_epochs` | int | Maximum number of epochs. | `1000` |
diff --git a/website/docs/api/dependencymatcher.md b/website/docs/api/dependencymatcher.md
index 3638575df..4f192783f 100644
--- a/website/docs/api/dependencymatcher.md
+++ b/website/docs/api/dependencymatcher.md
@@ -5,4 +5,194 @@ tag: class
source: spacy/matcher/dependencymatcher.pyx
---
-TODO: write
+The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
+and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
+using the
+[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
+It requires a pretrained [`DependencyParser`](/api/parser) or other component
+that sets the `Token.dep` attribute.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+> {
+> "SPEC": {"NODE_NAME": "founded"},
+> "PATTERN": {"ORTH": "founded"}
+> },
+> {
+> "SPEC": {
+> "NODE_NAME": "founder",
+> "NBOR_RELOP": ">",
+> "NBOR_NAME": "founded"
+> },
+> "PATTERN": {"DEP": "nsubj"}
+> },
+> {
+> "SPEC": {
+> "NODE_NAME": "object",
+> "NBOR_RELOP": ">",
+> "NBOR_NAME": "founded"
+> },
+> "PATTERN": {"DEP": "dobj"}
+> }
+> ]
+> ```
+
+A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
+with each dictionary describing a node to match. Each pattern should have the
+following top-level keys:
+
+| Name | Type | Description |
+| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
+| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
+| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
+
+The `SPEC` includes the following fields:
+
+| Name | Type | Description |
+| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
+
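+For illustration, here's a minimal sketch that adds the pattern above and
+applies it to a text. The pipeline name is an assumption; any trained pipeline
+that sets `Token.dep` will do:
+
+```python
+import spacy
+from spacy.matcher import DependencyMatcher
+
+nlp = spacy.load("en_core_web_sm")  # assumed: a trained pipeline with a parser
+matcher = DependencyMatcher(nlp.vocab)
+pattern = [
+    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+    {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+    {"SPEC": {"NODE_NAME": "object", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "dobj"}},
+]
+matcher.add("FOUNDED", [pattern])
+doc = nlp("Bill Gates founded Microsoft.")
+matches = matcher(doc)  # see DependencyMatcher.__call__ below for the return format
+```
+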
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name | Type | Description |
+| ------- | ------- | ------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+> print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name | Type | Description |
+| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| _keyword-only_ | | |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index e56e85e64..6c9222781 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -8,6 +8,23 @@ api_string_name: parser
api_trainable: true
---
+A transition-based dependency parser component. The dependency parser jointly
+learns sentence segmentation and labelled dependency parsing, and can optionally
+learn to merge tokens that had been over-segmented by the tokenizer. The parser
+uses a variant of the **non-monotonic arc-eager transition-system** described by
+[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
+the addition of a "break" transition to perform the sentence segmentation.
+[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
+dependency transformation** is used to allow the parser to predict
+non-projective parses.
+
+The parser is trained using an **imitation learning objective**. It follows the
+actions predicted by the current weights, and at each state, determines which
+actions are compatible with the optimal parse that could be reached from the
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while the scores assigned to other actions are
+decreased. Note
+that more than one action may be optimal for a given state.
+
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -23,18 +40,21 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
> config = {
> "moves": None,
-> # TODO: rest
+> "update_with_oracle_cut_size": 100,
+> "learn_tokens": False,
+> "min_action_freq": 30,
> "model": DEFAULT_PARSER_MODEL,
> }
> nlp.add_pipe("parser", config=config)
> ```
-
-
-| Setting | Type | Description | Default |
-| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list | | `None` |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
+| Setting | Type | Description | Default |
+| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
+| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
+| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` |
+| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx
@@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-
-
-| Name | Type | Description |
-| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The shared vocabulary. |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
-| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves` | list | |
-| _keyword-only_ | | |
-| `update_with_oracle_cut_size` | int | |
-| `multitasks` | `Iterable` | |
-| `learn_tokens` | bool | |
-| `min_action_freq` | int | |
+| Name | Type | Description |
+| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
+| _keyword-only_ | | |
+| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
+| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
+| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. |
## DependencyParser.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 0ab17f953..a6368e62b 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -8,6 +8,18 @@ api_string_name: ner
api_trainable: true
---
+A transition-based named entity recognition component. The entity recognizer
+identifies **non-overlapping labelled spans** of tokens. The transition-based
+algorithm used encodes certain assumptions that are effective for "traditional"
+named entity recognition tasks, but may not be a good fit for every span
+identification problem. Specifically, the loss function optimizes for **whole
+entity accuracy**, so if your inter-annotator agreement on boundary tokens is
+low, the component will likely perform poorly on your problem. The
+transition-based algorithm also assumes that the most decisive information about
+your entities will be close to their initial tokens. If your entities are long
+and characterized by tokens in their middle, the component will likely not be a
+good fit for your task.
+
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -23,18 +35,17 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.ner import DEFAULT_NER_MODEL
> config = {
> "moves": None,
-> # TODO: rest
+> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
> }
> nlp.add_pipe("ner", config=config)
> ```
-
-
-| Setting | Type | Description | Default |
-| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list | | `None` |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
+| Setting | Type | Description | Default |
+| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- |
+| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
+| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx
@@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-
-
-| Name | Type | Description |
-| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The shared vocabulary. |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
-| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves` | list | |
-| _keyword-only_ | | |
-| `update_with_oracle_cut_size` | int | |
-| `multitasks` | `Iterable` | |
-| `learn_tokens` | bool | |
-| `min_action_freq` | int | |
+| Name | Type | Description |
+| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
+| _keyword-only_ | | |
+| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 7464a029e..79782fd72 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects.
Update the models in the pipeline.
+
+
+The `Language.update` method now takes a batch of [`Example`](/api/example)
+objects instead of the raw texts and annotations or `Doc` and `GoldParse`
+objects. An [`Example`](/api/example) streamlines how data is passed around. It
+stores two `Doc` objects: one for holding the gold-standard reference data, and
+one for holding the predictions of the pipeline.
+
+For most use cases, you shouldn't have to write your own training scripts
+anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file
+and custom registered functions if needed. See the
+[training documentation](/usage/training) for details.
+
+
+
> #### Example
>
> ```python
@@ -253,7 +268,7 @@ Update the models in the pipeline.
| Name | Type | Description |
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
-| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 6a6bb1244..f1242d193 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -9,6 +9,28 @@ api_string_name: lemmatizer
api_trainable: false
---
+Component for assigning base forms to tokens using rules based on part-of-speech
+tags, or lookup tables. Functionality to train the component is coming soon.
+Different [`Language`](/api/language) subclasses can implement their own
+lemmatizer components via
+[language-specific factories](/usage/processing-pipelines#factories-language).
+The default data used is provided by the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
+extension package.
+
+
+
+As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be
+added to your pipeline, and not a hidden part of the vocab that runs behind the
+scenes. This makes it easier to customize how lemmas should be assigned in your
+pipeline.
+
+If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to
+be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning
+tags is available in the pipeline and runs _before_ the lemmatizer.
+
+
+
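+For illustration, here's a minimal sketch of the lookup mode on a blank
+pipeline. It assumes the
+[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package
+is installed, since the default tables are loaded from it:
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+# Default tables are loaded from spacy-lookups-data (see the `lookups` setting)
+nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+doc = nlp("She was reading the papers.")
+print([token.lemma_ for token in doc])
+```
+
+For the `"rule"` mode, you would instead add the component to a pipeline that
+already has a trained tagger, e.g.
+`nlp.add_pipe("lemmatizer", config={"mode": "rule"}, after="tagger")`.
+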
## Config and implementation
The default config is defined by the pipeline component factory and describes
@@ -29,7 +51,7 @@ lemmatizers, see the
| Setting | Type | Description | Default |
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
-| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
+| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
@@ -55,15 +77,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-| Name | Type | Description |
-| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
-| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| _keyword-only_ | | |
-| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". |
-| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
-| overwrite | bool | Whether to overwrite existing lemmas. |
+| Name | Type | Description |
+| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| _keyword-only_ | | |
+| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
+| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
+| overwrite | bool | Whether to overwrite existing lemmas. |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 925c9ad2e..b481f1972 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -5,6 +5,82 @@ tag: class
source: spacy/matcher/matcher.pyx
---
+The `Matcher` lets you find words and phrases using rules describing their token
+attributes. Rules can refer to token annotations (like the text or
+part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
+Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
+tokens in context. For in-depth examples and workflows for combining rules and
+statistical models, see the [usage guide](/usage/rule-based-matching) on
+rule-based matching.
+
+## Pattern format {#patterns}
+
+> ```json
+> ### Example
+> [
+> {"LOWER": "i"},
+> {"LEMMA": {"IN": ["like", "love"]}},
+> {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+A pattern added to the `Matcher` consists of a list of dictionaries. Each
+dictionary describes **one token** and its attributes. The available token
+pattern keys correspond to a number of
+[`Token` attributes](/api/token#attributes). The supported attributes for
+rule-based matching are:
+
+| Attribute | Type | Description |
+| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
+| `ORTH` | str | The exact verbatim text of a token. |
+| `TEXT` 2.1 | str | The exact verbatim text of a token. |
+| `LOWER` | str | The lowercase form of the token text. |
+| `LENGTH` | int | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE` | str | The token's entity label. |
+| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
+
+Operators and quantifiers define **how often** a token pattern should be
+matched:
+
+> ```json
+> ### Example
+> [
+> {"POS": "ADJ", "OP": "*"},
+> {"POS": "NOUN", "OP": "+"}
+> ]
+> ```
+
+| OP | Description |
+| --- | ---------------------------------------------------------------- |
+| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
+| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
+| `+` | Require the pattern to match 1 or more times. |
+| `*` | Allow the pattern to match zero or more times. |
+
+Token patterns can also map to a **dictionary of properties** instead of a
+single value to indicate whether the expected value is a member of a list or how
+it compares to another value.
+
+> ```json
+> ### Example
+> [
+> {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
+> {"POS": "PROPN", "LENGTH": {">=": 10}},
+> ]
+> ```
+
+| Attribute | Type | Description |
+| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
+| `IN` | any | Attribute value is a member of a list. |
+| `NOT_IN` | any | Attribute value is _not_ a member of a list. |
+| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal to, greater than or equal to, less than or equal to, greater than, or less than the given value. |
+
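+For illustration, here's a small, self-contained sketch combining an `IN` list
+with a `LENGTH` comparison. It only uses lexical attributes, so no trained
+components are required:
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+# Match "very" or "really" followed by an alphabetic token of 5+ characters
+pattern = [
+    {"LOWER": {"IN": ["very", "really"]}},
+    {"IS_ALPHA": True, "LENGTH": {">=": 5}},
+]
+matcher.add("INTENSIFIED", [pattern])
+doc = nlp("It was really wonderful and very good.")
+for match_id, start, end in matcher(doc):
+    print(doc[start:end].text)  # "really wonderful"
+```
+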
## Matcher.\_\_init\_\_ {#init tag="method"}
Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.
| Name | Type | Description |
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `docs` | iterable | A stream of documents. |
+| `docs` | iterable | A stream of documents or spans. |
| `batch_size` | int | The number of documents to accumulate into a working set. |
| `return_matches` 2.1 | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
@@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.
## Matcher.add {#add tag="method" new="2"}
-Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
-callback function to act on the matches. The callback function will receive the
-arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
-the given ID, the patterns will be extended. An `on_match` callback will be
-overwritten.
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
> #### Example
>
@@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
-| Name | Type | Description |
-| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | str | An ID for the thing you're matching. |
-| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
-| _keyword-only_ | | |
-| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| Name | Type | Description |
+| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
+| _keyword-only_ | | |
+| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+| `greedy` 3 | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
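+
+For illustration, a short sketch of the `greedy` filter: with `"LONGEST"`, only
+the longest of several overlapping matches produced by the `+` operator is
+returned (illustrative only; no trained components required):
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+pattern = [{"LOWER": "new"}, {"IS_TITLE": True, "OP": "+"}]
+# Without greedy="LONGEST", both "New York" and "New York City" would match
+matcher.add("PLACE", [pattern], greedy="LONGEST")
+doc = nlp("She visited New York City last week.")
+print([doc[start:end].text for _, start, end in matcher(doc)])  # ['New York City']
+```
+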
## Matcher.remove {#remove tag="method" new="2"}
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index bfe5c3c77..942440234 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-
-
| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
-| `labels_morph` | dict | |
-| `labels_pos` | dict | |
+| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
+| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
## Morphologizer.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index 866aca096..71c7a463b 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -9,7 +9,8 @@ new: 2
The `PhraseMatcher` lets you efficiently match large terminology lists. While
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
-objects.
+objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
+examples.
## PhraseMatcher.\_\_init\_\_ {#init tag="method"}
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index d9b8f4caf..233171779 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tagger", config=config)
> ```
-| Setting | Type | Description | Default |
-| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- |
-| `set_morphology` | bool | Whether to set morphological features. | `False` |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
+| Setting | Type | Description | Default |
+| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
+| `set_morphology` | bool | Whether to set morphological features. | `False` |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
@@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
-| Name | Type | Description |
-| ---------------- | ------- | ------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The shared vocabulary. |
-| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
-| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| _keyword-only_ | | |
-| `set_morphology` | bool | Whether to set morphological features. |
+| Name | Type | Description |
+| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| _keyword-only_ | | |
+| `set_morphology` | bool | Whether to set morphological features. |
## Tagger.\_\_call\_\_ {#call tag="method"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 1efd5831c..5af540828 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -9,6 +9,12 @@ api_string_name: textcat
api_trainable: true
---
+The text categorizer predicts **categories over a whole document**. It can learn
+one or more labels, and the labels can be mutually exclusive (i.e. one true
+label per document) or non-mutually exclusive (i.e. zero or more labels may be
+true per document). The multi-label setting is controlled by the model instance
+that's provided.
+
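+For illustration, a sketch of reading the predicted categories. The pipeline
+name is hypothetical and stands in for any pipeline trained with a `textcat`
+component:
+
+```python
+import spacy
+
+nlp = spacy.load("my_textcat_pipeline")  # hypothetical trained pipeline
+doc = nlp("This was a thoroughly enjoyable film.")
+# Scores per category; whether they are mutually exclusive depends on the model
+print(doc.cats)  # e.g. {"POSITIVE": 0.95, "NEGATIVE": 0.05}
+```
+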
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@@ -29,10 +35,10 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("textcat", config=config)
> ```
-| Setting | Type | Description | Default |
-| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- |
-| `labels` | `Iterable[str]` | The labels to use. | `[]` |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
+| Setting | Type | Description | Default |
+| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- |
+| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
@@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and
| _keyword-only_ | | |
| `labels` | `Iterable[str]` | The labels to use. |
-
-
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index f810793ce..dce595023 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -8,7 +8,20 @@ api_string_name: tok2vec
api_trainable: true
---
-
+Apply a "token-to-vector" model and set its outputs in the `Doc.tensor`
+attribute. This is mostly useful to **share a single subnetwork** between
+multiple components, e.g. to have one embedding and CNN network shared between a
+[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and
+[`EntityRecognizer`](/api/entityrecognizer).
+
+In order to use the `Tok2Vec` predictions, subsequent components should use the
+[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec
+subnetwork of their model. This layer will read data from the `doc.tensor`
+attribute during prediction. During training, the `Tok2Vec` component will save
+its prediction and backprop callback for each batch, so that the subsequent
+components can backpropagate to the shared weights. This implementation is used
+because it allows us to avoid relying on object identity within the models to
+achieve the parameter sharing.
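+
+For illustration, a quick sketch of the output, assuming a trained pipeline
+(such as `en_core_web_sm`) that includes a `tok2vec` component:
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")  # assumed: trained pipeline with tok2vec
+doc = nlp("The quick brown fox jumps over the lazy dog.")
+# One context-sensitive vector per token, written to Doc.tensor
+print(doc.tensor.shape)  # (number of tokens, embedding width)
+```
+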
## Config and implementation {#config}
@@ -27,9 +40,9 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tok2vec", config=config)
> ```
-| Setting | Type | Description | Default |
-| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
+| Setting | Type | Description | Default |
+| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
@@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and
## Tok2Vec.\_\_call\_\_ {#call tag="method"}
-Apply the pipe to one document. The document is modified in place, and returned.
-This usually happens under the hood when the `nlp` object is called on a text
-and all pipeline components are applied to the `Doc` in order. Both
+Apply the pipe to one document and add context-sensitive embeddings to the
+`Doc.tensor` attribute, allowing them to be used as features by downstream
+components. The document is modified in place, and returned. This usually
+happens under the hood when the `nlp` object is called on a text and all
+pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the
[`predict`](/api/tok2vec#predict) and
[`set_annotations`](/api/tok2vec#set_annotations) methods.
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index b63a4adba..0b3167901 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
-
+
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
@@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
-
-
-| Name | Type | Description |
-| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
-| `tolerance` | float | |
-| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
-| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
+| Name | Type | Description |
+| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `seqs` | `Iterable[Any]` | The sequences to minibatch. |
+| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
+| `tolerance` | float | What percentage of the size to allow batches to exceed. |
+| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. |
+| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
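+
+To make the interaction of `size` and `tolerance` concrete, here's a
+framework-free sketch of the word budget a batch may reach (illustrative only,
+not the registered function's implementation):
+
+```python
+def max_batch_words(size: int, tolerance: float) -> float:
+    # A batch may exceed the target size by the given percentage,
+    # e.g. size=5000 with tolerance=0.2 allows up to 6000 words.
+    return size * (1 + tolerance)
+
+assert max_batch_words(5000, 0.2) == 6000
+```
+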
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
-
-
> #### Example config
>
> ```ini
@@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
-
+Create a batcher that produces batches of the specified size.
-| Name | Type | Description |
-| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
-| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
+| Name | Type | Description |
+| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
+| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
-
-
> #### Example config
>
> ```ini
> [training.batcher]
-> @batchers = "batch_by_words.v1"
+> @batchers = "batch_by_padded.v1"
> size = 100
-> buffer = TODO:
+> buffer = 256
> discard_oversize = false
> get_length = null
> ```
-| Name | Type | Description |
-| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
-| `buffer` | int | |
-| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
-| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
+Minibatch a sequence by the size of padded batches that would result, with
+sequences binned by length within a window. The padded size is defined as the
+maximum length of sequences within the batch multiplied by the number of
+sequences in the batch.
+
+| Name | Type | Description |
+| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
+| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. |
+| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. |
+| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
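+
+The padded-size calculation itself is easy to illustrate in plain Python (a
+conceptual sketch, not the registered function):
+
+```python
+def padded_size(batch):
+    # Longest sequence in the batch times the number of sequences: the total
+    # number of cells after padding every sequence to the same length.
+    return max(len(seq) for seq in batch) * len(batch)
+
+batch = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]
+assert padded_size(batch) == 15  # 5 (longest) * 3 (sequences); 5 cells are padding
+```
+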
## Training data and alignment {#gold source="spacy/gold"}
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index 6b6be6bd0..57f06cd9e 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -25,8 +25,15 @@ work out-of-the-box.
-This pipeline component lets you use transformer models in your pipeline. The
-component assigns the output of the transformer to the Doc's extension
+This pipeline component lets you use transformer models in your pipeline.
+Supports all models that are available via the
+[HuggingFace `transformers`](https://huggingface.co/transformers) library.
+Usually you will connect subsequent components to the shared transformer using
+the [TransformerListener](/api/architectures#TransformerListener) layer. This
+works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
+[Tok2VecListener](/api/architectures#Tok2VecListener) sublayer.
+
+The component assigns the output of the transformer to the `Doc`'s extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
@@ -53,11 +60,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ```
-| Setting | Type | Description | Default |
-| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
-| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
-| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
+| Setting | Type | Description | Default |
+| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
+| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
+| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@@ -86,18 +93,22 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
> trf = Transformer(nlp.vocab, model)
> ```
-Create a new pipeline instance. In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.add_pipe`](/api/language#create_pipe).
+Construct a `Transformer` component. One or more subsequent spaCy components can
+use the transformer outputs as features in their models, with gradients
+backpropagated to the single shared weights. The activations from the
+transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension
+attribute. You can also provide a callback to set additional annotations. In
+your application, you would normally use a shortcut for this and instantiate the
+component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
-| Name | Type | Description |
-| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The shared vocabulary. |
-| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
-| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. |
-| _keyword-only_ | | |
-| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
+| Name | Type | Description |
+| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. |
+| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. |
+| _keyword-only_ | | |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
## Transformer.\_\_call\_\_ {#call tag="method"}
@@ -184,7 +195,10 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Transformer.set_annotations {#set_annotations tag="method"}
-Modify a batch of documents, using pre-computed scores.
+Assign the extracted features to the `Doc` objects. By default, the
+[`TransformerData`](/api/transformer#transformerdata) object is written to the
+[`Doc._.trf_data`](#custom-attributes) attribute. Your `annotation_setter`
+callback is then called, if provided.
> #### Example
>
@@ -201,8 +215,19 @@ Modify a batch of documents, using pre-computed scores.
## Transformer.update {#update tag="method"}
-Learn from a batch of documents and gold-standard information, updating the
-pipe's model. Delegates to [`predict`](/api/transformer#predict).
+Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec)
+component, the `Transformer` component is unusual in that it does not receive
+"gold standard" annotations to calculate a weight update. The optimal output of
+the transformer data is unknown – it's a hidden layer inside the network that is
+updated by backpropagating from output layers.
+
+The `Transformer` component therefore does **not** perform a weight update
+during its own `update` method. Instead, it runs its transformer model and
+communicates the output and the backpropagation callback to any **downstream
+components** that have been connected to it via the
+[TransformerListener](/api/architectures#TransformerListener) sublayer. If there
+are multiple listeners, the last layer will actually backprop to the transformer
+and call the optimizer, while the others simply increment the gradients.
> #### Example
>
@@ -212,15 +237,15 @@ pipe's model. Delegates to [`predict`](/api/transformer#predict).
> losses = trf.update(examples, sgd=optimizer)
> ```
-| Name | Type | Description |
-| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
-| _keyword-only_ | | |
-| `drop` | float | The dropout rate. |
-| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
-| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
-| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
-| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+| Name | Type | Description |
+| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`        | `Iterable[Example]`                                 | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used; the reference `Doc` is ignored.  |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
## Transformer.create_optimizer {#create_optimizer tag="method"}
@@ -394,21 +419,23 @@ Split a `TransformerData` object that represents a batch into a list with one
| ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | |
-## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
-
-
+## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a list of [`Span`](/api/span) objects for each doc, to be processed by
-the transformer. The returned spans can overlap. Span getters can be referenced
-in the config's `[components.transformer.model.get_spans]` block to customize
-the sequences processed by the transformer. You can also register custom span
-getters using the `@registry.span_getters` decorator.
+the transformer. This is used to manage long documents by cutting them into
+smaller sequences before running the transformer. The spans are allowed to
+overlap, and you can also omit sections of the `Doc` if they are not relevant.
+
+Span getters can be referenced in the `[components.transformer.model.get_spans]`
+block of the config to customize the sequences processed by the transformer. You
+can also register custom span getters using the `@spacy.registry.span_getters`
+decorator.
> #### Example
>
> ```python
-> @registry.span_getters("sent_spans.v1")
+> @spacy.registry.span_getters("sent_spans.v1")
> def configure_get_sent_spans() -> Callable:
> def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
> return [list(doc.sents) for doc in docs]
@@ -421,15 +448,55 @@ getters using the `@registry.span_getters` decorator.
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. |
-The following built-in functions are available:
+### doc_spans.v1 {#doc_spans tag="registered function"}
-
+> #### Example config
+>
+> ```ini
+> [transformer.model.get_spans]
+> @span_getters = "doc_spans.v1"
+> ```
-| Name | Description |
-| ------------------ | ------------------------------------------------------------------ |
-| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
-| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
-| `strided_spans.v1` | |
+Create a span getter that uses the whole document as its spans. This is the best
+approach if your [`Doc`](/api/doc) objects already refer to relatively short
+texts.
+
+### sent_spans.v1 {#sent_spans tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [transformer.model.get_spans]
+> @span_getters = "sent_spans.v1"
+> ```
+
+Create a span getter that uses sentence boundary markers to extract the spans.
+This requires sentence boundaries to be set (e.g. by the
+[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches,
+depending on the sentence lengths. However, it does provide the transformer with
+more meaningful windows to attend over.
+
+### strided_spans.v1 {#strided_spans tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [transformer.model.get_spans]
+> @span_getters = "strided_spans.v1"
+> window = 128
+> stride = 96
+> ```
+
+Create a span getter for strided spans. If you set the `window` and `stride` to
+the same value, the spans will cover each token once. Setting `stride` lower
+than `window` will allow for an overlap, so that some tokens are counted twice.
+This can be desirable, because it allows all tokens to have both a left and
+right context.
+
+| Name | Type | Description |
+| --------- | ---- | ---------------- |
+| `window` | int | The window size. |
+| `stride` | int | The stride size. |
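+
+For illustration, here's a minimal sketch of the windowing logic a strided span
+getter applies. This is **not** the registered implementation from
+`spacy-transformers`, just an approximation of the idea:
+
+```python
+def get_strided_spans(docs, window: int, stride: int):
+    # Move a window of `window` tokens over each doc, advancing by `stride`
+    # tokens per step. With stride < window, consecutive spans overlap.
+    all_spans = []
+    for doc in docs:
+        spans = []
+        start = 0
+        while start < len(doc):
+            spans.append(doc[start : start + window])
+            start += stride
+        all_spans.append(spans)
+    return all_spans
+```
+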
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md
index d321b7cb9..42ba44dc8 100644
--- a/website/docs/usage/101/_architecture.md
+++ b/website/docs/usage/101/_architecture.md
@@ -1,54 +1,88 @@
-The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc`
-object owns the **sequence of tokens** and all their annotations. The `Vocab`
-object owns a set of **look-up tables** that make common information available
-across documents. By centralizing strings, word vectors and lexical attributes,
-we avoid storing multiple copies of this data. This saves memory, and ensures
-there's a **single source of truth**.
+The central data structures in spaCy are the [`Language`](/api/language) class,
+the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class
+is used to process a text and turn it into a `Doc` object. It's typically stored
+as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and
+all their annotations. By centralizing strings, word vectors and lexical
+attributes in the `Vocab`, we avoid storing multiple copies of this data. This
+saves memory, and ensures there's a **single source of truth**.
Text annotations are also designed to allow a single source of truth: the `Doc`
-object owns the data, and `Span` and `Token` are **views that point into it**.
-The `Doc` object is constructed by the `Tokenizer`, and then **modified in
-place** by the components of the pipeline. The `Language` object coordinates
-these components. It takes raw text and sends it through the pipeline, returning
-an **annotated document**. It also orchestrates training and serialization.
+object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are
+**views that point into it**. The `Doc` object is constructed by the
+[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components
+of the pipeline. The `Language` object coordinates these components. It takes
+raw text and sends it through the pipeline, returning an **annotated document**.
+It also orchestrates training and serialization.
-
+

### Container objects {#architecture-containers}
-| Name | Description |
-| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
-| [`Span`](/api/span) | A slice from a `Doc` object. |
-| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
-| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
-| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
+| Name | Description |
+| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
+| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
+| [`Span`](/api/span) | A slice from a `Doc` object. |
+| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
+| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
+| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
+| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
### Processing pipeline {#architecture-pipeline}
-| Name | Description |
-| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
-| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. |
-| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
-| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
-| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. |
-| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. |
-| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. |
-| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. |
-| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
-| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
-| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
-| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. |
-| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
+The processing pipeline consists of one or more **pipeline components** that are
+called on the `Doc` in order. The tokenizer runs before the components. Pipeline
+components can be added using [`Language.add_pipe`](/api/language#add_pipe).
+They can contain a statistical model and trained weights, or only make
+rule-based modifications to the `Doc`. spaCy provides a range of built-in
+components for different language processing tasks and also allows adding
+[custom components](/usage/processing-pipelines#custom-components).
+
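+For example, adding a built-in component to a blank pipeline and running it on
+a text looks like this (a minimal sketch using the rule-based sentencizer;
+trained and custom components are added the same way, by their string name):
+
+```python
+import spacy
+
+nlp = spacy.blank("en")
+nlp.add_pipe("sentencizer")  # add a built-in component by its string name
+doc = nlp("This is a sentence. This is another sentence.")
+print([sent.text for sent in doc.sents])
+```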
+
+
+| Name | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
+| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
+| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
+| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
+| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
+| [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
+| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
+| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
+| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
+| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
+| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
+| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. |
+| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
+| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
+| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
+| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
+
+### Matchers {#architecture-matchers}
+
+Matchers help you find and extract information from [`Doc`](/api/doc) objects
+based on match patterns describing the sequences you're looking for. A matcher
+operates on a `Doc` and gives you access to the matched tokens **in context**.
+
+| Name | Description |
+| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
+| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
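+
+For example, here's a minimal `Matcher` sketch that finds the phrase
+"hello world" in a `Doc`, regardless of casing:
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+# One pattern: a token whose lowercase form is "hello", followed by "world"
+matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
+
+doc = nlp("Hello world! This text also contains another hello world.")
+for match_id, start, end in matcher(doc):
+    print(doc[start:end].text)  # the matched tokens, in context of the doc
+```
+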
### Other classes {#architecture-other}
-| Name | Description |
-| --------------------------------- | ----------------------------------------------------------------------------- |
-| [`Vocab`](/api/vocab) | A lookup table for the vocabulary that allows you to access `Lexeme` objects. |
-| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
-| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
-| [`Example`](/api/example) | Collection for training annotations. |
+| Name | Description |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
+| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
+| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
+| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
+| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
+| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
+| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
+| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
+| [`Scorer`](/api/scorer) | Compute evaluation scores. |
+| [`Corpus`](/api/corpus)               | Class for managing annotated corpora for training and evaluation data.                                            |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5ad59482f..589cef44c 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -750,16 +750,13 @@ print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
-remaining substring:
+remaining substring. The special case rules also have precedence over the
+punctuation splitting.
```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
-```
-The special case rules have precedence over the punctuation splitting:
-
-```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
@@ -813,19 +810,6 @@ domain. There are six things you may need to define:
6. An optional boolean function `url_match`, which is similar to `token_match`
except that prefixes and suffixes are removed before applying the match.
-
-
-In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`
-above and there was no match pattern applied before prefixes and suffixes were
-analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its
-behavior in v2.2.1 and earlier with precedence over prefixes and suffixes.
-
-The `url_match` is introduced in v2.3.0 to handle cases like URLs where the
-tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a
-URL) before applying the match.
-
-
-
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its
`.search()` and `.finditer()` methods:
@@ -905,12 +889,13 @@ function that behaves the same way.
-If you're using a statistical model, writing to the `nlp.Defaults` or
-`English.Defaults` directly won't work, since the regular expressions are read
-from the model and will be compiled when you load it. If you modify
-`nlp.Defaults`, you'll only see the effect if you call
-[`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the tokenizer
-loaded from a statistical model, you should modify `nlp.tokenizer` directly.
+If you're using a statistical model, writing to the
+[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
+work, since the regular expressions are read from the model and will be compiled
+when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
+you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
+tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
+directly.
@@ -961,51 +946,50 @@ and language-specific definitions such as
[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py)
for German.
-### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer}
+### Hooking a custom tokenizer into the pipeline {#custom-tokenizer}
The tokenizer is the first component of the processing pipeline and the only one
that can't be replaced by writing to `nlp.pipeline`. This is because it has a
different signature from all the other components: it takes a text and returns a
-`Doc`, whereas all other components expect to already receive a tokenized `Doc`.
+[`Doc`](/api/doc), whereas all other components expect to already receive a
+tokenized `Doc`.

To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
-custom function that takes a text, and returns a `Doc`.
+custom function that takes a text, and returns a [`Doc`](/api/doc).
+
+> #### Creating a Doc
+>
+> Constructing a [`Doc`](/api/doc) object manually requires at least two
+> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in
+> a list of `spaces` values indicating whether the token at this position is
+> followed by a space (default `True`). See the section on
+> [pre-tokenized text](#own-annotations) for more info.
+>
+> ```python
+> words = ["Let", "'s", "go", "!"]
+> spaces = [False, True, False, False]
+> doc = Doc(nlp.vocab, words=words, spaces=spaces)
+> ```
```python
-nlp = spacy.load("en_core_web_sm")
+nlp = spacy.blank("en")
nlp.tokenizer = my_tokenizer
```
-| Argument | Type | Description |
-| ----------- | ----- | ------------------------- |
-| `text` | str | The raw text to tokenize. |
-| **RETURNS** | `Doc` | The tokenized document. |
+| Argument | Type | Description |
+| ----------- | ----------------- | ------------------------- |
+| `text` | str | The raw text to tokenize. |
+| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
-
+#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}
-In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc`
-keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This
-was unnecessarily complicated. Since spaCy v2.0, you can write to
-`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a
-function and use `nlp.vocab`.
-
-```diff
-- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer)
-- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory)
-
-+ nlp.tokenizer = my_tokenizer
-+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
-```
-
-
-
-### Example: A custom whitespace tokenizer {#custom-tokenizer-example}
-
-To construct the tokenizer, we usually want attributes of the `nlp` pipeline.
-Specifically, we want the tokenizer to hold a reference to the vocabulary
-object. Let's say we have the following class as our tokenizer:
+Here's an example of the most basic whitespace tokenizer. It takes the shared
+vocab, so it can construct `Doc` objects. When it's called on a text, it returns
+a `Doc` object consisting of the text split on single space characters. We can
+then overwrite the `nlp.tokenizer` attribute with an instance of our custom
+tokenizer.
```python
### {executable="true"}
@@ -1017,68 +1001,189 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
- words = text.split(' ')
- # All tokens 'own' a subsequent space character in this tokenizer
- spaces = [True] * len(words)
- return Doc(self.vocab, words=words, spaces=spaces)
+ words = text.split(" ")
+ return Doc(self.vocab, words=words)
-nlp = spacy.load("en_core_web_sm")
+nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
-print([t.text for t in doc])
+print([token.text for token in doc])
```
-As you can see, we need a `Vocab` instance to construct this — but we won't have
-it until we get back the loaded `nlp` object. The simplest solution is to build
-the tokenizer in two steps. This also means that you can reuse the "tokenizer
-factory" and initialize it with different instances of `Vocab`.
+#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2}
-### Bringing your own annotations {#own-annotations}
+You can use the same approach to plug in any other third-party tokenizers. Your
+custom callable just needs to return a `Doc` object with the tokens produced by
+your tokenizer. In this example, the wrapper uses the **BERT word piece
+tokenizer**, provided by the
+[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens
+available in the `Doc` object returned by spaCy now match the exact word pieces
+produced by the tokenizer.
-spaCy generally assumes by default that your data is raw text. However,
+> #### 💡 Tip: spacy-transformers
+>
+> If you're working with transformer models like BERT, check out the
+> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
+> extension package and [documentation](/usage/transformers). It includes a
+> pipeline component for using pretrained transformer weights and **training
+> transformer models** in spaCy, as well as helpful utilities for aligning word
+> pieces to linguistic tokenization.
+
+```python
+### Custom BERT word piece tokenizer
+from tokenizers import BertWordPieceTokenizer
+from spacy.tokens import Doc
+import spacy
+
+class BertTokenizer:
+ def __init__(self, vocab, vocab_file, lowercase=True):
+ self.vocab = vocab
+ self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)
+
+ def __call__(self, text):
+ tokens = self._tokenizer.encode(text)
+ words = []
+ spaces = []
+        for i, (token_text, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
+            words.append(token_text)
+ if i < len(tokens.tokens) - 1:
+ # If next start != current end we assume a space in between
+ next_start, next_end = tokens.offsets[i + 1]
+ spaces.append(next_start > end)
+ else:
+ spaces.append(True)
+ return Doc(self.vocab, words=words, spaces=spaces)
+
+nlp = spacy.blank("en")
+nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
+doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
+print(doc.text, [token.text for token in doc])
+# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
+# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
+# ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
+```
+
+
+
+Keep in mind that your model's results may be less accurate if the tokenization
+during training differs from the tokenization at runtime. So if you modify a
+pretrained model's tokenization afterwards, it may produce very different
+predictions. You should therefore train your model with the **same tokenizer**
+it will be using at runtime. See the docs on
+[training with custom tokenization](#custom-tokenizer-training) for details.
+
+
+
+#### Training with custom tokenization {#custom-tokenizer-training new="3"}
+
+spaCy's [training config](/usage/training#config) describes the settings,
+hyperparameters, pipeline and tokenizer used for constructing and training the
+model. The `[nlp.tokenizer]` block refers to a **registered function** that
+takes the `nlp` object and returns a tokenizer. Here, we're registering a
+function called `whitespace_tokenizer` in the
+[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+construct your tokenizer during training, you can pass in your Python file by
+setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "whitespace_tokenizer"
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("whitespace_tokenizer")
+def create_whitespace_tokenizer():
+ def create_tokenizer(nlp):
+ return WhitespaceTokenizer(nlp.vocab)
+
+ return create_tokenizer
+```
+
+Registered functions can also take arguments that are then passed in from the
+config. This allows you to quickly change and keep track of different settings.
+Here, the registered function called `bert_word_piece_tokenizer` takes two
+arguments: the path to a vocabulary file and whether to lowercase the text. The
+Python type hints `str` and `bool` ensure that the received values have the
+correct type.
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "bert_word_piece_tokenizer"
+> vocab_file = "bert-base-uncased-vocab.txt"
+> lowercase = true
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("bert_word_piece_tokenizer")
+def create_bert_word_piece_tokenizer(vocab_file: str, lowercase: bool):
+    def create_tokenizer(nlp):
+        return BertTokenizer(nlp.vocab, vocab_file, lowercase)
+
+ return create_tokenizer
+```
+
+To avoid hard-coding local paths into your config file, you can also set the
+vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
+[override](/usage/training#config-overrides) when you run
+[`spacy train`](/api/cli#train). For more details on using registered functions,
+see the docs in [training with custom code](/usage/training#custom-code).
+
+
+
+Remember that a registered function should always be a function that spaCy
+**calls to create something**, not the "something" itself. In this case, it
+**creates a function** that takes the `nlp` object and returns a callable that
+takes a text and returns a `Doc`.
+
+
+
+#### Using pre-tokenized text {#own-annotations}
+
+spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
-part-of-speech tags, etc. The most common situation is that you have pre-defined
-tokenization. If you have a list of strings, you can create a `Doc` object
-directly. Optionally, you can also specify a list of boolean values, indicating
-whether each word has a subsequent space.
+part-of-speech tags, etc. The most common situation is that you have
+**pre-defined tokenization**. If you have a list of strings, you can create a
+[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
+boolean values, indicating whether each word is followed by a space.
+
+> #### ✏️ Things to try
+>
+> 1. Change a boolean value in the list of `spaces`. You should see it reflected
+> in the `doc.text` and whether the token is followed by a space.
+> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
+> now followed by a space.
+> 3. Copy-paste a random sentence from the internet and manually construct a
+> `Doc` with `words` and `spaces` so that the `doc.text` matches the original
+> input text.
```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
-from spacy.lang.en import English
-nlp = English()
-doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
- spaces=[False, True, False, False])
+nlp = spacy.blank("en")
+words = ["Hello", ",", "world", "!"]
+spaces = [False, True, False, False]
+doc = Doc(nlp.vocab, words=words, spaces=spaces)
+print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```
-If provided, the spaces list must be the same length as the words list. The
+If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
-will assume that all words are whitespace delimited.
+will assume that all words are followed by a space. Once you have a
+[`Doc`](/api/doc) object, you can write to its attributes to set the
+part-of-speech tags, syntactic dependencies, named entities and other
+attributes.
-```python
-### {executable="true"}
-import spacy
-from spacy.tokens import Doc
-from spacy.lang.en import English
-
-nlp = English()
-bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
-good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
- spaces=[False, True, False, False])
-
-print(bad_spaces.text) # 'Hello , world !'
-print(good_spaces.text) # 'Hello, world!'
-```
-
-Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
-the part-of-speech tags, syntactic dependencies, named entities and other
-attributes. For details, see the respective usage pages.
-
-### Aligning tokenization {#aligning-tokenization}
+#### Aligning tokenization {#aligning-tokenization}
spaCy's tokenization is non-destructive and uses language-specific rules
optimized for compatibility with treebank annotations. Other tools and resources
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 2e07eff48..d7c3d49f8 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -979,8 +979,8 @@ added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
called on a text, it will find matches in the `doc` and add them as entities to
the `doc.ents`, using the specified pattern label as the entity label. If any
matches were to overlap, the pattern matching most tokens takes priority. If
-they also happen to be equally long, then the match occuring first in the Doc is
-chosen.
+they also happen to be equally long, then the match occurring first in the `Doc`
+is chosen.
```python
### {executable="true"}
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index d20d87863..36f934e96 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -6,25 +6,97 @@ menu:
- ['New Features', 'features']
- ['Backwards Incompatibilities', 'incompat']
- ['Migrating from v2.x', 'migrating']
- - ['Migrating plugins', 'plugins']
---
## Summary {#summary}
## New Features {#features}
+### New training workflow and config system {#features-training}
+
+### Transformer-based pipelines {#features-transformers}
+
+### Custom models using any framework {#features-custom-models}
+
+### Manage end-to-end workflows with projects {#features-projects}
+
+### New built-in pipeline components {#features-pipeline-components}
+
+| Name | Description |
+| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
+| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
+| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
+| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
+| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
+
+### New and improved pipeline component APIs {#features-components}
+
+- `Language.factory`, `Language.component`
+- `Language.analyze_pipes`
+- Adding components from other models
+
+### Type hints and type-based data validation {#features-types}
+
+spaCy v3.0 officially drops support for Python 2 and now requires **Python
+3.6+**. This also means that the code base can take full advantage of
+[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
+API that's implemented in pure Python (as opposed to Cython) now comes with type
+hints. The new version of spaCy's machine learning library
+[Thinc](https://thinc.ai) also features extensive
+[type support](https://thinc.ai/docs/usage-type-checking/), including custom
+types for models and arrays, and a custom `mypy` plugin that can be used to
+type-check model definitions.
+
+For data validation, spaCy v3.0 adopts
+[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
+validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
+lets you register **custom functions with typed arguments**, reference them
+in your config and see validation errors if the argument values don't match.
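+
+For example, a registered tokenizer factory with a typed argument might look
+like the following minimal sketch (the registry entry name
+`"lowercase_tokenizer"` and its argument are made up for illustration). If the
+config provides a value of the wrong type, e.g. the string `"yes"` for
+`lowercase`, you'll see a validation error instead of a confusing failure later
+on.
+
+```python
+import spacy
+from spacy.tokens import Doc
+
+@spacy.registry.tokenizers("lowercase_tokenizer")
+def create_lowercase_tokenizer(lowercase: bool = True):
+    # The `bool` type hint is validated against the value set in the config
+    def create_tokenizer(nlp):
+        def tokenize(text):
+            words = text.lower().split() if lowercase else text.split()
+            return Doc(nlp.vocab, words=words)
+        return tokenize
+    return create_tokenizer
+```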
+
+### CLI
+
+| Name | Description |
+| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
+| [`init config`](/api/cli#init-config) | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
+| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
+| [`project`](/api/cli#project) | Subcommand for cloning and running [spaCy projects](/usage/projects). |
+
## Backwards Incompatibilities {#incompat}
-### Removed or renamed objects, methods, attributes and arguments {#incompat-removed}
+As always, we've tried to keep the breaking changes to a minimum and focus on
+changes that were necessary to support the new features, fix problems or improve
+usability. The following section lists the relevant changes to the user-facing
+API. For specific examples of how to rewrite your code, check out the
+[migration guide](#migrating).
-| Removed | Replacement |
-| -------------------------------------------------------- | ----------------------------------------- |
-| `GoldParse` | [`Example`](/api/example) |
-| `GoldCorpus` | [`Corpus`](/api/corpus) |
-| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
-| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
+### Compatibility {#incompat-compat}
-### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
+- spaCy now requires **Python 3.6+**.
+
+### API changes {#incompat-api}
+
+- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
+ the component factory instead of the component function.
+- **Custom pipeline components** now need to be decorated with the
+  [`@Language.component`](/api/language#component) or
+  [`@Language.factory`](/api/language#factory) decorator (see the example
+  below).
+- [`Language.update`](/api/language#update) now takes a batch of
+ [`Example`](/api/example) objects instead of raw texts and annotations, or
+ `Doc` and `GoldParse` objects.
+- The `Language.disable_pipes` contextmanager has been replaced by
+ [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
+ disable or enable components.
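+
+For example, under the new API a simple stateless component is registered with
+a decorator and added to the pipeline by name. This is a minimal sketch; the
+component name `"clean_text"` is made up for illustration:
+
+```python
+import spacy
+from spacy.language import Language
+
+@Language.component("clean_text")
+def clean_text(doc):
+    # A stateless component receives the Doc and returns it (possibly modified)
+    return doc
+
+nlp = spacy.blank("en")
+nlp.add_pipe("clean_text")  # components are now added by their string name
+```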
+
+### Removed or renamed API {#incompat-removed}
+
+| Removed | Replacement |
+| -------------------------------------------------------- | ----------------------------------------------------- |
+| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) |
+| `GoldParse` | [`Example`](/api/example) |
+| `GoldCorpus` | [`Corpus`](/api/corpus) |
+| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
+| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
The following deprecated methods, attributes and arguments were removed in v3.0.
Most of them have been **deprecated for a while** and many would previously
@@ -214,17 +286,14 @@ python -m spacy package ./model ./packages
- python setup.py sdist
```
-## Migration notes for plugin maintainers {#plugins}
+#### Migration notes for plugin maintainers {#migrating-plugins}
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
-We've tried to keep breaking changes to a minimum and make it as easy as
-possible for you to upgrade your packages for spaCy v3.
-
-### Custom pipeline components
-
-The most common use case for plugins is providing pipeline components and
-extension attributes.
+We've tried to make it as easy as possible for you to upgrade your packages for
+spaCy v3. The most common use case for plugins is providing pipeline components
+and extension attributes. When migrating your plugin, double-check the
+following:
- Use the [`@Language.factory`](/api/language#factory) decorator to register
your component and assign it a name. This allows users to refer to your
diff --git a/website/src/components/code.js b/website/src/components/code.js
index a51986634..952014ed5 100644
--- a/website/src/components/code.js
+++ b/website/src/components/code.js
@@ -11,7 +11,7 @@ import Link from './link'
import GitHubCode from './github'
import classes from '../styles/code.module.sass'
-const WRAP_THRESHOLD = 15
+const WRAP_THRESHOLD = 16
export default props => (