Merge branch 'develop' into nightly.spacy.io

This commit is contained in:
Ines Montani 2020-08-10 00:45:37 +02:00
commit 94da9f48de
48 changed files with 1374 additions and 499 deletions

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a5"
__version__ = "3.0.0a6"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -35,7 +35,7 @@ def pretrain_cli(
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
# fmt: on
):

View File

@ -1,4 +1,4 @@
from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
from typing import Union, Iterable, Sequence, TypeVar, List, Callable
from typing import Optional, Any
from functools import partial
import itertools
@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the `batch_by_padded_size` strategy.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int or Iterable[int]): The largest padded size to batch sequences into.
Can be a single integer, or a sequence, allowing for variable batch sizes.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
The `len` function is used by default.
"""
# Avoid displacing optional values from the underlying function.
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
@ -38,6 +54,16 @@ def configure_minibatch_by_words(
discard_oversize: bool,
get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that uses the "minibatch by words" strategy.
size (int or Iterable[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
item. The `len` function is used by default.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(
minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals
@ -48,22 +74,43 @@ def configure_minibatch_by_words(
def configure_minibatch(
size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
) -> BatcherT:
"""Create a batcher that creates batches of the specified size.
size (int or Iterable[int]): The target number of items per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
"""
optionals = {"get_length": get_length} if get_length is not None else {}
return partial(minibatch, size=size, **optionals)
def minibatch_by_padded_size(
docs: Iterator["Doc"],
seqs: Iterable[ItemT],
size: Sizing,
buffer: int = 256,
discard_oversize: bool = False,
get_length: Callable = len,
) -> Iterator[Iterator["Doc"]]:
) -> Iterable[List[ItemT]]:
"""Minibatch a sequence by the size of padded batches that would result,
with sequences binned by length within a window.
The padded size is defined as the maximum length of sequences within the
batch multiplied by the number of sequences in the batch.
size (int): The largest padded size to batch sequences into.
buffer (int): The number of sequences to accumulate before sorting by length.
A larger buffer will result in more even sizing, but if the buffer is
very large, the iteration order will be less random, which can result
in suboptimal training.
discard_oversize (bool): Whether to discard sequences that are by themselves
longer than the largest padded batch size.
get_length (Callable or None): Function to get the length of a sequence item.
The `len` function is used by default.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
for outer_batch in minibatch(docs, size=buffer):
for outer_batch in minibatch(seqs, size=buffer):
outer_batch = list(outer_batch)
target_size = next(size_)
for indices in _batch_by_length(outer_batch, target_size, get_length):
@ -76,12 +123,24 @@ def minibatch_by_padded_size(
def minibatch_by_words(
docs, size, tolerance=0.2, discard_oversize=False, get_length=len
):
seqs: Iterable[ItemT],
size: Sizing,
tolerance=0.2,
discard_oversize=False,
get_length=len,
) -> Iterable[List[ItemT]]:
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
themselves, or be discarded if discard_oversize=True.
The argument 'docs' can be a list of strings, Docs or Examples.
seqs (Iterable[Sequence]): The sequences to minibatch.
size (int or Iterable[int]): The target number of words per batch.
Can be a single integer, or a sequence, allowing for variable batch sizes.
tolerance (float): What percentage of the size to allow batches to exceed.
discard_oversize (bool): Whether to discard sequences that by themselves
exceed the tolerated size.
get_length (Callable or None): Function to get the length of a sequence
item. The `len` function is used by default.
"""
if isinstance(size, int):
size_ = itertools.repeat(size)
@ -95,20 +154,20 @@ def minibatch_by_words(
overflow = []
batch_size = 0
overflow_size = 0
for doc in docs:
n_words = get_length(doc)
for seq in seqs:
n_words = get_length(seq)
# if the current example exceeds the maximum batch size, it is returned separately
# but only if discard_oversize=False.
if n_words > target_size + tol_size:
if not discard_oversize:
yield [doc]
yield [seq]
# add the example to the current batch if there's no overflow yet and it still fits
elif overflow_size == 0 and (batch_size + n_words) <= target_size:
batch.append(doc)
batch.append(seq)
batch_size += n_words
# add the example to the overflow buffer if it fits in the tolerance margin
elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
overflow.append(doc)
overflow.append(seq)
overflow_size += n_words
# yield the previous batch and start a new one. The new one gets the overflow examples.
else:
@ -122,11 +181,11 @@ def minibatch_by_words(
overflow_size = 0
# this example still fits
if (batch_size + n_words) <= target_size:
batch.append(doc)
batch.append(seq)
batch_size += n_words
# this example fits in overflow
elif (batch_size + n_words) <= (target_size + tol_size):
overflow.append(doc)
overflow.append(seq)
overflow_size += n_words
# this example does not fit with the previous overflow: start another new batch
else:
@ -134,7 +193,7 @@ def minibatch_by_words(
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = [doc]
batch = [seq]
batch_size = n_words
batch.extend(overflow)
if batch:
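
The docstrings above define the padded-size strategy as the longest sequence in a batch multiplied by the number of sequences, and the by-words strategy as a word budget plus a tolerance margin. A minimal, self-contained sketch of the padded-size arithmetic (illustrative only, not the library code; the helper name `padded_size` is invented here):

```python
# Illustrative sketch of the "padded size" definition from the docstring:
# the longest sequence in the batch times the number of sequences.
def padded_size(batch, get_length=len):
    return max(get_length(seq) for seq in batch) * len(batch)

seqs = [["tok"] * n for n in (3, 4, 10, 2)]
print(padded_size(seqs[:2]))  # max(3, 4) * 2 == 8
print(padded_size(seqs))      # max(3, 4, 10, 2) * 4 == 40
```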

View File

@ -1,5 +1,3 @@
from typing import Optional
from ...pipeline import Lemmatizer
from ...tokens import Token

View File

@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .tokens import Doc
from .lookups import load_lookups
from .tokenizer import Tokenizer
from .errors import Errors, Warnings
from .schemas import ConfigSchema
@ -1439,10 +1438,7 @@ class Language:
or lang_cls is not cls
):
raise ValueError(Errors.E943.format(value=type(lang_cls)))
nlp = lang_cls(
vocab=vocab,
create_tokenizer=create_tokenizer,
)
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
if after_creation is not None:
nlp = after_creation(nlp)
if not isinstance(nlp, cls):

View File

@ -68,11 +68,11 @@ cdef class DependencyMatcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
return self.has_key(key)
def validateInput(self, pattern, key):
def validate_input(self, pattern, key):
idx = 0
visitedNodes = {}
visited_nodes = {}
for relation in pattern:
if "PATTERN" not in relation or "SPEC" not in relation:
raise ValueError(Errors.E098.format(key=key))
@ -83,7 +83,7 @@ cdef class DependencyMatcher:
and "NBOR_NAME" not in relation["SPEC"]
):
raise ValueError(Errors.E099.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
else:
if not(
"NODE_NAME" in relation["SPEC"]
@ -92,22 +92,28 @@ cdef class DependencyMatcher:
):
raise ValueError(Errors.E100.format(key=key))
if (
relation["SPEC"]["NODE_NAME"] in visitedNodes
or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
relation["SPEC"]["NODE_NAME"] in visited_nodes
or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
):
raise ValueError(Errors.E101.format(key=key))
visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1
def add(self, key, patterns, *_patterns, on_match=None):
"""Add a new matcher rule to the matcher.
key (str): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
"""
if patterns is None or hasattr(patterns, "__call__"): # old API
on_match = patterns
patterns = _patterns
for pattern in patterns:
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
self.validateInput(pattern,key)
self.validate_input(pattern, key)
key = self._normalize_key(key)
_patterns = []
for pattern in patterns:
@ -187,8 +193,7 @@ cdef class DependencyMatcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
key = self._normalize_key(key)
return key in self._patterns
return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@ -202,6 +207,13 @@ cdef class DependencyMatcher:
return (self._callbacks[key], self._patterns[key])
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over.
RETURNS (list): A list of `(key, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
matched_key_trees = []
matches = self.token_matcher(doc)
for key in list(self._patterns.keys()):
@ -241,25 +253,25 @@ cdef class DependencyMatcher:
on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
cdef bool isValid;
if(patternLength == len(id_to_position.keys())):
isValid = True
for node in range(patternLength):
if(node in tree):
for idx, (relop,nbor) in enumerate(tree[node]):
computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
isNbor = False
for computed_nbor in computed_nbors:
if(computed_nbor.i == visitedNodes[nbor]):
if(computed_nbor.i == visited_nodes[nbor]):
isNbor = True
isValid = isValid & isNbor
if(isValid):
matched_trees.append(visitedNodes)
matched_trees.append(visited_nodes)
return
allPatternNodes = numpy.asarray(id_to_position[patternLength])
for patternNode in allPatternNodes:
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)
# Given a node and an edge operator, to return the list of nodes
# from the doc that belong to node+operator. This is used to store

View File

@ -70,7 +70,7 @@ cdef class Matcher:
key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
return self.has_key(key)
def add(self, key, patterns, *, on_match=None, greedy: str=None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID
@ -162,8 +162,7 @@ cdef class Matcher:
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
key = self._normalize_key(key)
return key in self._patterns
return self._normalize_key(key) in self._patterns
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
@ -179,7 +178,7 @@ cdef class Matcher:
def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
"""Match a stream of documents, yielding them in turn.
docs (iterable): A stream of documents.
docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
batch_size (int): Number of documents to accumulate into a working set.
return_matches (bool): Yield the match lists along with the docs, making
results (doc, matches) tuples.
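
The updated `pipe` docstring notes that the stream may contain `Doc` or `Span` objects and that `return_matches=True` yields `(doc, matches)` tuples. A small usage sketch (the rule name and pattern are made up for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO", [[{"LOWER": "hello"}]])  # one rule, one token pattern
docs = nlp.pipe(["Hello world", "no match here"])
for doc, matches in matcher.pipe(docs, return_matches=True):
    print(doc.text, matches)  # matches are (match_id, start, end) tuples
```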

View File

@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
@ -51,17 +50,52 @@ def make_parser(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights are updated such that the scores assigned
to the set of optimal actions are increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
)
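
As the diff shows, `multitasks` is dropped from the factory's default config while `learn_tokens` and `min_action_freq` stay configurable. A minimal sketch of creating the component with those two settings overridden, in the same style as the tests in this commit (assuming a blank English pipeline):

```python
from spacy.lang.en import English

nlp = English()
# "multitasks" is no longer part of the default config; the remaining
# settings can still be overridden explicitly if needed.
parser = nlp.create_pipe("parser", config={"learn_tokens": False, "min_action_freq": 30})
parser.add_label("nsubj")
```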

View File

@ -62,6 +62,16 @@ def make_entity_linker(
incl_prior: bool,
incl_context: bool,
):
"""Construct an EntityLinker component.
model (Model[List[Doc], Floats2d]): A model that learns document vector
representations. Given a batch of Doc objects, it should return a single
array, with one row per item in the batch.
kb (KnowledgeBase): The knowledge-base to link entities to.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
"""
return EntityLinker(
nlp.vocab,
model,

View File

@ -75,8 +75,8 @@ class Morphologizer(Tagger):
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels_morph (dict): TODO:
labels_pos (dict): TODO:
labels_morph (dict): Mapping of morph + POS tags to morph labels.
labels_pos (dict): Mapping of morph + POS tags to POS tags.
DOCS: https://spacy.io/api/morphologizer#init
"""

View File

@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"multitasks": [],
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_NER_MODEL,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
@ -50,19 +47,40 @@ def make_ner(
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
multitasks: Iterable,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based EntityRecognizer component. The entity recognizer
identifies non-overlapping labelled spans of tokens.
The transition-based algorithm used encodes certain assumptions that are
effective for "traditional" named entity recognition tasks, but may not be
a good fit for every span identification problem. Specifically, the loss
function optimizes for whole entity accuracy, so if your inter-annotator
agreement on boundary tokens is low, the component will likely perform poorly
on your problem. The transition-based algorithm also assumes that the most
decisive information about your entities will be close to their initial tokens.
If your entities are long and characterised by tokens in their middle, the
component will likely do poorly on your task.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (list[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
"""
return EntityRecognizer(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=multitasks,
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
multitasks=[],
min_action_freq=1,
learn_tokens=False,
)
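
For the entity recognizer, the three settings are removed from the config entirely and hardcoded in the factory call, which is why the tests in this commit now create the pipe with an empty config. A minimal sketch of the updated usage:

```python
from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner", config={})  # no learn_tokens/min_action_freq/multitasks
ner.add_label("GPE")
```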
@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
TransitionSystem = BiluoPushDown
def add_multitask_objective(self, mt_component):
"""Register another component as a multi-task objective. Experimental."""
self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
"""Setup multi-task objective components. Experimental and internal."""
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels))

View File

@ -1,8 +1,9 @@
# cython: infer_types=True, profile=True, binding=True
from typing import List
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d
import warnings
from ..tokens.doc cimport Doc
@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):
def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
"""Construct a part-of-speech tagger component.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
the tag probabilities. The output vectors should match the number of tags
in size, and be normalized as probabilities (all scores between 0 and 1,
with the rows summing to 1).
"""
return Tagger(nlp.vocab, model, name)
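
The docstring spells out the model contract: one row of tag probabilities per token, with each row normalized to sum to 1. A toy sketch of outputs satisfying that contract (illustrative numbers, not produced by a real model):

```python
import numpy

doc_lengths = [3, 5]  # two toy "documents" with 3 and 5 tokens
n_tags = 4
# One Floats2d array per doc: shape (n_tokens, n_tags), rows summing to 1.
outputs = [numpy.full((n, n_tags), 1.0 / n_tags) for n in doc_lengths]
assert all(numpy.allclose(row.sum(), 1.0) for arr in outputs for row in arr)
```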

View File

@ -1,5 +1,6 @@
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d
import numpy
from .pipe import Pipe
@ -69,8 +70,22 @@ subword_features = true
default_score_weights={"cats_score": 1.0},
)
def make_textcat(
nlp: Language, name: str, model: Model, labels: Iterable[str]
nlp: Language,
name: str,
model: Model[List[Doc], List[Floats2d]],
labels: Iterable[str],
) -> "TextCategorizer":
"""Create a TextCategorizer compoment. The text categorizer predicts categories
over a whole document. It can learn one or more labels, and the labels can
be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
(i.e. zero or more labels may be true per doc). The multi-label setting is
controlled by the model instance that's provided.
model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
scores for each category.
labels (list): A list of categories to learn. If empty, the model infers the
categories from the data.
"""
return TextCategorizer(nlp.vocab, model, name, labels=labels)
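
A minimal sketch of creating the component with labels supplied up front, assuming the factory's default model config is used; per the docstring, `labels` can also be left empty so the categories are inferred from the data:

```python
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat", config={"labels": ["POSITIVE", "NEGATIVE"]})
```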

View File

@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
class Tok2Vec(Pipe):
"""Apply a "token-to-vector" model and set its outputs in the doc.tensor
attribute. This is mostly useful to share a single subnetwork between multiple
components, e.g. to have one embedding and CNN network shared between a
parser, tagger and NER.
In order to use the `Tok2Vec` predictions, subsequent components should use
the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
layer will read data from the `doc.tensor` attribute during prediction.
During training, the `Tok2Vec` component will save its prediction and backprop
callback for each batch, so that the subsequent components can backpropagate
to the shared weights. This implementation is used because it allows us to
avoid relying on object identity within the models to achieve the parameter
sharing.
"""
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
"""Initialize a tok2vec component.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
model (thinc.api.Model[List[Doc], List[Floats2d]]):
The Thinc Model powering the pipeline component. It should take
a list of Doc objects as input, and output a list of 2d float arrays.
name (str): The component instance name.
DOCS: https://spacy.io/api/tok2vec#init
@ -48,9 +65,18 @@ class Tok2Vec(Pipe):
self.cfg = {}
def add_listener(self, listener: "Tok2VecListener") -> None:
"""Add a listener for a downstream component. Usually internals."""
self.listeners.append(listener)
def find_listeners(self, model: Model) -> None:
"""Walk over a model, looking for layers that are Tok2vecListener
subclasses that have an upstream_name that matches this component.
Listeners can also set their upstream_name attribute to the wildcard
string '*' to match any `Tok2Vec`.
You're unlikely to ever need multiple `Tok2Vec` components, so it's
fine to leave your listeners' upstream_name on '*'.
"""
for node in model.walk():
if isinstance(node, Tok2VecListener) and node.upstream_name in (
"*",
@ -59,7 +85,8 @@ class Tok2Vec(Pipe):
self.add_listener(node)
def __call__(self, doc: Doc) -> Doc:
"""Add context-sensitive embeddings to the Doc.tensor attribute.
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
them to be used as features by downstream components.
doc (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
@ -205,11 +232,27 @@ class Tok2Vec(Pipe):
class Tok2VecListener(Model):
"""A layer that gets fed its answers from an upstream connection,
for instance from a component earlier in the pipeline.
The Tok2VecListener layer is used as a sublayer within a component such
as a parser, NER or text categorizer. Usually you'll have multiple listeners
connecting to a single upstream Tok2Vec component that's earlier in the
pipeline. The Tok2VecListener layers act as proxies, passing the predictions
from the Tok2Vec component into downstream components, and communicating
gradients back upstream.
"""
name = "tok2vec-listener"
def __init__(self, upstream_name: str, width: int) -> None:
"""
upstream_name (str): A string to identify the 'upstream' Tok2Vec component
to communicate with. The upstream name should either be the wildcard
string '*', or the name of the `Tok2Vec` component. You'll almost
never have multiple upstream Tok2Vec components, so the wildcard
string will almost always be fine.
width (int):
The width of the vectors produced by the upstream tok2vec component.
"""
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
self.upstream_name = upstream_name
self._batch_id = None
@ -217,15 +260,25 @@ class Tok2VecListener(Model):
self._backprop = None
@classmethod
def get_batch_id(cls, inputs) -> int:
def get_batch_id(cls, inputs: List[Doc]) -> int:
"""Calculate a content-sensitive hash of the batch of documents, to check
whether the next batch of documents is unexpected.
"""
return sum(sum(token.orth for token in doc) for doc in inputs)
def receive(self, batch_id: int, outputs, backprop) -> None:
"""Store a batch of training predictions and a backprop callback. The
predictions and callback are produced by the upstream Tok2Vec component,
and later will be used when the listener's component's model is called.
"""
self._batch_id = batch_id
self._outputs = outputs
self._backprop = backprop
def verify_inputs(self, inputs) -> bool:
"""Check that the batch of Doc objects matches the ones we have a
prediction for.
"""
if self._batch_id is None and self._outputs is None:
raise ValueError(Errors.E954)
else:
@ -237,6 +290,7 @@ class Tok2VecListener(Model):
def forward(model: Tok2VecListener, inputs, is_train: bool):
"""Supply the outputs from the upstream Tok2Vec component."""
if is_train:
model.verify_inputs(inputs)
return model._outputs, model._backprop
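
The `get_batch_id` method shown above hashes the batch content by summing token orth IDs, which is what `verify_inputs` later checks against. A stripped-down sketch of the same idea over plain integer IDs (illustrative only):

```python
# Content-sensitive batch id: summing per-token ids makes it cheap to detect
# that a listener was handed a different batch than the one it has outputs for.
def get_batch_id(batches_of_token_ids):
    return sum(sum(tokens) for tokens in batches_of_token_ids)

batch = [[101, 202, 303], [404, 505]]
assert get_batch_id(batch) == get_batch_id([list(t) for t in batch])  # same content
assert get_batch_id(batch) != get_batch_id(batch[:1])                 # different batch
```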

View File

@ -426,7 +426,7 @@ class Scorer:
f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
}
if len(labels) == 2 and not multi_label and positive_label:
positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
results[f"{attr}_score"] = positive_label_f
results[f"{attr}_score_desc"] = f"F ({positive_label})"
elif not multi_label:

View File

@ -15,5 +15,7 @@ def morphology():
def test_morphology_pickle_roundtrip(morphology):
b = pickle.dumps(morphology)
reloaded_morphology = pickle.loads(b)
assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
assert feat == "Feat1=Val1|Feat2=Val2"
feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
assert feat == "Feat3=Val3|Feat4=Val4"

View File

@ -144,10 +144,7 @@ def test_accept_blocked_token():
# 1. test normal behaviour
nlp1 = English()
doc1 = nlp1("I live in New York")
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner1 = nlp1.create_pipe("ner", config=config)
assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]
@ -166,10 +163,7 @@ def test_accept_blocked_token():
# 2. test blocking behaviour
nlp2 = English()
doc2 = nlp2("I live in New York")
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
@ -224,10 +218,7 @@ def test_overwrite_token():
assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
# Check that a new ner can overwrite O
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp.create_pipe("ner", config=config)
ner2.moves.add_action(5, "")
ner2.add_label("GPE")

View File

@ -1,8 +1,7 @@
import pytest
from spacy import util, registry
from spacy.lang.en import English
from spacy.lookups import Lookups, load_lookups
from spacy.lookups import Lookups
from ..util import make_tempdir

View File

@ -1,10 +1,8 @@
import pytest
from spacy import util
from spacy.gold import Example
from spacy.lang.en import English
from spacy.language import Language
from spacy.symbols import POS, NOUN
from ..util import make_tempdir

View File

@ -117,9 +117,7 @@ def test_overfitting_IO():
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
# Test scoring
scores = nlp.evaluate(
train_examples, scorer_cfg={"positive_label": "POSITIVE"}
)
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
assert scores["cats_micro_f"] == 1.0
assert scores["cats_score"] == 1.0
assert "cats_score_desc" in scores

View File

@ -1,11 +1,9 @@
import pytest
import random
from spacy import util
from spacy.gold import Example
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.symbols import POS, VERB
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lookups import Lookups

View File

@ -6,8 +6,7 @@ from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lookups import Lookups
from spacy.symbols import ORTH, LEMMA, POS, VERB
from spacy.symbols import ORTH, LEMMA, POS
def test_issue1061():

View File

@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer):
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
nlp = Language()
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner = nlp.create_pipe("ner", config=config)
example = Example.from_dict(
Doc(ner.vocab, words=["word"]),

View File

@ -157,7 +157,11 @@ def test_issue3540(en_vocab):
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
attrs = {
"POS": ["PROPN", "PROPN"],
"LEMMA": ["New", "York"],
"DEP": ["pobj", "compound"],
}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]

View File

@ -138,10 +138,7 @@ def test_issue4042_bug2():
if not output_dir.exists():
output_dir.mkdir()
ner1.to_disk(output_dir)
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner2 = nlp1.create_pipe("ner", config=config)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
@ -303,10 +300,7 @@ def test_issue4313():
beam_width = 16
beam_density = 0.0001
nlp = English()
config = {
"learn_tokens": False,
"min_action_freq": 30,
}
config = {}
ner = nlp.create_pipe("ner", config=config)
ner.add_label("SOME_LABEL")
ner.begin_training([])

View File

@ -185,20 +185,16 @@ def test_issue4725_1():
vocab = Vocab(vectors_name="test_vocab_add_vector")
nlp = English(vocab=vocab)
config = {
"learn_tokens": False,
"min_action_freq": 342,
"update_with_oracle_cut_size": 111,
}
ner = nlp.create_pipe("ner", config=config)
with make_tempdir() as tmp_path:
with (tmp_path / "ner.pkl").open("wb") as file_:
pickle.dump(ner, file_)
assert ner.cfg["min_action_freq"] == 342
assert ner.cfg["update_with_oracle_cut_size"] == 111
with (tmp_path / "ner.pkl").open("rb") as file_:
ner2 = pickle.load(file_)
assert ner2.cfg["min_action_freq"] == 342
assert ner2.cfg["update_with_oracle_cut_size"] == 111

View File

@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid():
config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
with pytest.raises(ValueError):
English.from_config(config)
def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer"
class CustomTokenizer:
"""Dummy "tokenizer" that splits on spaces and adds prefix to each word."""
def __init__(self, nlp, prefix):
self.vocab = nlp.vocab
self.prefix = prefix
def __call__(self, text):
words = [f"{self.prefix}{word}" for word in text.split(" ")]
return Doc(self.vocab, words=words)
@registry.tokenizers(name)
def custom_create_tokenizer(prefix: str = "_"):
def create_tokenizer(nlp):
return CustomTokenizer(nlp, prefix=prefix)
return create_tokenizer
config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
nlp = English.from_config(config)
doc = nlp("hello world")
assert [t.text for t in doc] == ["_hello", "_world"]
doc = list(nlp.pipe(["hello world"]))[0]
assert [t.text for t in doc] == ["_hello", "_world"]

View File

@ -3,7 +3,7 @@ title: Model Architectures
teaser: Pre-defined model architectures included with the core library
source: spacy/ml/models
menu:
- ['Tok2Vec', 'tok2vec']
- ['Tok2Vec', 'tok2vec-arch']
- ['Transformers', 'transformers']
- ['Parser & NER', 'parser']
- ['Tagging', 'tagger']
@ -70,6 +70,47 @@ blog post for background.
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |
### spacy.Tok2VecListener.v1 {#Tok2VecListener}
> #### Example config
>
> ```ini
> [components.tok2vec]
> factory = "tok2vec"
>
> [components.tok2vec.model]
> @architectures = "spacy.HashEmbedCNN.v1"
> width = 342
>
> [components.tagger]
> factory = "tagger"
>
> [components.tagger.model]
> @architectures = "spacy.Tagger.v1"
>
> [components.tagger.model.tok2vec]
> @architectures = "spacy.Tok2VecListener.v1"
> width = ${components.tok2vec.model:width}
> ```
A listener is used as a sublayer within a component such as a
[`DependencyParser`](/api/dependencyparser),
[`EntityRecognizer`](/api/entityrecognizer) or
[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple
listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component
that's earlier in the pipeline. The listener layers act as **proxies**, passing
the predictions from the `Tok2Vec` component into downstream components, and
communicating gradients back upstream.
Instead of defining its own `Tok2Vec` instance, a model architecture like
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
argument that connects to the shared `tok2vec` component in the pipeline.
| Name | Type | Description |
| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
<!-- TODO: check example config -->
@ -195,7 +236,7 @@ and residual connections.
> depth = 4
> ```
Encode context using bidirectonal LSTM layers. Requires
Encode context using bidirectional LSTM layers. Requires
[PyTorch](https://pytorch.org).
| Name | Type | Description |
@ -237,8 +278,6 @@ architectures into your training config.
### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}
<!-- TODO: description -->
> #### Example Config
>
> ```ini
@ -250,10 +289,41 @@ architectures into your training config.
> @layers = "reduce_mean.v1"
> ```
Create a `TransformerListener` layer, which will connect to a
[`Transformer`](/api/transformer) component earlier in the pipeline. The layer
takes a list of [`Doc`](/api/doc) objects as input, and produces a list of
2-dimensional arrays as output, with each array having one row per token. Most
spaCy models expect a sublayer with this signature, making it easy to connect
them to a transformer model via this sublayer. Transformer models usually
operate over wordpieces, which usually don't align one-to-one against spaCy
tokens. The layer therefore requires a reduction operation in order to calculate
a single token vector given zero or more wordpiece vectors.
| Name | Type | Description |
| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- |
| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
> #### Example Config
>
> ```ini
> # TODO:
> ```
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
**not** allow multiple components to share the transformer weights, and does
**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
object, but it's a **simpler solution** if you only need the transformer within
one component.
| Name | Type | Description |
| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
## Parser & NER architectures {#parser}
@ -418,7 +488,7 @@ network has an internal CNN Tok2Vec layer and uses attention.
> ```
| Name | Type | Description |
| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
| `width` | int | Output dimension of the feature encoding step. |
@ -427,10 +497,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `dropout` | float | The dropout rate. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatCNN.v1 {#TextCatCNN}
@ -458,13 +526,11 @@ vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.
| Name | Type | Description |
| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
| --------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
### spacy.TextCatBOW.v1 {#TextCatBOW}
@ -483,16 +549,16 @@ An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
| Name | Type | Description |
| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. |
| `no_output_layer` | bool | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
| `nO` | int | Output dimension, determined by the number of different labels. |
If the `nO` dimension is not set, the TextCategorizer component will set it when
`begin_training` is called.
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
<!-- TODO:
### spacy.TextCatLowData.v1 {#TextCatLowData}
-->
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate` objects](/api/kb/#candidate_init).
The default `CandidateGenerator` simply uses the text of a mention to find its
potential aliases in the Knowledgebase. Note that this function is
case-dependent.
plausible [`Candidate` objects](/api/kb/#candidate_init). The default
`CandidateGenerator` simply uses the text of a mention to find its potential
aliases in the `KnowledgeBase`. Note that this function is case-dependent.

View File

@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
## Pretrain {#pretrain new="2.1" tag="experimental"}
<!-- TODO: document new pretrain command and link to new pretraining docs -->
Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
components on [raw text](/api/data-formats#pretrain), using an approximate
language-modeling objective. Specifically, we load pretrained vectors, and train
a component like a CNN, BiLSTM, etc to predict vectors which match the
@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
then include a **path to one of these pretrained weights files** in your
[training config](/usage/training#config) as the `init_tok2vec` setting when you
train your model. This technique may be especially helpful if you have little
labelled data.
labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
for more info.
<Infobox title="Changed in v3.0" variant="warning">
@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
| `output_dir` | positional | Directory to write models to on each epoch. |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
| `--resume-path`, `-r` | option | TODO: |
| `--epoch-resume`, `-er` | option | TODO: |
| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
| `--help`, `-h` | flag | Show help message and available arguments. |
| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |

View File

@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
[`spacy train`](/api/cli#train). They use
[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
hood. For details on how to use training configs, see the
[usage documentation](/usage/training#config).
<!-- TODO: add details on getting started and init config -->
[usage documentation](/usage/training#config). To get started with a blank
config or fill a partial config with all defaults, you can use the
[`init config`](/api/cli#init-config) command.
> #### What does the @ mean?
>
@ -52,8 +52,6 @@ your config and check that it's valid, you can run the
</Infobox>
<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
### nlp {#config-nlp tag="section"}
> #### Example
@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain).
<!-- TODO: complete -->
| Name | Type | Description | Default |
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
| `max_epochs` | int | Maximum number of epochs. | `1000` |

View File

@ -5,4 +5,194 @@ tag: class
source: spacy/matcher/dependencymatcher.pyx
---
TODO: write
The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
using the
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
It requires a pretrained [`DependencyParser`](/api/parser) or other component
that sets the `Token.dep` attribute.
## Pattern format {#patterns}
> ```json
> ### Example
> [
> {
> "SPEC": {"NODE_NAME": "founded"},
> "PATTERN": {"ORTH": "founded"}
> },
> {
> "SPEC": {
> "NODE_NAME": "founder",
> "NBOR_RELOP": ">",
> "NBOR_NAME": "founded"
> },
> "PATTERN": {"DEP": "nsubj"}
> },
> {
> "SPEC": {
> "NODE_NAME": "object",
> "NBOR_RELOP": ">",
> "NBOR_NAME": "founded"
> },
> "PATTERN": {"DEP": "dobj"}
> }
> ]
> ```
A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
with each dictionary describing a node to match. Each pattern should have the
following top-level keys:
| Name | Type | Description |
| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
The `SPEC` includes the following fields:
| Name | Type | Description |
| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
Create a rule-based `DependencyMatcher`.
> #### Example
>
> ```python
> from spacy.matcher import DependencyMatcher
> matcher = DependencyMatcher(nlp.vocab)
> ```
| Name | Type | Description |
| ------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> #### Example
>
> ```python
> from spacy.matcher import DependencyMatcher
>
> matcher = DependencyMatcher(nlp.vocab)
> pattern = [
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
> ]
> matcher.add("Founder", [pattern])
> doc = nlp("Bill Gates founded Microsoft.")
> matches = matcher(doc)
> ```
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
Get the number of rules (edges) added to the dependency matcher. Note that this
only returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
> #### Example
>
> ```python
> matcher = DependencyMatcher(nlp.vocab)
> assert len(matcher) == 0
> pattern = [
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
> ]
> matcher.add("Rule", [pattern])
> assert len(matcher) == 1
> ```
| Name | Type | Description |
| ----------- | ---- | -------------------- |
| **RETURNS** | int | The number of rules. |
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
Check whether the matcher contains rules for a match ID.
> #### Example
>
> ```python
> matcher = DependencyMatcher(nlp.vocab)
> assert "Rule" not in matcher
> matcher.add("Rule", [pattern])
> assert "Rule" in matcher
> ```
| Name | Type | Description |
| ----------- | ---- | ----------------------------------------------------- |
| `key` | str | The match ID. |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## DependencyMatcher.add {#add tag="method"}
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
optional callback function to act on the matches. The callback function will
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
exists for the given ID, the patterns will be extended. An `on_match` callback
will be overwritten.
> #### Example
>
> ```python
> def on_match(matcher, doc, id, matches):
> print('Matched!', matches)
>
> matcher = DependencyMatcher(nlp.vocab)
> matcher.add("TEST_PATTERNS", patterns)
> ```
| Name | Type | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | The patterns to add for the given key. Each pattern is a list of dicts, where each dict describes a node to match (see the pattern format above). |
| _keyword-only_ | | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## DependencyMatcher.remove {#remove tag="method"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
exist.
> #### Example
>
> ```python
> matcher.add("Rule", [pattern]])
> assert "Rule" in matcher
> matcher.remove("Rule")
> assert "Rule" not in matcher
> ```
| Name | Type | Description |
| ----- | ---- | ------------------------- |
| `key` | str | The ID of the match rule. |
## DependencyMatcher.get {#get tag="method"}
Retrieve the pattern stored for a key. Returns the rule as an
`(on_match, patterns)` tuple containing the callback and available patterns.
> #### Example
>
> ```python
> matcher.add("Rule", [pattern], on_match=on_match)
> on_match, patterns = matcher.get("Rule")
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------------------------------- |
| `key` | str | The ID of the match rule. |
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

View File

@ -8,6 +8,23 @@ api_string_name: parser
api_trainable: true
---
A transition-based dependency parser component. The dependency parser jointly
learns sentence segmentation and labelled dependency parsing, and can optionally
learn to merge tokens that had been over-segmented by the tokenizer. The parser
uses a variant of the **non-monotonic arc-eager transition-system** described by
[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
the addition of a "break" transition to perform the sentence segmentation.
[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
dependency transformation** is used to allow the parser to predict
non-projective parses.
The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the
current state. The weights are updated such that the scores assigned to the set
of optimal actions are increased, while scores assigned to other actions are
decreased. Note
that more than one action may be optimal for a given state.
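As a quick illustration of the component's output (a minimal sketch, assuming `nlp` is a trained pipeline that includes a parser):

```python
# Minimal sketch, assuming a trained pipeline with a parser component.
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for token in doc:
    # The parser assigns a dependency label and a syntactic head to each token
    print(token.text, token.dep_, token.head.text)
# The parser also sets sentence boundaries
print([sent.text for sent in doc.sents])
```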
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -23,17 +40,20 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
> config = {
> "moves": None,
> # TODO: rest
> "update_with_oracle_cut_size": 100,
> "learn_tokens": False,
> "min_action_freq": 30,
> "model": DEFAULT_PARSER_MODEL,
> }
> nlp.add_pipe("parser", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | | `None` |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. |
## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@ -8,6 +8,18 @@ api_string_name: ner
api_trainable: true
---
A transition-based named entity recognition component. The entity recognizer
identifies **non-overlapping labelled spans** of tokens. The transition-based
algorithm used encodes certain assumptions that are effective for "traditional"
named entity recognition tasks, but may not be a good fit for every span
identification problem. Specifically, the loss function optimizes for **whole
entity accuracy**, so if your inter-annotator agreement on boundary tokens is
low, the component will likely perform poorly on your problem. The
transition-based algorithm also assumes that the most decisive information about
your entities will be close to their initial tokens. If your entities are long
and characterized by tokens in their middle, the component will likely not be a
good fit for your task.
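As a quick illustration (a minimal sketch, assuming `nlp` is a trained pipeline with an entity recognizer):

```python
# Minimal sketch, assuming a trained pipeline with an "ner" component.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    # Entities are non-overlapping labelled Span objects
    print(ent.text, ent.label_)
```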
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -23,17 +35,16 @@ architectures and their arguments and hyperparameters.
> from spacy.pipeline.ner import DEFAULT_NER_MODEL
> config = {
> "moves": None,
> # TODO: rest
> "update_with_oracle_cut_size": 100,
> "model": DEFAULT_NER_MODEL,
> }
> nlp.add_pipe("ner", config=config)
> ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | | `None` |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python
@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
| _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | |
| `learn_tokens` | bool | |
| `min_action_freq` | int | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

View File

@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects.
Update the models in the pipeline.
<Infobox variant="warning" title="Changed in v3.0">
The `Language.update` method now takes a batch of [`Example`](/api/example)
objects instead of the raw texts and annotations or `Doc` and `GoldParse`
objects. An [`Example`](/api/example) streamlines how data is passed around. It
stores two `Doc` objects: one for holding the gold-standard reference data, and
one for holding the predictions of the pipeline.
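For instance, a minimal sketch of constructing `Example` objects from raw annotations and passing them to `nlp.update` (assuming a loaded `nlp` pipeline, and that `Example` is importable from `spacy.gold` in this nightly):

```python
from spacy.gold import Example

# Hypothetical entity annotations; the offsets refer to the text below
doc = nlp.make_doc("Bill Gates founded Microsoft.")
annotations = {"entities": [(0, 10, "PERSON"), (19, 28, "ORG")]}
example = Example.from_dict(doc, annotations)
losses = nlp.update([example])
```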
For most use cases, you shouldn't have to write your own training scripts
anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file
and custom registered functions if needed. See the
[training documentation](/usage/training) for details.
</Infobox>
> #### Example
>
> ```python
@ -253,7 +268,7 @@ Update the models in the pipeline.
| Name | Type | Description |
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |

View File

@ -9,6 +9,28 @@ api_string_name: lemmatizer
api_trainable: false
---
Component for assigning base forms to tokens using rules based on part-of-speech
tags, or lookup tables. Functionality to train the component is coming soon.
Different [`Language`](/api/language) subclasses can implement their own
lemmatizer components via
[language-specific factories](/usage/processing-pipelines#factories-language).
The default data used is provided by the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
extension package.
<Infobox variant="warning" title="New in v3.0">
As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be
added to your pipeline, and not a hidden part of the vocab that runs behind the
scenes. This makes it easier to customize how lemmas should be assigned in your
pipeline.
If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to
be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning
tags is available in the pipeline and runs _before_ the lemmatizer.
</Infobox>
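A minimal sketch of adding the component (assuming a pipeline that already assigns part-of-speech tags and that the `spacy-lookups-data` tables are installed):

```python
# Rule-based lemmatization needs POS tags, so a tagger (or similar component)
# has to run earlier in the pipeline.
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
doc = nlp("She was reading the papers.")
print([token.lemma_ for token in doc])
```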
## Config and implementation
The default config is defined by the pipeline component factory and describes
@ -29,7 +51,7 @@ lemmatizers, see the
| Setting | Type | Description | Default |
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
@ -56,13 +78,13 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". |
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
| overwrite | bool | Whether to overwrite existing lemmas. |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}

View File

@ -5,6 +5,82 @@ tag: class
source: spacy/matcher/matcher.pyx
---
The `Matcher` lets you find words and phrases using rules describing their token
attributes. Rules can refer to token annotations (like the text or
part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
tokens in context. For in-depth examples and workflows for combining rules and
statistical models, see the [usage guide](/usage/rule-based-matching) on
rule-based matching.
## Pattern format {#patterns}
> ```json
> ### Example
> [
> {"LOWER": "i"},
> {"LEMMA": {"IN": ["like", "love"]}},
> {"POS": "NOUN", "OP": "+"}
> ]
> ```
A pattern added to the `Matcher` consists of a list of dictionaries. Each
dictionary describes **one token** and its attributes. The available token
pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
| Attribute | Type |  Description |
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
| `ORTH` | str | The exact verbatim text of a token. |
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
| `LOWER` | str | The lowercase form of the token text. |
|  `LENGTH` | int | The length of the token text. |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
| `ENT_TYPE` | str | The token's entity label. |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
Operators and quantifiers define **how often** a token pattern should be
matched:
> ```json
> ### Example
> [
> {"POS": "ADJ", "OP": "*"},
> {"POS": "NOUN", "OP": "+"}
> ]
> ```
| OP | Description |
| --- | ---------------------------------------------------------------- |
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. |
Token patterns can also map to a **dictionary of properties** instead of a
single value to indicate whether the expected value is a member of a list or how
it compares to another value.
> ```json
> ### Example
> [
> {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
> {"POS": "PROPN", "LENGTH": {">=": 10}},
> ]
> ```
| Attribute | Type | Description |
| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
| `IN` | any | Attribute value is a member of a list. |
| `NOT_IN` | any | Attribute value is _not_ a member of a list. |
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal to, greater than or equal to, less than or equal to, greater than or less than the given value. |
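A sketch of these dictionary-valued attributes in use (assuming `nlp` is a pipeline that assigns part-of-speech tags and lemmas):

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [
    {"LEMMA": {"IN": ["like", "love", "enjoy"]}},  # lemma is a member of the list
    {"POS": "PROPN", "LENGTH": {">=": 10}},        # proper noun of 10+ characters
]
matcher.add("PREFERENCE", [pattern])
doc = nlp("I really enjoy Philadelphia.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```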
## Matcher.\_\_init\_\_ {#init tag="method"}
Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.
| Name | Type | Description |
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A stream of documents. |
| `docs` | iterable | A stream of documents or spans. |
| `batch_size` | int | The number of documents to accumulate into a working set. |
| `return_matches` <Tag variant="new">2.1</Tag> | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.
## Matcher.add {#add tag="method" new="2"}
Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
callback function to act on the matches. The callback function will receive the
arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
the given ID, the patterns will be extended. An `on_match` callback will be
overwritten.
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
optional callback function to act on the matches. The callback function will
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
exists for the given ID, the patterns will be extended. An `on_match` callback
will be overwritten.
> #### Example
>
@ -142,11 +218,12 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
</Infobox>
| Name | Type | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| _keyword-only_ | | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
## Matcher.remove {#remove tag="method" new="2"}

View File

@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `labels_morph` | dict | |
| `labels_pos` | dict | |
| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@ -9,7 +9,8 @@ new: 2
The `PhraseMatcher` lets you efficiently match large terminology lists. While
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
objects.
objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
examples.
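A minimal sketch (assuming a loaded pipeline as `nlp`):

```python
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel"]
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer, so it's cheap
patterns = [nlp.make_doc(term) for term in terms]
matcher.add("POLITICIANS", patterns)
doc = nlp("Angela Merkel met Barack Obama in Berlin.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```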
## PhraseMatcher.\_\_init\_\_ {#init tag="method"}

View File

@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
| `set_morphology` | bool | Whether to set morphological features. | `False` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
@ -59,9 +59,9 @@ shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description |
| ---------------- | ------- | ------------------------------------------------------------------------------------------- |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `set_morphology` | bool | Whether to set morphological features. |

View File

@ -9,6 +9,12 @@ api_string_name: textcat
api_trainable: true
---
The text categorizer predicts **categories over a whole document**. It can learn
one or more labels, and the labels can be mutually exclusive (i.e. one true
label per document) or non-mutually exclusive (i.e. zero or more labels may be
true per document). The multi-label setting is controlled by the model instance
that's provided.
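A minimal sketch of setting up the component (the labels shown are hypothetical; the scores are only meaningful after training):

```python
# Add the component and register hypothetical labels
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
# After training, the predicted scores per label are available on the Doc:
# doc = nlp("This is a terrific library!")
# print(doc.cats)  # e.g. {"POSITIVE": 0.95, "NEGATIVE": 0.05}
```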
## Config and implementation {#config}
The default config is defined by the pipeline component factory and describes
@ -30,9 +36,9 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- |
| `labels` | `Iterable[str]` | The labels to use. | `[]` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- |
| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and
| _keyword-only_ | | |
| `labels` | `Iterable[str]` | The labels to use. |
<!-- TODO move to config page
### Architectures {#architectures new="2.1"}
Text classification models can be used to solve a wide variety of problems.
Differences in text length, number of labels, difficulty, and runtime
performance constraints mean that no single algorithm performs well on all types
of problems. To handle a wider variety of problems, the `TextCategorizer` object
allows configuration of its model architecture, using the `architecture` keyword
argument.
| Name | Description |
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `"ensemble"` | **Default:** Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The "ngram_size" and "attr" arguments can be used to configure the feature extraction for the bag-of-words model. |
| `"simple_cnn"` | A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. |
| `"bow"` | An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments `ngram_size` and `attr`. For instance, `ngram_size=3` and `attr="lower"` would give lower-cased unigram, trigram and bigram features. 2, 3 or 4 are usually good choices of ngram size. |
-->
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.

View File

@ -8,7 +8,20 @@ api_string_name: tok2vec
api_trainable: true
---
<!-- TODO: intro describing component -->
Apply a "token-to-vector" model and set its outputs in the `Doc.tensor`
attribute. This is mostly useful to **share a single subnetwork** between
multiple components, e.g. to have one embedding and CNN network shared between a
[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and
[`EntityRecognizer`](/api/entityrecognizer).
In order to use the `Tok2Vec` predictions, subsequent components should use the
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec
subnetwork of their model. This layer will read data from the `doc.tensor`
attribute during prediction. During training, the `Tok2Vec` component will save
its prediction and backprop callback for each batch, so that the subsequent
components can backpropagate to the shared weights. This implementation is used
because it allows us to avoid relying on object identity within the models to
achieve the parameter sharing.
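For example (a sketch, assuming `nlp` is a trained pipeline that includes a `tok2vec` component):

```python
# Sketch: assumes a trained pipeline with a "tok2vec" component.
doc = nlp("This is a sentence.")
# The component writes its context-sensitive vectors to Doc.tensor, where
# downstream components connected via Tok2VecListener read them during prediction.
print(doc.tensor.shape)  # (number of tokens, embedding width)
```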
## Config and implementation {#config}
@ -28,8 +41,8 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and
## Tok2Vec.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
Apply the pipe to one document and add context-sensitive embeddings to the
`Doc.tensor` attribute, allowing them to be used as features by downstream
components. The document is modified in place, and returned. This usually
happens under the hood when the `nlp` object is called on a text and all
pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the
[`predict`](/api/tok2vec#predict) and
[`set_annotations`](/api/tok2vec#set_annotations) methods.

View File

@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
<!-- TODO: intro and also describe signature of functions -->
<!-- TODO: intro -->
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
<!-- TODO: complete table -->
| Name | Type | Description |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `tolerance` | float | |
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `seqs` | `Iterable[Any]` | The sequences to minibatch. |
| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `tolerance` | float | What percentage of the size to allow batches to exceed. |
| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
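The same strategy can also be called directly in Python. A sketch, assuming the underlying function is importable from `spacy.gold.batchers` in this nightly:

```python
from spacy.gold.batchers import minibatch_by_words

# Toy "sequences": plain lists standing in for Doc or Example objects
seqs = [["tok"] * n for n in (5, 12, 8, 30, 3)]
batches = minibatch_by_words(seqs, size=20, tolerance=0.2, discard_oversize=False)
for batch in batches:
    print([len(seq) for seq in batch])
```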
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
<!-- TODO: -->
> #### Example config
>
> ```ini
@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null
> ```
<!-- TODO: complete table -->
Create a batcher that creates batches of the specified size.
| Name | Type | Description |
| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
<!-- TODO: -->
> #### Example config
>
> ```ini
> [training.batcher]
> @batchers = "batch_by_words.v1"
> @batchers = "batch_by_padded.v1"
> size = 100
> buffer = TODO:
> buffer = 256
> discard_oversize = false
> get_length = null
> ```
Minibatch a sequence by the size of padded batches that would result, with
sequences binned by length within a window. The padded size is defined as the
maximum length of sequences within the batch multiplied by the number of
sequences in the batch.
| Name | Type | Description |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `buffer` | int | |
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. |
| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
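As a worked example of the padded-size calculation described above (plain Python, not spaCy internals):

```python
# Padded size = length of the longest sequence in the batch * number of sequences
lengths = [4, 7, 5]
padded_size = max(lengths) * len(lengths)
assert padded_size == 21  # every sequence is padded to length 7
```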
## Training data and alignment {#gold source="spacy/gold"}

View File

@ -25,8 +25,15 @@ work out-of-the-box.
</Infobox>
This pipeline component lets you use transformer models in your pipeline. The
component assigns the output of the transformer to the Doc's extension
This pipeline component lets you use transformer models in your pipeline.
Supports all models that are available via the
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
Usually you will connect subsequent components to the shared transformer using
the [TransformerListener](/api/architectures#TransformerListener) layer. This
works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
[Tok2VecListener](/api/architectures#Tok2VecListener) sublayer.
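A minimal sketch of accessing the shared output (assuming `spacy-transformers` is installed and `nlp` is a transformer-based pipeline):

```python
# Sketch: assumes a pipeline that includes a "transformer" component.
doc = nlp("Apple shares rose on the news.")
# The raw transformer output is stored in a custom extension attribute
print(doc._.trf_data.tensors[0].shape)
```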
The component assigns the output of the transformer to the `Doc`'s extension
attributes. We also calculate an alignment between the word-piece tokens and the
spaCy tokenization, so that we can use the last hidden states to set the
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
@ -54,10 +61,10 @@ architectures and their arguments and hyperparameters.
> ```
| Setting | Type | Description | Default |
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) |
```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@ -86,15 +93,19 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
> trf = Transformer(nlp.vocab, model)
> ```
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe).
Construct a `Transformer` component. One or more subsequent spaCy components can
use the transformer outputs as features in its model, with gradients
backpropagated to the single shared weights. The activations from the
transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension
attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description |
| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. |
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. |
| _keyword-only_ | | |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
@ -184,7 +195,10 @@ Apply the pipeline's model to a batch of docs, without modifying them.
## Transformer.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed scores.
Assign the extracted features to the Doc objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `annotation_setter`
callback is then called, if provided.
> #### Example
>
@ -201,8 +215,19 @@ Modify a batch of documents, using pre-computed scores.
## Transformer.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating the
pipe's model. Delegates to [`predict`](/api/transformer#predict).
Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec)
component, the `Transformer` component is unusual in that it does not receive
"gold standard" annotations to calculate a weight update. The optimal output of
the transformer data is unknown; it's a hidden layer inside the network that is
updated by backpropagating from output layers.
The `Transformer` component therefore does **not** perform a weight update
during its own `update` method. Instead, it runs its transformer model and
communicates the output and the backpropagation callback to any **downstream
components** that have been connected to it via the
[TransformerListener](/api/architectures#TransformerListener) sublayer. If there
are multiple listeners, the last layer will actually backprop to the transformer
and call the optimizer, while the others simply increment the gradients.
> #### Example
>
@ -213,8 +238,8 @@ pipe's model. Delegates to [`predict`](/api/transformer#predict).
> ```
| Name | Type | Description |
| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
@ -394,21 +419,23 @@ Split a `TransformerData` object that represents a batch into a list with one
| ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | |
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
<!-- TODO: details on what this is for -->
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a lists of [`Span`](/api/span) objects for each doc, to be processed by
the transformer. The returned spans can overlap. Span getters can be referenced
in the config's `[components.transformer.model.get_spans]` block to customize
the sequences processed by the transformer. You can also register custom span
getters using the `@registry.span_getters` decorator.
the transformer. This is used to manage long documents, by cutting them into
smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You
can also register custom span getters using the `@spacy.registry.span_getters`
decorator.
> #### Example
>
> ```python
> @registry.span_getters("sent_spans.v1")
> @spacy.registry.span_getters("sent_spans.v1")
> def configure_get_sent_spans() -> Callable:
>     def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
>         return [list(doc.sents) for doc in docs]
@ -421,15 +448,55 @@ getters using the `@registry.span_getters` decorator.
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. |
The following built-in functions are available:
### doc_spans.v1 {#doc_spans tag="registered function"}
<!-- TODO: finish API docs -->
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "doc_spans.v1"
> ```
| Name | Description |
| ------------------ | ------------------------------------------------------------------ |
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
| `strided_spans.v1` | |
Create a span getter that uses the whole document as its spans. This is the best
approach if your [`Doc`](/api/doc) objects already refer to relatively short
texts.
### sent_spans.v1 {#sent_spans tag="registered function"}
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "sent_spans.v1"
> ```
Create a span getter that uses sentence boundary markers to extract the spans.
This requires sentence boundaries to be set (e.g. by the
[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches,
depending on the sentence lengths. However, it does provide the transformer with
more meaningful windows to attend over.
### strided_spans.v1 {#strided_spans tag="registered function"}
> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "strided_spans.v1"
> window = 128
> stride = 96
> ```
Create a span getter for strided spans. If you set the `window` and `stride` to
the same value, the spans will cover each token once. Setting `stride` lower
than `window` will allow for an overlap, so that some tokens are counted twice.
This can be desirable, because it allows all tokens to have both a left and
right context.
| Name | Type | Description |
| --------- | ---- | ---------------- |
|  `window` | int | The window size. |
| `stride` | int | The stride size. |
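A small worked example of how the window and stride interact (plain Python, not the spaCy internals):

```python
# With window=128 and stride=96, consecutive spans overlap by 32 tokens,
# so tokens near a span boundary still get context from both sides.
window, stride = 128, 96
doc_length = 300
spans = [(start, min(start + window, doc_length))
         for start in range(0, doc_length, stride)]
print(spans)  # [(0, 128), (96, 224), (192, 300), (288, 300)]
```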
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

View File

@ -1,54 +1,88 @@
The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc`
object owns the **sequence of tokens** and all their annotations. The `Vocab`
object owns a set of **look-up tables** that make common information available
across documents. By centralizing strings, word vectors and lexical attributes,
we avoid storing multiple copies of this data. This saves memory, and ensures
there's a **single source of truth**.
The central data structures in spaCy are the [`Language`](/api/language) class,
the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class
is used to process a text and turn it into a `Doc` object. It's typically stored
as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and
all their annotations. By centralizing strings, word vectors and lexical
attributes in the `Vocab`, we avoid storing multiple copies of this data. This
saves memory, and ensures there's a **single source of truth**.
Text annotations are also designed to allow a single source of truth: the `Doc`
object owns the data, and `Span` and `Token` are **views that point into it**.
The `Doc` object is constructed by the `Tokenizer`, and then **modified in
place** by the components of the pipeline. The `Language` object coordinates
these components. It takes raw text and sends it through the pipeline, returning
an **annotated document**. It also orchestrates training and serialization.
object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are
**views that point into it**. The `Doc` object is constructed by the
[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components
of the pipeline. The `Language` object coordinates these components. It takes
raw text and sends it through the pipeline, returning an **annotated document**.
It also orchestrates training and serialization.
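For example (a minimal sketch):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("Bill Gates founded Microsoft.")
span = doc[0:2]           # a Span is a view into the Doc
token = doc[2]            # a Token is a view into the Doc
assert span.text == "Bill Gates"
assert token.doc is doc   # views point back to the single source of truth
```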
<!-- TODO: update architecture and tables below to match sidebar in API docs etc. -->
<!-- TODO: update graphic -->
![Library architecture](../../images/architecture.svg)
### Container objects {#architecture-containers}
| Name | Description |
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
| [`Span`](/api/span) | A slice from a `Doc` object. |
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
### Processing pipeline {#architecture-pipeline}
The processing pipeline consists of one or more **pipeline components** that are
called on the `Doc` in order. The tokenizer runs before the components. Pipeline
components can be added using [`Language.add_pipe`](/api/language#add_pipe).
They can contain a statistical model and trained weights, or only make
rule-based modifications to the `Doc`. spaCy provides a range of built-in
components for different language processing tasks and also allows adding
[custom components](/usage/processing-pipelines#custom-components).
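For example (a minimal sketch using a rule-based component that needs no training):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # components are added by their string names
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```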
![The processing pipeline](../../images/pipeline.svg)
| Name | Description |
| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. |
| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. |
| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. |
| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. |
| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
| [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. |
| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
### Matchers {#architecture-matchers}
Matchers help you find and extract information from [`Doc`](/api/doc) objects
based on match patterns describing the sequences you're looking for. A matcher
operates on a `Doc` and gives you access to the matched tokens **in context**.
| Name | Description |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
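For example, here's a minimal sketch of the token-based `Matcher`, assuming a blank English pipeline and a made-up pattern name:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match the token "hello" (case-insensitive) followed by a punctuation token
matcher.add("HelloPattern", [[{"LOWER": "hello"}, {"IS_PUNCT": True}]])
doc = nlp("Hello, world!")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```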
### Other classes {#architecture-other}
| Name | Description |
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
| [`Example`](/api/example) | Collection for training annotations. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
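For example, here's a minimal sketch of how the shared `Vocab` and `StringStore` interact, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
# Looking up a lexeme adds the string to the shared vocab and string store
lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.is_alpha)      # coffee True
# The StringStore maps strings to 64-bit hashes and back
coffee_hash = nlp.vocab.strings["coffee"]
print(coffee_hash == lexeme.orth)        # True
print(nlp.vocab.strings[coffee_hash])    # coffee
```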

View File

@ -750,16 +750,13 @@ print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']
The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring:
```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
```
The special case rules have precedence over the punctuation splitting:
```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
@ -813,19 +810,6 @@ domain. There are six things you may need to define:
6. An optional boolean function `url_match`, which is similar to `token_match`
except that prefixes and suffixes are removed before applying the match.
<Infobox title="Important note: token match in spaCy v2.2" variant="warning">
In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`
above and there was no match pattern applied before prefixes and suffixes were
analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its
behavior in v2.2.1 and earlier with precedence over prefixes and suffixes.
The `url_match` is introduced in v2.3.0 to handle cases like URLs where the
tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a
URL) before applying the match.
</Infobox>
You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its
`.search()` and `.finditer()` methods:
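A minimal sketch of that approach might look like this. The regular expressions below are purely illustrative and not spaCy's default rules:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

# Illustrative patterns only, not spaCy's default punctuation rules
prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re = re.compile(r'''[-~]''')

def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, rules=nlp.Defaults.tokenizer_exceptions,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer)

nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)
```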
@ -905,12 +889,13 @@ function that behaves the same way.
<Infobox title="Important note" variant="warning">
If you're using a statistical model, writing to the
[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
work, since the regular expressions are read from the model and will be compiled
when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
directly.
</Infobox>
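For example, here's a minimal sketch of extending the suffix rules of a loaded pipeline's tokenizer. It assumes a v3-compatible `en_core_web_sm` package is installed, and the added pattern is only illustrative:

```python
import spacy
from spacy.util import compile_suffix_regex

nlp = spacy.load("en_core_web_sm")
# Extend the existing suffix rules and recompile the suffix regex
suffixes = list(nlp.Defaults.suffixes) + [r"-+$"]
suffix_regex = compile_suffix_regex(suffixes)
nlp.tokenizer.suffix_search = suffix_regex.search
```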
@ -961,51 +946,50 @@ and language-specific definitions such as
[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py)
for German.
### Hooking a custom tokenizer into the pipeline {#custom-tokenizer}
The tokenizer is the first component of the processing pipeline and the only one
that can't be replaced by writing to `nlp.pipeline`. This is because it has a
different signature from all the other components: it takes a text and returns a
[`Doc`](/api/doc), whereas all other components expect to already receive a
tokenized `Doc`.
![The processing pipeline](../images/pipeline.svg)
To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
custom function that takes a text, and returns a [`Doc`](/api/doc).
> #### Creating a Doc
>
> Constructing a [`Doc`](/api/doc) object manually requires at least two
> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in
> a list of `spaces` values indicating whether the token at this position is
> followed by a space (default `True`). See the section on
> [pre-tokenized text](#own-annotations) for more info.
>
> ```python
> words = ["Let", "'s", "go", "!"]
> spaces = [False, True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```
```python
nlp = spacy.blank("en")
nlp.tokenizer = my_tokenizer
```
| Argument | Type | Description |
| ----------- | ----------------- | ------------------------- |
| `text` | str | The raw text to tokenize. |
| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}

<Infobox title="Important note: using a custom tokenizer" variant="warning">
In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc`
keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This
was unnecessarily complicated. Since spaCy v2.0, you can write to
`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a
function and use `nlp.vocab`.
```diff
- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer)
- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory)
+ nlp.tokenizer = my_tokenizer
+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
```
</Infobox>
Here's an example of the most basic whitespace tokenizer. It takes the shared
vocab, so it can construct `Doc` objects. When it's called on a text, it returns
a `Doc` object consisting of the text split on single space characters. We can
then overwrite the `nlp.tokenizer` attribute with an instance of our custom
tokenizer.
```python
### {executable="true"}
@ -1017,68 +1001,189 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
words = text.split(" ")
return Doc(self.vocab, words=words)
nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])
```
As you can see, we need a `Vocab` instance to construct this — but we won't have
it until we get back the loaded `nlp` object. The simplest solution is to build
the tokenizer in two steps. This also means that you can reuse the "tokenizer
factory" and initialize it with different instances of `Vocab`.
#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2}
You can use the same approach to plug in any other third-party tokenizers. Your
custom callable just needs to return a `Doc` object with the tokens produced by
your tokenizer. In this example, the wrapper uses the **BERT word piece
tokenizer**, provided by the
[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens
available in the `Doc` object returned by spaCy now match the exact word pieces
produced by the tokenizer.
> #### 💡 Tip: spacy-transformers
>
> If you're working with transformer models like BERT, check out the
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
> extension package and [documentation](/usage/transformers). It includes a
> pipeline component for using pretrained transformer weights and **training
> transformer models** in spaCy, as well as helpful utilities for aligning word
> pieces to linguistic tokenization.
```python
### Custom BERT word piece tokenizer
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc
import spacy
class BertTokenizer:
def __init__(self, vocab, vocab_file, lowercase=True):
self.vocab = vocab
self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)
def __call__(self, text):
tokens = self._tokenizer.encode(text)
words = []
spaces = []
for i, (text, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
words.append(text)
if i < len(tokens.tokens) - 1:
# If next start != current end we assume a space in between
next_start, next_end = tokens.offsets[i + 1]
spaces.append(next_start > end)
else:
spaces.append(True)
return Doc(self.vocab, words=words, spaces=spaces)
nlp = spacy.blank("en")
nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
print(doc.text, [token.text for token in doc])
# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
# ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
```
<Infobox title="Important note on tokenization and models" variant="warning">
Keep in mind that your model's result may be less accurate if the tokenization
during training differs from the tokenization at runtime. So if you modify a
pretrained model's tokenization afterwards, it may produce very different
predictions. You should therefore train your model with the **same tokenizer**
it will be using at runtime. See the docs on
[training with custom tokenization](#custom-tokenizer-training) for details.
</Infobox>
#### Training with custom tokenization {#custom-tokenizer-training new="3"}
spaCy's [training config](/usage/training#config) describes the settings,
hyperparameters, pipeline and tokenizer used for constructing and training the
model. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "whitespace_tokenizer"
> ```
```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
def create_tokenizer(nlp):
return WhitespaceTokenizer(nlp.vocab)
return create_tokenizer
```
Registered functions can also take arguments that are then passed in from the
config. This allows you to quickly change and keep track of different settings.
Here, the registered function called `bert_word_piece_tokenizer` takes two
arguments: the path to a vocabulary file and whether to lowercase the text. The
Python type hints `str` and `bool` ensure that the received values have the
correct type.
> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "bert_word_piece_tokenizer"
> vocab_file = "bert-base-uncased-vocab.txt"
> lowercase = true
> ```
```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("bert_word_piece_tokenizer")
def create_bert_tokenizer(vocab_file: str, lowercase: bool):
    def create_tokenizer(nlp):
        return BertTokenizer(nlp.vocab, vocab_file, lowercase)
    return create_tokenizer
```
To avoid hard-coding local paths into your config file, you can also set the
vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
[override](/usage/training#config-overrides) when you run
[`spacy train`](/api/cli#train). For more details on using registered functions,
see the docs in [training with custom code](/usage/training#custom-code).
<Infobox variant="warning">
Remember that a registered function should always be a function that spaCy
**calls to create something**, not the "something" itself. In this case, it
**creates a function** that takes the `nlp` object and returns a callable that
takes a text and returns a `Doc`.
</Infobox>
#### Using pre-tokenized text {#own-annotations}
spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
part-of-speech tags, etc. The most common situation is that you have
**pre-defined tokenization**. If you have a list of strings, you can create a
[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
boolean values, indicating whether each word is followed by a space.
> #### ✏️ Things to try
>
> 1. Change a boolean value in the list of `spaces`. You should see it reflected
> in the `doc.text` and whether the token is followed by a space.
> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
> now followed by a space.
> 3. Copy-paste a random sentence from the internet and manually construct a
> `Doc` with `words` and `spaces` so that the `doc.text` matches the original
> input text.
```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
nlp = spacy.blank("en")
words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```
If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
will assume that all words are followed by a space.
```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English
nlp = English()
bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
spaces=[False, True, False, False])
print(bad_spaces.text) # 'Hello , world !'
print(good_spaces.text) # 'Hello, world!'
```
Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
the part-of-speech tags, syntactic dependencies, named entities and other
attributes. For details, see the respective usage pages.
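For example, here's a minimal sketch of setting a named entity on a manually constructed `Doc`. The words, spaces and entity label are only illustrative:

```python
import spacy
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
words = ["I", "like", "London", "."]
spaces = [True, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Write annotations to the Doc, e.g. a named entity span over "London"
doc.ents = [Span(doc, 2, 3, label="GPE")]
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'GPE')]
```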
#### Aligning tokenization {#aligning-tokenization}
spaCy's tokenization is non-destructive and uses language-specific rules
optimized for compatibility with treebank annotations. Other tools and resources

View File

@ -979,8 +979,8 @@ added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
called on a text, it will find matches in the `doc` and add them as entities to
the `doc.ents`, using the specified pattern label as the entity label. If any
matches were to overlap, the pattern matching most tokens takes priority. If
they also happen to be equally long, then the match occurring first in the `Doc`
is chosen.
```python
### {executable="true"}

View File

@ -6,26 +6,98 @@ menu:
- ['New Features', 'features']
- ['Backwards Incompatibilities', 'incompat']
- ['Migrating from v2.x', 'migrating']
- ['Migrating plugins', 'plugins']
---
## Summary {#summary}
## New Features {#features}
### New training workflow and config system {#features-training}
### Transformer-based pipelines {#features-transformers}
### Custom models using any framework {#feautres-custom-models}
### Manage end-to-end workflows with projects {#features-projects}
### New built-in pipeline components {#features-pipeline-components}
| Name | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
### New and improved pipeline component APIs {#features-components}
- `Language.factory`, `Language.component` (see the sketch below)
- `Language.analyze_pipes`
- Adding components from other models
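For example, a minimal sketch of the new decorator-based component API. The component name `print_doc_length` is made up for illustration:

```python
import spacy
from spacy.language import Language

@Language.component("print_doc_length")
def print_doc_length(doc):
    # Stateless function components take a Doc and return it
    print("Doc length:", len(doc))
    return doc

nlp = spacy.blank("en")
# add_pipe now takes the registered string name of the component
nlp.add_pipe("print_doc_length")
nlp("Hello world")
print(nlp.pipe_names)  # ['print_doc_length']
```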
### Type hints and type-based data validation {#features-types}
spaCy v3.0 officially drops support for Python 2 and now requires **Python
3.6+**. This also means that the code base can take full advantage of
[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
API that's implemented in pure Python (as opposed to Cython) now comes with type
hints. The new version of spaCy's machine learning library
[Thinc](https://thinc.ai) also features extensive
[type support](https://thinc.ai/docs/usage-type-checking/), including custom
types for models and arrays, and a custom `mypy` plugin that can be used to
type-check model definitions.
For data validation, spaCy v3.0 adopts
[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
lets you register **custom functions with typed arguments**, reference them
in your config and see validation errors if the argument values don't match.
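For example, a minimal sketch of a registered function with typed arguments. The registry entry `"my_linear_schedule.v1"` is a made-up name used for illustration:

```python
import spacy

@spacy.registry.schedules("my_linear_schedule.v1")
def my_linear_schedule(start: float, step: float):
    # Yields an infinite sequence of values, e.g. for a learning rate
    value = start
    while True:
        yield value
        value += step
```

In a config, this could then be referenced as `@schedules = "my_linear_schedule.v1"` with `start = 0.1` and `step = 0.05`. Passing a value like `start = "high"` would raise a validation error, because it doesn't match the `float` type hint.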
### CLI
| Name | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
| [`init config`](/api/cli#init-config) | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
| [`project`](/api/cli#project) | Subcommand for cloning and running [spaCy projects](/usage/projects). |
## Backwards Incompatibilities {#incompat}
As always, we've tried to keep the breaking changes to a minimum and focus on
changes that were necessary to support the new features, fix problems or improve
usability. The following section lists the relevant changes to the user-facing
API. For specific examples of how to rewrite your code, check out the
[migration guide](#migrating).
### Compatibility {#incompat-compat}
- spaCy now requires **Python 3.6+**.
### API changes {#incompat-api}
- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
the component factory instead of the component function.
- **Custom pipeline components** now need to be decorated with the
[`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory) decorator.
- [`Language.update`](/api/language#update) now takes a batch of
[`Example`](/api/example) objects instead of raw texts and annotations, or
`Doc` and `GoldParse` objects.
- The `Language.disable_pipes` contextmanager has been replaced by
[`Language.select_pipes`](/api/language#select_pipes), which can explicitly
disable or enable components (see the example below).
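For example, a minimal sketch of the new API, assuming a v3-compatible `en_core_web_sm` pipeline is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# Temporarily disable the tagger and parser; they're restored after the block
with nlp.select_pipes(disable=["tagger", "parser"]):
    doc = nlp("Only the remaining components run on this text.")
print(nlp.pipe_names)
```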
### Removed or renamed API {#incompat-removed}
| Removed | Replacement |
| -------------------------------------------------------- | ----------------------------------------------------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |
### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
The following deprecated methods, attributes and arguments were removed in v3.0.
Most of them have been **deprecated for a while** and many would previously
raise errors. Many of them were also mostly internals. If you've been working
@ -214,17 +286,14 @@ python -m spacy package ./model ./packages
- python setup.py sdist
```
#### Migration notes for plugin maintainers {#migrating-plugins}
Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
spaCy v3. The most common use case for plugins is providing pipeline components
and extension attributes. When migrating your plugin, double-check the
following:
- Use the [`@Language.factory`](/api/language#factory) decorator to register
your component and assign it a name. This allows users to refer to your

View File

@ -11,7 +11,7 @@ import Link from './link'
import GitHubCode from './github'
import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 16
export default props => (
<Pre>