Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-23 23:20:52 +03:00)

Commit 94da9f48de: Merge branch 'develop' into nightly.spacy.io
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a5"
+__version__ = "3.0.0a6"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -35,7 +35,7 @@ def pretrain_cli(
 config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
 code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
-epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
+epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
 use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 # fmt: on
 ):

@@ -1,4 +1,4 @@
-from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable
+from typing import Union, Iterable, Sequence, TypeVar, List, Callable
 from typing import Optional, Any
 from functools import partial
 import itertools

@@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size(
 discard_oversize: bool,
 get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
+"""Create a batcher that uses the `batch_by_padded_size` strategy.
+
+The padded size is defined as the maximum length of sequences within the
+batch multiplied by the number of sequences in the batch.
+
+size (int or Iterable[int]): The largest padded size to batch sequences into.
+Can be a single integer, or a sequence, allowing for variable batch sizes.
+buffer (int): The number of sequences to accumulate before sorting by length.
+A larger buffer will result in more even sizing, but if the buffer is
+very large, the iteration order will be less random, which can result
+in suboptimal training.
+discard_oversize (bool): Whether to discard sequences that are by themselves
+longer than the largest padded batch size.
+get_length (Callable or None): Function to get the length of a sequence item.
+The `len` function is used by default.
+"""
 # Avoid displacing optional values from the underlying function.
 optionals = {"get_length": get_length} if get_length is not None else {}
 return partial(

@@ -38,6 +54,16 @@ def configure_minibatch_by_words(
 discard_oversize: bool,
 get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
+"""Create a batcher that uses the "minibatch by words" strategy.
+
+size (int or Iterable[int]): The target number of words per batch.
+Can be a single integer, or a sequence, allowing for variable batch sizes.
+tolerance (float): What percentage of the size to allow batches to exceed.
+discard_oversize (bool): Whether to discard sequences that by themselves
+exceed the tolerated size.
+get_length (Callable or None): Function to get the length of a sequence
+item. The `len` function is used by default.
+"""
 optionals = {"get_length": get_length} if get_length is not None else {}
 return partial(
 minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals

@@ -48,22 +74,43 @@ def configure_minibatch_by_words(
 def configure_minibatch(
 size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
+"""Create a batcher that creates batches of the specified size.
+
+size (int or Iterable[int]): The target number of items per batch.
+Can be a single integer, or a sequence, allowing for variable batch sizes.
+"""
 optionals = {"get_length": get_length} if get_length is not None else {}
 return partial(minibatch, size=size, **optionals)


 def minibatch_by_padded_size(
-docs: Iterator["Doc"],
+seqs: Iterable[ItemT],
 size: Sizing,
 buffer: int = 256,
 discard_oversize: bool = False,
 get_length: Callable = len,
-) -> Iterator[Iterator["Doc"]]:
+) -> Iterable[List[ItemT]]:
+"""Minibatch a sequence by the size of padded batches that would result,
+with sequences binned by length within a window.
+
+The padded size is defined as the maximum length of sequences within the
+batch multiplied by the number of sequences in the batch.
+
+size (int): The largest padded size to batch sequences into.
+buffer (int): The number of sequences to accumulate before sorting by length.
+A larger buffer will result in more even sizing, but if the buffer is
+very large, the iteration order will be less random, which can result
+in suboptimal training.
+discard_oversize (bool): Whether to discard sequences that are by themselves
+longer than the largest padded batch size.
+get_length (Callable or None): Function to get the length of a sequence item.
+The `len` function is used by default.
+"""
 if isinstance(size, int):
 size_ = itertools.repeat(size)
 else:
 size_ = size
-for outer_batch in minibatch(docs, size=buffer):
+for outer_batch in minibatch(seqs, size=buffer):
 outer_batch = list(outer_batch)
 target_size = next(size_)
 for indices in _batch_by_length(outer_batch, target_size, get_length):

@@ -76,12 +123,24 @@ def minibatch_by_padded_size(


 def minibatch_by_words(
-docs, size, tolerance=0.2, discard_oversize=False, get_length=len
-):
+seqs: Iterable[ItemT],
+size: Sizing,
+tolerance=0.2,
+discard_oversize=False,
+get_length=len,
+) -> Iterable[List[ItemT]]:
 """Create minibatches of roughly a given number of words. If any examples
 are longer than the specified batch length, they will appear in a batch by
 themselves, or be discarded if discard_oversize=True.
-The argument 'docs' can be a list of strings, Docs or Examples.
+
+seqs (Iterable[Sequence]): The sequences to minibatch.
+size (int or Iterable[int]): The target number of words per batch.
+Can be a single integer, or a sequence, allowing for variable batch sizes.
+tolerance (float): What percentage of the size to allow batches to exceed.
+discard_oversize (bool): Whether to discard sequences that by themselves
+exceed the tolerated size.
+get_length (Callable or None): Function to get the length of a sequence
+item. The `len` function is used by default.
 """
 if isinstance(size, int):
 size_ = itertools.repeat(size)

@@ -95,20 +154,20 @@ def minibatch_by_words(
 overflow = []
 batch_size = 0
 overflow_size = 0
-for doc in docs:
-n_words = get_length(doc)
+for seq in seqs:
+n_words = get_length(seq)
 # if the current example exceeds the maximum batch size, it is returned separately
 # but only if discard_oversize=False.
 if n_words > target_size + tol_size:
 if not discard_oversize:
-yield [doc]
+yield [seq]
 # add the example to the current batch if there's no overflow yet and it still fits
 elif overflow_size == 0 and (batch_size + n_words) <= target_size:
-batch.append(doc)
+batch.append(seq)
 batch_size += n_words
 # add the example to the overflow buffer if it fits in the tolerance margin
 elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
-overflow.append(doc)
+overflow.append(seq)
 overflow_size += n_words
 # yield the previous batch and start a new one. The new one gets the overflow examples.
 else:

@@ -122,11 +181,11 @@ def minibatch_by_words(
 overflow_size = 0
 # this example still fits
 if (batch_size + n_words) <= target_size:
-batch.append(doc)
+batch.append(seq)
 batch_size += n_words
 # this example fits in overflow
 elif (batch_size + n_words) <= (target_size + tol_size):
-overflow.append(doc)
+overflow.append(seq)
 overflow_size += n_words
 # this example does not fit with the previous overflow: start another new batch
 else:

@@ -134,7 +193,7 @@ def minibatch_by_words(
 yield batch
 target_size = next(size_)
 tol_size = target_size * tolerance
-batch = [doc]
+batch = [seq]
 batch_size = n_words
 batch.extend(overflow)
 if batch:

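The word-count batching loop in the hunks above is easier to follow in isolation. Below is a deliberately simplified, self-contained sketch of the same idea (a fixed target size, no overflow buffer, no variable size schedule); it is an illustration of the strategy, not the spaCy implementation itself.

```python
from typing import Iterable, Iterator, List, Sequence


def toy_minibatch_by_words(
    seqs: Iterable[Sequence], size: int, tolerance: float = 0.2
) -> Iterator[List[Sequence]]:
    # Fixed target with a tolerance margin; an oversized sequence is emitted
    # on its own (the discard_oversize=False behaviour shown in the diff).
    target = size
    tol = target * tolerance
    batch: List[Sequence] = []
    n_in_batch = 0
    for seq in seqs:
        n_words = len(seq)
        if n_words > target + tol:
            yield [seq]
        elif n_in_batch + n_words <= target + tol:
            batch.append(seq)
            n_in_batch += n_words
        else:
            yield batch
            batch = [seq]
            n_in_batch = n_words
    if batch:
        yield batch


# Sequences of 5, 12, 3, 40, 7 and 9 "words", with size=20 and 20% tolerance:
for b in toy_minibatch_by_words([["w"] * n for n in (5, 12, 3, 40, 7, 9)], size=20):
    print([len(s) for s in b])
# -> [40], then [5, 12, 3], then [7, 9] (the oversized sequence is emitted as soon as it is seen)
```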
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from ...pipeline import Lemmatizer
 from ...tokens import Token

@@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
 from .tokens import Doc
-from .lookups import load_lookups
 from .tokenizer import Tokenizer
 from .errors import Errors, Warnings
 from .schemas import ConfigSchema

@@ -1439,10 +1438,7 @@ class Language:
 or lang_cls is not cls
 ):
 raise ValueError(Errors.E943.format(value=type(lang_cls)))
-nlp = lang_cls(
-vocab=vocab,
-create_tokenizer=create_tokenizer,
-)
+nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
 if after_creation is not None:
 nlp = after_creation(nlp)
 if not isinstance(nlp, cls):

@@ -68,11 +68,11 @@ cdef class DependencyMatcher:
 key (str): The match ID.
 RETURNS (bool): Whether the matcher contains rules for this match ID.
 """
-return self._normalize_key(key) in self._patterns
+return self.has_key(key)

-def validateInput(self, pattern, key):
+def validate_input(self, pattern, key):
 idx = 0
-visitedNodes = {}
+visited_nodes = {}
 for relation in pattern:
 if "PATTERN" not in relation or "SPEC" not in relation:
 raise ValueError(Errors.E098.format(key=key))

@@ -83,7 +83,7 @@ cdef class DependencyMatcher:
 and "NBOR_NAME" not in relation["SPEC"]
 ):
 raise ValueError(Errors.E099.format(key=key))
-visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
+visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
 else:
 if not(
 "NODE_NAME" in relation["SPEC"]

@@ -92,22 +92,28 @@ cdef class DependencyMatcher:
 ):
 raise ValueError(Errors.E100.format(key=key))
 if (
-relation["SPEC"]["NODE_NAME"] in visitedNodes
-or relation["SPEC"]["NBOR_NAME"] not in visitedNodes
+relation["SPEC"]["NODE_NAME"] in visited_nodes
+or relation["SPEC"]["NBOR_NAME"] not in visited_nodes
 ):
 raise ValueError(Errors.E101.format(key=key))
-visitedNodes[relation["SPEC"]["NODE_NAME"]] = True
-visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
+visited_nodes[relation["SPEC"]["NODE_NAME"]] = True
+visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True
 idx = idx + 1

 def add(self, key, patterns, *_patterns, on_match=None):
+"""Add a new matcher rule to the matcher.
+
+key (str): The match ID.
+patterns (list): The patterns to add for the given key.
+on_match (callable): Optional callback executed on match.
+"""
 if patterns is None or hasattr(patterns, "__call__"):  # old API
 on_match = patterns
 patterns = _patterns
 for pattern in patterns:
 if len(pattern) == 0:
 raise ValueError(Errors.E012.format(key=key))
-self.validateInput(pattern,key)
+self.validate_input(pattern,key)
 key = self._normalize_key(key)
 _patterns = []
 for pattern in patterns:

@@ -187,8 +193,7 @@ cdef class DependencyMatcher:
 key (string or int): The key to check.
 RETURNS (bool): Whether the matcher has the rule.
 """
-key = self._normalize_key(key)
-return key in self._patterns
+return self._normalize_key(key) in self._patterns

 def get(self, key, default=None):
 """Retrieve the pattern stored for a key.

@@ -202,6 +207,13 @@ cdef class DependencyMatcher:
 return (self._callbacks[key], self._patterns[key])

 def __call__(self, Doc doc):
+"""Find all token sequences matching the supplied pattern.
+
+doclike (Doc or Span): The document to match over.
+RETURNS (list): A list of `(key, start, end)` tuples,
+describing the matches. A match tuple describes a span
+`doc[start:end]`. The `label_id` and `key` are both integers.
+"""
 matched_key_trees = []
 matches = self.token_matcher(doc)
 for key in list(self._patterns.keys()):

@@ -241,25 +253,25 @@ cdef class DependencyMatcher:
 on_match(self, doc, i, matched_key_trees)
 return matched_key_trees

-def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
+def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees):
 cdef bool isValid;
 if(patternLength == len(id_to_position.keys())):
 isValid = True
 for node in range(patternLength):
 if(node in tree):
 for idx, (relop,nbor) in enumerate(tree[node]):
-computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop])
+computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop])
 isNbor = False
 for computed_nbor in computed_nbors:
-if(computed_nbor.i == visitedNodes[nbor]):
+if(computed_nbor.i == visited_nodes[nbor]):
 isNbor = True
 isValid = isValid & isNbor
 if(isValid):
-matched_trees.append(visitedNodes)
+matched_trees.append(visited_nodes)
 return
 allPatternNodes = numpy.asarray(id_to_position[patternLength])
 for patternNode in allPatternNodes:
-self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees)
+self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees)

 # Given a node and an edge operator, to return the list of nodes
 # from the doc that belong to node+operator. This is used to store

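The SPEC/PATTERN format that `validate_input()` checks is documented in the dependencymatcher.md hunk near the end of this diff. A short usage sketch of that format, hedged: it assumes a pretrained English pipeline such as en_core_web_sm (or any component that sets `Token.dep`) is installed, and the return value is only described loosely in a comment.

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # any pipeline that sets Token.dep
matcher = DependencyMatcher(nlp.vocab)
pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    {
        "SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
        "PATTERN": {"DEP": "nsubj"},
    },
]
matcher.add("FOUNDED", [pattern])  # validate_input() runs on each pattern
doc = nlp("Bill Gates founded Microsoft.")
matches = matcher(doc)  # one entry per matched key, with the matched token trees
```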
@@ -70,7 +70,7 @@ cdef class Matcher:
 key (str): The match ID.
 RETURNS (bool): Whether the matcher contains rules for this match ID.
 """
-return self._normalize_key(key) in self._patterns
+return self.has_key(key)

 def add(self, key, patterns, *, on_match=None, greedy: str=None):
 """Add a match-rule to the matcher. A match-rule consists of: an ID

@@ -162,8 +162,7 @@ cdef class Matcher:
 key (string or int): The key to check.
 RETURNS (bool): Whether the matcher has the rule.
 """
-key = self._normalize_key(key)
-return key in self._patterns
+return self._normalize_key(key) in self._patterns

 def get(self, key, default=None):
 """Retrieve the pattern stored for a key.

@@ -179,7 +178,7 @@ cdef class Matcher:
 def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False):
 """Match a stream of documents, yielding them in turn.

-docs (iterable): A stream of documents.
+docs (Iterable[Union[Doc, Span]]): A stream of documents or spans.
 batch_size (int): Number of documents to accumulate into a working set.
 return_matches (bool): Yield the match lists along with the docs, making
 results (doc, matches) tuples.

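A tiny illustration of the `__contains__`/`has_key` equivalence introduced above, using only the API visible in this diff (the new-style `add(key, list_of_patterns)` signature and `has_key`):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
matcher.add("HELLO", [[{"LOWER": "hello"}]])  # add(key, list of patterns)
assert "HELLO" in matcher        # __contains__ now delegates to has_key()
assert matcher.has_key("HELLO")  # accepts the string key (or its hash)
```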
@@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
 default_config={
 "moves": None,
 "update_with_oracle_cut_size": 100,
-"multitasks": [],
 "learn_tokens": False,
 "min_action_freq": 30,
 "model": DEFAULT_PARSER_MODEL,

@@ -51,17 +50,52 @@ def make_parser(
 model: Model,
 moves: Optional[list],
 update_with_oracle_cut_size: int,
-multitasks: Iterable,
 learn_tokens: bool,
 min_action_freq: int
 ):
+"""Create a transition-based DependencyParser component. The dependency parser
+jointly learns sentence segmentation and labelled dependency parsing, and can
+optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+The parser uses a variant of the non-monotonic arc-eager transition-system
+described by Honnibal and Johnson (2014), with the addition of a "break"
+transition to perform the sentence segmentation. Nivre's pseudo-projective
+dependency transformation is used to allow the parser to predict
+non-projective parses.
+
+The parser is trained using an imitation learning objective. The parser follows
+the actions predicted by the current weights, and at each state, determines
+which actions are compatible with the optimal parse that could be reached
+from the current state. The weights such that the scores assigned to the
+set of optimal actions is increased, while scores assigned to other
+actions are decreased. Note that more than one action may be optimal for
+a given state.
+
+model (Model): The model for the transition-based parser. The model needs
+to have a specific substructure of named components --- see the
+spacy.ml.tb_framework.TransitionModel for details.
+moves (List[str]): A list of transition names. Inferred from the data if not
+provided.
+update_with_oracle_cut_size (int):
+During training, cut long sequences into shorter segments by creating
+intermediate states based on the gold-standard history. The model is
+not very sensitive to this parameter, so you usually won't need to change
+it. 100 is a good default.
+learn_tokens (bool): Whether to learn to merge subtokens that are split
+relative to the gold standard. Experimental.
+min_action_freq (int): The minimum frequency of labelled actions to retain.
+Rarer labelled actions have their label backed-off to "dep". While this
+primarily affects the label accuracy, it can also affect the attachment
+structure, as the labels are used to represent the pseudo-projectivity
+transformation.
+"""
 return DependencyParser(
 nlp.vocab,
 model,
 name,
 moves=moves,
 update_with_oracle_cut_size=update_with_oracle_cut_size,
-multitasks=multitasks,
+multitasks=[],
 learn_tokens=learn_tokens,
 min_action_freq=min_action_freq
 )

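For context, the remaining factory settings (`learn_tokens`, `min_action_freq`, `update_with_oracle_cut_size`) can be overridden when the component is created, mirroring the `create_pipe(..., config=...)` calls in the test changes further down in this diff. A hedged sketch: the factory name "parser" and the behaviour of passing these keys through `config` are assumptions based on the registration above, and the values are illustrative only.

```python
from spacy.lang.en import English

nlp = English()
# Override two of the documented settings; the rest fall back to the defaults
# shown in default_config above.
parser = nlp.create_pipe(
    "parser",
    config={"learn_tokens": False, "min_action_freq": 30},
)
```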
@@ -62,6 +62,16 @@ def make_entity_linker(
 incl_prior: bool,
 incl_context: bool,
 ):
+"""Construct an EntityLinker component.
+
+model (Model[List[Doc], Floats2d]): A model that learns document vector
+representations. Given a batch of Doc objects, it should return a single
+array, with one row per item in the batch.
+kb (KnowledgeBase): The knowledge-base to link entities to.
+labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
+incl_context (bool): Whether or not to include the local context in the model.
+"""
 return EntityLinker(
 nlp.vocab,
 model,

@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
 model (thinc.api.Model): The Thinc Model powering the pipeline component.
 name (str): The component instance name, used to add entries to the
 losses during training.
-labels_morph (dict): TODO:
-labels_pos (dict): TODO:
+labels_morph (dict): Mapping of morph + POS tags to morph labels.
+labels_pos (dict): Mapping of morph + POS tags to POS tags.

 DOCS: https://spacy.io/api/morphologizer#init
 """

@@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
 default_config={
 "moves": None,
 "update_with_oracle_cut_size": 100,
-"multitasks": [],
-"learn_tokens": False,
-"min_action_freq": 30,
 "model": DEFAULT_NER_MODEL,
 },
 scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],

@@ -50,19 +47,40 @@ def make_ner(
 model: Model,
 moves: Optional[list],
 update_with_oracle_cut_size: int,
-multitasks: Iterable,
-learn_tokens: bool,
-min_action_freq: int
 ):
+"""Create a transition-based EntityRecognizer component. The entity recognizer
+identifies non-overlapping labelled spans of tokens.
+
+The transition-based algorithm used encodes certain assumptions that are
+effective for "traditional" named entity recognition tasks, but may not be
+a good fit for every span identification problem. Specifically, the loss
+function optimizes for whole entity accuracy, so if your inter-annotator
+agreement on boundary tokens is low, the component will likely perform poorly
+on your problem. The transition-based algorithm also assumes that the most
+decisive information about your entities will be close to their initial tokens.
+If your entities are long and characterised by tokens in their middle, the
+component will likely do poorly on your task.
+
+model (Model): The model for the transition-based parser. The model needs
+to have a specific substructure of named components --- see the
+spacy.ml.tb_framework.TransitionModel for details.
+moves (list[str]): A list of transition names. Inferred from the data if not
+provided.
+update_with_oracle_cut_size (int):
+During training, cut long sequences into shorter segments by creating
+intermediate states based on the gold-standard history. The model is
+not very sensitive to this parameter, so you usually won't need to change
+it. 100 is a good default.
+"""
 return EntityRecognizer(
 nlp.vocab,
 model,
 name,
 moves=moves,
 update_with_oracle_cut_size=update_with_oracle_cut_size,
-multitasks=multitasks,
-learn_tokens=learn_tokens,
-min_action_freq=min_action_freq
+multitasks=[],
+min_action_freq=1,
+learn_tokens=False,
 )


@@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser):
 TransitionSystem = BiluoPushDown

 def add_multitask_objective(self, mt_component):
+"""Register another component as a multi-task objective. Experimental."""
 self._multitasks.append(mt_component)

 def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
+"""Setup multi-task objective components. Experimental and internal."""
 # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
 for labeller in self._multitasks:
 labeller.model.set_dim("nO", len(self.labels))

@@ -1,8 +1,9 @@
 # cython: infer_types=True, profile=True, binding=True
+from typing import List
 import numpy
 import srsly

 from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
+from thinc.types import Floats2d
 import warnings

 from ..tokens.doc cimport Doc

@@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
 scores=["tag_acc"],
 default_score_weights={"tag_acc": 1.0},
 )
-def make_tagger(nlp: Language, name: str, model: Model):
+def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]):
+"""Construct a part-of-speech tagger component.
+
+model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
+the tag probabilities. The output vectors should match the number of tags
+in size, and be normalized as probabilities (all scores between 0 and 1,
+with the rows summing to 1).
+"""
 return Tagger(nlp.vocab, model, name)

@@ -1,5 +1,6 @@
 from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
 from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
+from thinc.types import Floats2d
 import numpy

 from .pipe import Pipe

@@ -69,8 +70,22 @@ subword_features = true
 default_score_weights={"cats_score": 1.0},
 )
 def make_textcat(
-nlp: Language, name: str, model: Model, labels: Iterable[str]
+nlp: Language,
+name: str,
+model: Model[List[Doc], List[Floats2d]],
+labels: Iterable[str],
 ) -> "TextCategorizer":
+"""Create a TextCategorizer compoment. The text categorizer predicts categories
+over a whole document. It can learn one or more labels, and the labels can
+be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
+(i.e. zero or more labels may be true per doc). The multi-label setting is
+controlled by the model instance that's provided.
+
+model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
+scores for each category.
+labels (list): A list of categories to learn. If empty, the model infers the
+categories from the data.
+"""
 return TextCategorizer(nlp.vocab, model, name, labels=labels)

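As with the other factories in this diff, the new keyword arguments surface as component config settings. A hedged sketch of creating the component with explicit labels; the factory name "textcat" and the pass-through of `labels` via `config` are assumptions based on the signature above, and the label names are purely illustrative.

```python
from spacy.lang.en import English

nlp = English()
# If labels is left empty, the component infers the categories from the data.
textcat = nlp.create_pipe("textcat", config={"labels": ["POSITIVE", "NEGATIVE"]})
```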
@@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":


 class Tok2Vec(Pipe):
+"""Apply a "token-to-vector" model and set its outputs in the doc.tensor
+attribute. This is mostly useful to share a single subnetwork between multiple
+components, e.g. to have one embedding and CNN network shared between a
+parser, tagger and NER.
+
+In order to use the `Tok2Vec` predictions, subsequent components should use
+the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
+layer will read data from the `doc.tensor` attribute during prediction.
+During training, the `Tok2Vec` component will save its prediction and backprop
+callback for each batch, so that the subsequent components can backpropagate
+to the shared weights. This implementation is used because it allows us to
+avoid relying on object identity within the models to achieve the parameter
+sharing.
+"""
+
 def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
 """Initialize a tok2vec component.

 vocab (Vocab): The shared vocabulary.
-model (thinc.api.Model): The Thinc Model powering the pipeline component.
+model (thinc.api.Model[List[Doc], List[Floats2d]]):
+The Thinc Model powering the pipeline component. It should take
+a list of Doc objects as input, and output a list of 2d float arrays.
 name (str): The component instance name.

 DOCS: https://spacy.io/api/tok2vec#init

@@ -48,9 +65,18 @@ class Tok2Vec(Pipe):
 self.cfg = {}

 def add_listener(self, listener: "Tok2VecListener") -> None:
+"""Add a listener for a downstream component. Usually internals."""
 self.listeners.append(listener)

 def find_listeners(self, model: Model) -> None:
+"""Walk over a model, looking for layers that are Tok2vecListener
+subclasses that have an upstream_name that matches this component.
+Listeners can also set their upstream_name attribute to the wildcard
+string '*' to match any `Tok2Vec`.
+
+You're unlikely to ever need multiple `Tok2Vec` components, so it's
+fine to leave your listeners upstream_name on '*'.
+"""
 for node in model.walk():
 if isinstance(node, Tok2VecListener) and node.upstream_name in (
 "*",

@@ -59,7 +85,8 @@ class Tok2Vec(Pipe):
 self.add_listener(node)

 def __call__(self, doc: Doc) -> Doc:
-"""Add context-sensitive embeddings to the Doc.tensor attribute.
+"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
+them to be used as features by downstream components.

 docs (Doc): The Doc to preocess.
 RETURNS (Doc): The processed Doc.

@@ -205,11 +232,27 @@ class Tok2Vec(Pipe):
 class Tok2VecListener(Model):
 """A layer that gets fed its answers from an upstream connection,
 for instance from a component earlier in the pipeline.
+
+The Tok2VecListener layer is used as a sublayer within a component such
+as a parser, NER or text categorizer. Usually you'll have multiple listeners
+connecting to a single upstream Tok2Vec component, that's earlier in the
+pipeline. The Tok2VecListener layers act as proxies, passing the predictions
+from the Tok2Vec component into downstream components, and communicating
+gradients back upstream.
 """

 name = "tok2vec-listener"

 def __init__(self, upstream_name: str, width: int) -> None:
+"""
+upstream_name (str): A string to identify the 'upstream' Tok2Vec component
+to communicate with. The upstream name should either be the wildcard
+string '*', or the name of the `Tok2Vec` component. You'll almost
+never have multiple upstream Tok2Vec components, so the wildcard
+string will almost always be fine.
+width (int):
+The width of the vectors produced by the upstream tok2vec component.
+"""
 Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
 self.upstream_name = upstream_name
 self._batch_id = None

@@ -217,15 +260,25 @@ class Tok2VecListener(Model):
 self._backprop = None

 @classmethod
-def get_batch_id(cls, inputs) -> int:
+def get_batch_id(cls, inputs: List[Doc]) -> int:
+"""Calculate a content-sensitive hash of the batch of documents, to check
+whether the next batch of documents is unexpected.
+"""
 return sum(sum(token.orth for token in doc) for doc in inputs)

 def receive(self, batch_id: int, outputs, backprop) -> None:
+"""Store a batch of training predictions and a backprop callback. The
+predictions and callback are produced by the upstream Tok2Vec component,
+and later will be used when the listener's component's model is called.
+"""
 self._batch_id = batch_id
 self._outputs = outputs
 self._backprop = backprop

 def verify_inputs(self, inputs) -> bool:
+"""Check that the batch of Doc objects matches the ones we have a
+prediction for.
+"""
 if self._batch_id is None and self._outputs is None:
 raise ValueError(Errors.E954)
 else:

@@ -237,6 +290,7 @@ class Tok2VecListener(Model):


 def forward(model: Tok2VecListener, inputs, is_train: bool):
+"""Supply the outputs from the upstream Tok2Vec component."""
 if is_train:
 model.verify_inputs(inputs)
 return model._outputs, model._backprop

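The listener mechanics described in the docstrings above boil down to a small handshake: the upstream component computes once per batch, stores its outputs plus a backprop callback, and each listener replays them when its own component's model is called. A stripped-down, framework-free sketch of that idea follows; the class and attribute names are invented for illustration and these are not the spaCy classes themselves.

```python
class ToyUpstream:
    def __init__(self):
        self.listeners = []

    def predict(self, batch):
        outputs = [f"vec({x})" for x in batch]   # stand-in for real tensors
        backprop = lambda d_outputs: None        # stand-in for the real callback
        for listener in self.listeners:
            listener.receive(id(batch), outputs, backprop)
        return outputs


class ToyListener:
    def receive(self, batch_id, outputs, backprop):
        self._batch_id, self._outputs, self._backprop = batch_id, outputs, backprop

    def __call__(self, batch):
        # cf. verify_inputs(): refuse to serve outputs for an unexpected batch
        assert self._batch_id == id(batch), "unexpected batch"
        return self._outputs, self._backprop


upstream, listener = ToyUpstream(), ToyListener()
upstream.listeners.append(listener)
batch = ["I", "like", "spaCy"]
upstream.predict(batch)                  # upstream computes and broadcasts
outputs, backprop = listener(batch)      # downstream reads the stored result
```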
@@ -426,7 +426,7 @@ class Scorer:
 f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
 }
 if len(labels) == 2 and not multi_label and positive_label:
-positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f']
+positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
 results[f"{attr}_score"] = positive_label_f
 results[f"{attr}_score_desc"] = f"F ({positive_label})"
 elif not multi_label:

@@ -15,5 +15,7 @@ def morphology():
 def test_morphology_pickle_roundtrip(morphology):
 b = pickle.dumps(morphology)
 reloaded_morphology = pickle.loads(b)
-assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2"
-assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4"
+feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"])
+assert feat == "Feat1=Val1|Feat2=Val2"
+feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"])
+assert feat == "Feat3=Val3|Feat4=Val4"

@@ -144,10 +144,7 @@ def test_accept_blocked_token():
 # 1. test normal behaviour
 nlp1 = English()
 doc1 = nlp1("I live in New York")
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner1 = nlp1.create_pipe("ner", config=config)
 assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""]
 assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""]

@@ -166,10 +163,7 @@ def test_accept_blocked_token():
 # 2. test blocking behaviour
 nlp2 = English()
 doc2 = nlp2("I live in New York")
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner2 = nlp2.create_pipe("ner", config=config)

 # set "New York" to a blocked entity

@@ -224,10 +218,7 @@ def test_overwrite_token():
 assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"]
 assert [token.ent_type_ for token in doc] == ["", "", "", "", ""]
 # Check that a new ner can overwrite O
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner2 = nlp.create_pipe("ner", config=config)
 ner2.moves.add_action(5, "")
 ner2.add_label("GPE")

@@ -1,8 +1,7 @@
 import pytest

 from spacy import util, registry
 from spacy.lang.en import English
-from spacy.lookups import Lookups, load_lookups
+from spacy.lookups import Lookups

 from ..util import make_tempdir

@@ -1,10 +1,8 @@
 import pytest

 from spacy import util
 from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.language import Language
 from spacy.symbols import POS, NOUN

 from ..util import make_tempdir

@@ -117,9 +117,7 @@ def test_overfitting_IO():
 assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)

 # Test scoring
-scores = nlp.evaluate(
-train_examples, scorer_cfg={"positive_label": "POSITIVE"}
-)
+scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
 assert scores["cats_micro_f"] == 1.0
 assert scores["cats_score"] == 1.0
 assert "cats_score_desc" in scores

@@ -1,11 +1,9 @@
 import pytest
 import random

 from spacy import util
 from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
 from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
 from spacy.lang.en import English
-from spacy.lookups import Lookups

@@ -6,8 +6,7 @@ from spacy.lang.en import English
 from spacy.lang.lex_attrs import LEX_ATTRS
 from spacy.matcher import Matcher
 from spacy.tokenizer import Tokenizer
-from spacy.lookups import Lookups
-from spacy.symbols import ORTH, LEMMA, POS, VERB
+from spacy.symbols import ORTH, LEMMA, POS


 def test_issue1061():

@@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
 nlp = Language()
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner = nlp.create_pipe("ner", config=config)
 example = Example.from_dict(
 Doc(ner.vocab, words=["word"]),

@@ -157,7 +157,11 @@ def test_issue3540(en_vocab):

 with doc.retokenize() as retokenizer:
 heads = [(doc[3], 1), doc[2]]
-attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
+attrs = {
+"POS": ["PROPN", "PROPN"],
+"LEMMA": ["New", "York"],
+"DEP": ["pobj", "compound"],
+}
 retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

 gold_text = ["I", "live", "in", "New", "York", "right", "now"]

@@ -138,10 +138,7 @@ def test_issue4042_bug2():
 if not output_dir.exists():
 output_dir.mkdir()
 ner1.to_disk(output_dir)
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner2 = nlp1.create_pipe("ner", config=config)
 ner2.from_disk(output_dir)
 assert len(ner2.labels) == 2

@@ -303,10 +300,7 @@ def test_issue4313():
 beam_width = 16
 beam_density = 0.0001
 nlp = English()
-config = {
-"learn_tokens": False,
-"min_action_freq": 30,
-}
+config = {}
 ner = nlp.create_pipe("ner", config=config)
 ner.add_label("SOME_LABEL")
 ner.begin_training([])

@@ -185,20 +185,16 @@ def test_issue4725_1():
 vocab = Vocab(vectors_name="test_vocab_add_vector")
 nlp = English(vocab=vocab)
 config = {
-"learn_tokens": False,
-"min_action_freq": 342,
 "update_with_oracle_cut_size": 111,
 }
 ner = nlp.create_pipe("ner", config=config)
 with make_tempdir() as tmp_path:
 with (tmp_path / "ner.pkl").open("wb") as file_:
 pickle.dump(ner, file_)
-assert ner.cfg["min_action_freq"] == 342
 assert ner.cfg["update_with_oracle_cut_size"] == 111

 with (tmp_path / "ner.pkl").open("rb") as file_:
 ner2 = pickle.load(file_)
-assert ner2.cfg["min_action_freq"] == 342
 assert ner2.cfg["update_with_oracle_cut_size"] == 111

@@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid():
 config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
 with pytest.raises(ValueError):
 English.from_config(config)
+
+
+def test_language_custom_tokenizer():
+"""Test that a fully custom tokenizer can be plugged in via the registry."""
+name = "test_language_custom_tokenizer"
+
+class CustomTokenizer:
+"""Dummy "tokenizer" that splits on spaces and adds prefix to each word."""
+
+def __init__(self, nlp, prefix):
+self.vocab = nlp.vocab
+self.prefix = prefix
+
+def __call__(self, text):
+words = [f"{self.prefix}{word}" for word in text.split(" ")]
+return Doc(self.vocab, words=words)
+
+@registry.tokenizers(name)
+def custom_create_tokenizer(prefix: str = "_"):
+def create_tokenizer(nlp):
+return CustomTokenizer(nlp, prefix=prefix)
+
+return create_tokenizer
+
+config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
+nlp = English.from_config(config)
+doc = nlp("hello world")
+assert [t.text for t in doc] == ["_hello", "_world"]
+doc = list(nlp.pipe(["hello world"]))[0]
+assert [t.text for t in doc] == ["_hello", "_world"]

@@ -3,7 +3,7 @@ title: Model Architectures
 teaser: Pre-defined model architectures included with the core library
 source: spacy/ml/models
 menu:
-- ['Tok2Vec', 'tok2vec']
+- ['Tok2Vec', 'tok2vec-arch']
 - ['Transformers', 'transformers']
 - ['Parser & NER', 'parser']
 - ['Tagging', 'tagger']

@@ -70,6 +70,47 @@ blog post for background.
 | `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. |
 | `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. |

+### spacy.Tok2VecListener.v1 {#Tok2VecListener}
+
+> #### Example config
+>
+> ```ini
+> [components.tok2vec]
+> factory = "tok2vec"
+>
+> [components.tok2vec.model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> width = 342
+>
+> [components.tagger]
+> factory = "tagger"
+>
+> [components.tagger.model]
+> @architectures = "spacy.Tagger.v1"
+>
+> [components.tagger.model.tok2vec]
+> @architectures = "spacy.Tok2VecListener.v1"
+> width = ${components.tok2vec.model:width}
+> ```
+
+A listener is used as a sublayer within a component such as a
+[`DependencyParser`](/api/dependencyparser),
+[`EntityRecognizer`](/api/entityrecognizer)or
+[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple
+listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component
+that's earlier in the pipeline. The listener layers act as **proxies**, passing
+the predictions from the `Tok2Vec` component into downstream components, and
+communicating gradients back upstream.
+
+Instead of defining its own `Tok2Vec` instance, a model architecture like
+[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
+argument that connects to the shared `tok2vec` component in the pipeline.
+
+| Name | Type | Description |
+| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
+| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
+
 ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}

 <!-- TODO: check example config -->

@@ -195,7 +236,7 @@ and residual connections.
 > depth = 4
 > ```

-Encode context using bidirectonal LSTM layers. Requires
+Encode context using bidirectional LSTM layers. Requires
 [PyTorch](https://pytorch.org).

 | Name | Type | Description |

@@ -237,8 +278,6 @@ architectures into your training config.

 ### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener}

-<!-- TODO: description -->
-
 > #### Example Config
 >
 > ```ini

@@ -250,10 +289,41 @@ architectures into your training config.
 > @layers = "reduce_mean.v1"
 > ```

+Create a `TransformerListener` layer, which will connect to a
+[`Transformer`](/api/transformer) component earlier in the pipeline. The layer
+takes a list of [`Doc`](/api/doc) objects as input, and produces a list of
+2-dimensional arrays as output, with each array having one row per token. Most
+spaCy models expect a sublayer with this signature, making it easy to connect
+them to a transformer model via this sublayer. Transformer models usually
+operate over wordpieces, which usually don't align one-to-one against spaCy
+tokens. The layer therefore requires a reduction operation in order to calculate
+a single token vector given zero or more wordpiece vectors.
+
 | Name | Type | Description |
-| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- |
-| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. |
-| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. |
+| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |

+### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
+
+> #### Example Config
+>
+> ```ini
+> # TODO:
+> ```
+
+Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
+**not** allow multiple components to share the transformer weights, and does
+**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
+object, but it's a **simpler solution** if you only need the transformer within
+one component.
+
+| Name | Type | Description |
+| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
+| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
+
 ## Parser & NER architectures {#parser}

@@ -418,7 +488,7 @@ network has an internal CNN Tok2Vec layer and uses attention.
 > ```

 | Name | Type | Description |
-| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
 | `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
 | `width` | int | Output dimension of the feature encoding step. |

@@ -427,10 +497,8 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
 | `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
 | `dropout` | float | The dropout rate. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when |
+| `begin_training` is called. |

 ### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -458,13 +526,11 @@ vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.

 | Name | Type | Description |
-| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
+| --------------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
 | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when |
+| `begin_training` is called. |

 ### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -483,16 +549,16 @@ An ngram "bag-of-words" model. This architecture should run much faster than the
 others, but may not be as accurate, especially if texts are short.

 | Name | Type | Description |
-| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
 | `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
 | `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when |
+| `begin_training` is called. |

+<!-- TODO:
 ### spacy.TextCatLowData.v1 {#TextCatLowData}
+-->

 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

@@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a

 A function that takes as input a [`KnowledgeBase`](/api/kb) and a
 [`Span`](/api/span) object denoting a named entity, and returns a list of
-plausible [`Candidate` objects](/api/kb/#candidate_init).
-
-The default `CandidateGenerator` simply uses the text of a mention to find its
-potential aliases in the Knowledgebase. Note that this function is
-case-dependent.
+plausible [`Candidate` objects](/api/kb/#candidate_init). The default
+`CandidateGenerator` simply uses the text of a mention to find its potential
+aliases in the `KnowledgeBase`. Note that this function is case-dependent.

@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]

 ## Pretrain {#pretrain new="2.1" tag="experimental"}

-<!-- TODO: document new pretrain command and link to new pretraining docs -->
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
 components on [raw text](/api/data-formats#pretrain), using an approximate
 language-modeling objective. Specifically, we load pretrained vectors, and train
 a component like a CNN, BiLSTM, etc to predict vectors which match the

@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.

 <Infobox title="Changed in v3.0" variant="warning">

@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
 | `output_dir` | positional | Directory to write models to on each epoch. |
 | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r` | option | TODO: |
-| `--epoch-resume`, `-er` | option | TODO: |
+| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. |
 | **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |

@@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to
 [`spacy train`](/api/cli#train). They use
 [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the
 hood. For details on how to use training configs, see the
-[usage documentation](/usage/training#config).
-
-<!-- TODO: add details on getting started and init config -->
+[usage documentation](/usage/training#config). To get started with a blank
+config or fill a partial config with all defaults, you can use the
+[`init config`](/api/cli#init-config) command.

 > #### What does the @ mean?
 >

@@ -52,8 +52,6 @@ your config and check that it's valid, you can run the

 </Infobox>

-<!-- TODO: once we know how we want to implement "starter config" workflow or outputting a full default config for the user, update this section with the command -->
-
 ### nlp {#config-nlp tag="section"}

 > #### Example

@@ -154,8 +152,6 @@ This section is optional and defines settings and controls for
 [language model pretraining](/usage/training#pretraining). It's used when you
 run [`spacy pretrain`](/api/cli#pretrain).

-<!-- TODO: complete -->
-
 | Name | Type | Description | Default |
 | ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
 | `max_epochs` | int | Maximum number of epochs. | `1000` |

@ -5,4 +5,194 @@ tag: class
|
|||
source: spacy/matcher/dependencymatcher.pyx
|
||||
---
|
||||
|
||||
TODO: write
|
||||
The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher)
|
||||
and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees
|
||||
using the
|
||||
[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html).
|
||||
It requires a pretrained [`DependencyParser`](/api/parser) or other component
|
||||
that sets the `Token.dep` attribute.
|
||||
|
||||
## Pattern format {#patterns}
|
||||
|
||||
> ```json
|
||||
> ### Example
|
||||
> [
|
||||
> {
|
||||
> "SPEC": {"NODE_NAME": "founded"},
|
||||
> "PATTERN": {"ORTH": "founded"}
|
||||
> },
|
||||
> {
|
||||
> "SPEC": {
|
||||
> "NODE_NAME": "founder",
|
||||
> "NBOR_RELOP": ">",
|
||||
> "NBOR_NAME": "founded"
|
||||
> },
|
||||
> "PATTERN": {"DEP": "nsubj"}
|
||||
> },
|
||||
> {
|
||||
> "SPEC": {
|
||||
> "NODE_NAME": "object",
|
||||
> "NBOR_RELOP": ">",
|
||||
> "NBOR_NAME": "founded"
|
||||
> },
|
||||
> "PATTERN": {"DEP": "dobj"}
|
||||
> }
|
||||
> ]
|
||||
> ```
|
||||
|
||||
A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
|
||||
with each dictionary describing a node to match. Each pattern should have the
|
||||
following top-level keys:
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
|
||||
| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
|
||||
|
||||
The `SPEC` includes the following fields:
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
|
||||
| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
|
||||
| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a rule-based `DependencyMatcher`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.matcher import DependencyMatcher
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
||||
|
||||
## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.matcher import DependencyMatcher
|
||||
>
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> pattern = [
|
||||
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
|
||||
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
|
||||
> ]
|
||||
> matcher.add("Founder", [pattern])
|
||||
> doc = nlp("Bill Gates founded Microsoft.")
|
||||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
|
||||
|
||||
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
Get the number of rules (edges) added to the dependency matcher. Note that this
|
||||
only returns the number of rules (identical to the number of IDs), not the
|
||||
number of individual patterns.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> assert len(matcher) == 0
|
||||
> pattern = [
|
||||
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
|
||||
> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
|
||||
> ]
|
||||
> matcher.add("Rule", [pattern])
|
||||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------- |
|
||||
| **RETURNS** | int | The number of rules. |
|
||||
|
||||
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
Check whether the matcher contains rules for a match ID.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> assert "Rule" not in matcher
|
||||
> matcher.add("Rule", [pattern])
|
||||
> assert "Rule" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
|
||||
## DependencyMatcher.add {#add tag="method"}
|
||||
|
||||
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
|
||||
optional callback function to act on the matches. The callback function will
|
||||
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
|
||||
exists for the given ID, the patterns will be extended. An `on_match` callback
|
||||
will be overwritten.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> def on_match(matcher, doc, id, matches):
|
||||
> print('Matched!', matches)
|
||||
>
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> matcher.add("TEST_PATTERNS", patterns)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a node. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
|
||||
## DependencyMatcher.remove {#remove tag="method"}
|
||||
|
||||
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
|
||||
exist.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", [pattern])
|
||||
> assert "Rule" in matcher
|
||||
> matcher.remove("Rule")
|
||||
> assert "Rule" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
|
||||
## DependencyMatcher.get {#get tag="method"}
|
||||
|
||||
Retrieve the pattern stored for a key. Returns the rule as an
|
||||
`(on_match, patterns)` tuple containing the callback and available patterns.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> matcher.add("Rule", [pattern], on_match=on_match)
|
||||
> on_match, patterns = matcher.get("Rule")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
|
||||
|
|
|
@ -8,6 +8,23 @@ api_string_name: parser
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
A transition-based dependency parser component. The dependency parser jointly
|
||||
learns sentence segmentation and labelled dependency parsing, and can optionally
|
||||
learn to merge tokens that had been over-segmented by the tokenizer. The parser
|
||||
uses a variant of the **non-monotonic arc-eager transition-system** described by
|
||||
[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
|
||||
the addition of a "break" transition to perform the sentence segmentation.
|
||||
[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
|
||||
dependency transformation** is used to allow the parser to predict
|
||||
non-projective parses.
|
||||
|
||||
The parser is trained using an **imitation learning objective**. It follows the
|
||||
actions predicted by the current weights, and at each state, determines which
|
||||
actions are compatible with the optimal parse that could be reached from the
|
||||
current state. The weights are updated such that the scores assigned to the set of optimal
|
||||
actions are increased, while scores assigned to other actions are decreased. Note
|
||||
that more than one action may be optimal for a given state.
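To make the objective a bit more concrete, here is a toy sketch of the loss gradient for a single parser state. It is purely illustrative, not spaCy's internal implementation; the function name and the uniform target over optimal actions are assumptions.

```python
import numpy

def imitation_step(scores, is_optimal):
    """Toy gradient for one parser state. `scores` are the model's raw action
    scores, `is_optimal` marks the actions compatible with the best parse
    reachable from this state (more than one may be optimal)."""
    probs = numpy.exp(scores - scores.max())
    probs /= probs.sum()                    # softmax over the actions
    target = is_optimal / is_optimal.sum()  # uniform over the optimal actions
    return probs - target                   # cross-entropy gradient w.r.t. scores
```

Following the negative of this gradient increases the scores of all optimal actions relative to the rest.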
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
|
@ -23,17 +40,20 @@ architectures and their arguments and hyperparameters.
|
|||
> from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
> config = {
|
||||
> "moves": None,
|
||||
> # TODO: rest
|
||||
> "update_with_oracle_cut_size": 100,
|
||||
> "learn_tokens": False,
|
||||
> "min_action_freq": 30,
|
||||
> "model": DEFAULT_PARSER_MODEL,
|
||||
> }
|
||||
> nlp.add_pipe("parser", config=config)
|
||||
> ```
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||
| `moves` | list | | `None` |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
|
||||
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` |
|
||||
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||
|
||||
```python
|
||||
|
@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `moves` | list | |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
|
||||
| _keyword-only_ | | |
|
||||
| `update_with_oracle_cut_size` | int | |
|
||||
| `multitasks` | `Iterable` | |
|
||||
| `learn_tokens` | bool | |
|
||||
| `min_action_freq` | int | |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
|
||||
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
|
||||
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. |
|
||||
|
||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -8,6 +8,18 @@ api_string_name: ner
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
A transition-based named entity recognition component. The entity recognizer
|
||||
identifies **non-overlapping labelled spans** of tokens. The transition-based
|
||||
algorithm used encodes certain assumptions that are effective for "traditional"
|
||||
named entity recognition tasks, but may not be a good fit for every span
|
||||
identification problem. Specifically, the loss function optimizes for **whole
|
||||
entity accuracy**, so if your inter-annotator agreement on boundary tokens is
|
||||
low, the component will likely perform poorly on your problem. The
|
||||
transition-based algorithm also assumes that the most decisive information about
|
||||
your entities will be close to their initial tokens. If your entities are long
|
||||
and characterized by tokens in their middle, the component will likely not be a
|
||||
good fit for your task.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
|
@ -23,17 +35,16 @@ architectures and their arguments and hyperparameters.
|
|||
> from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
> config = {
|
||||
> "moves": None,
|
||||
> # TODO: rest
|
||||
> "update_with_oracle_cut_size": 100,
|
||||
> "model": DEFAULT_NER_MODEL,
|
||||
> }
|
||||
> nlp.add_pipe("ner", config=config)
|
||||
> ```
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||
| `moves` | list | | `None` |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||
|
||||
```python
|
||||
|
@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `moves` | list | |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
|
||||
| _keyword-only_ | | |
|
||||
| `update_with_oracle_cut_size` | int | |
|
||||
| `multitasks` | `Iterable` | |
|
||||
| `learn_tokens` | bool | |
|
||||
| `min_action_freq` | int | |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
|
||||
|
||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects.
|
|||
|
||||
Update the models in the pipeline.
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
The `Language.update` method now takes a batch of [`Example`](/api/example)
|
||||
objects instead of the raw texts and annotations or `Doc` and `GoldParse`
|
||||
objects. An [`Example`](/api/example) streamlines how data is passed around. It
|
||||
stores two `Doc` objects: one for holding the gold-standard reference data, and
|
||||
one for holding the predictions of the pipeline.
|
||||
|
||||
For most use cases, you shouldn't have to write your own training scripts
|
||||
anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file
|
||||
and custom registered functions if needed. See the
|
||||
[training documentation](/usage/training) for details.
|
||||
|
||||
</Infobox>
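If you do maintain your own loop, a batch of examples can be created from raw annotations. A minimal sketch follows; the `spacy.gold` import path for this nightly and the entity offsets are assumptions for illustration.

```python
import spacy
from spacy.gold import Example  # import path assumed for this nightly

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("PERSON")

text = "Bill Gates founded Microsoft."
example = Example.from_dict(nlp.make_doc(text), {"entities": [(0, 10, "PERSON")]})

optimizer = nlp.begin_training()
losses = nlp.update([example], sgd=optimizer)
print(losses)
```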
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -253,7 +268,7 @@ Update the models in the pipeline.
|
|||
|
||||
| Name | Type | Description |
|
||||
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
|
|
|
@ -9,6 +9,28 @@ api_string_name: lemmatizer
|
|||
api_trainable: false
|
||||
---
|
||||
|
||||
Component for assigning base forms to tokens using rules based on part-of-speech
|
||||
tags, or lookup tables. Functionality to train the component is coming soon.
|
||||
Different [`Language`](/api/language) subclasses can implement their own
|
||||
lemmatizer components via
|
||||
[language-specific factories](/usage/processing-pipelines#factories-language).
|
||||
The default data used is provided by the
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
|
||||
extension package.
|
||||
|
||||
<Infobox variant="warning" title="New in v3.0">
|
||||
|
||||
As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be
|
||||
added to your pipeline, and not a hidden part of the vocab that runs behind the
|
||||
scenes. This makes it easier to customize how lemmas should be assigned in your
|
||||
pipeline.
|
||||
|
||||
If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to
|
||||
be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning
|
||||
tags is available in the pipeline and runs _before_ the lemmatizer.
|
||||
|
||||
</Infobox>
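For example, a lookup lemmatizer can be added to a blank pipeline. This is a short sketch, assuming the [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) package is installed so the default tables can be loaded:

```python
import spacy

nlp = spacy.blank("en")
# "lookup" mode only needs the lookup tables, no part-of-speech tags
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
doc = nlp("She was running faster than the others.")
print([token.lemma_ for token in doc])
```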
|
||||
|
||||
## Config and implementation
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
|
@ -29,7 +51,7 @@ lemmatizers, see the
|
|||
|
||||
| Setting | Type | Description | Default |
|
||||
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
|
||||
| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` |
|
||||
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
|
||||
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
|
||||
|
@ -56,13 +78,13 @@ shortcut for this and instantiate the component using its string name and
|
|||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". |
|
||||
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. |
|
||||
| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
|
||||
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
|
||||
| overwrite | bool | Whether to overwrite existing lemmas. |
|
||||
|
||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
|
|
@ -5,6 +5,82 @@ tag: class
|
|||
source: spacy/matcher/matcher.pyx
|
||||
---
|
||||
|
||||
The `Matcher` lets you find words and phrases using rules describing their token
|
||||
attributes. Rules can refer to token annotations (like the text or
|
||||
part-of-speech tags), as well as lexical attributes like `Token.is_punct`.
|
||||
Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched
|
||||
tokens in context. For in-depth examples and workflows for combining rules and
|
||||
statistical models, see the [usage guide](/usage/rule-based-matching) on
|
||||
rule-based matching.
|
||||
|
||||
## Pattern format {#patterns}
|
||||
|
||||
> ```json
|
||||
> ### Example
|
||||
> [
|
||||
> {"LOWER": "i"},
|
||||
> {"LEMMA": {"IN": ["like", "love"]}},
|
||||
> {"POS": "NOUN", "OP": "+"}
|
||||
> ]
|
||||
> ```
|
||||
|
||||
A pattern added to the `Matcher` consists of a list of dictionaries. Each
|
||||
dictionary describes **one token** and its attributes. The available token
|
||||
pattern keys correspond to a number of
|
||||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||
rule-based matching are:
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `ORTH` | str | The exact verbatim text of a token. |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
|
||||
| `LOWER` | str | The lowercase form of the token text. |
|
||||
| `LENGTH` | int | The length of the token text. |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||
| `ENT_TYPE` | str | The token's entity label. |
|
||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
|
||||
|
||||
Operators and quantifiers define **how often** a token pattern should be
|
||||
matched:
|
||||
|
||||
> ```json
|
||||
> ### Example
|
||||
> [
|
||||
> {"POS": "ADJ", "OP": "*"},
|
||||
> {"POS": "NOUN", "OP": "+"}
|
||||
> ]
|
||||
> ```
|
||||
|
||||
| OP | Description |
|
||||
| --- | ---------------------------------------------------------------- |
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match zero or more times. |
|
||||
|
||||
Token patterns can also map to a **dictionary of properties** instead of a
|
||||
single value to indicate whether the expected value is a member of a list or how
|
||||
it compares to another value.
|
||||
|
||||
> ```json
|
||||
> ### Example
|
||||
> [
|
||||
> {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
|
||||
>   {"POS": "PROPN", "LENGTH": {">=": 10}}
|
||||
> ]
|
||||
> ```
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
|
||||
| `IN` | any | Attribute value is member of a list. |
|
||||
| `NOT_IN` | any | Attribute value is _not_ member of a list. |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
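Putting the pieces above together, a minimal end-to-end sketch could look like this. It assumes a pipeline that assigns lemmas and part-of-speech tags, such as the `en_core_web_sm` package, is installed:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"LEMMA": {"IN": ["like", "love", "enjoy"]}},
    {"POS": "NOUN", "OP": "+"},
]
matcher.add("LIKED_THINGS", [pattern])
doc = nlp("I love colorful socks and I like big hats.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```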
|
||||
|
||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create the rule-based `Matcher`. If `validate=True` is set, all patterns added
|
||||
|
@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn.
|
|||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | iterable | A stream of documents. |
|
||||
| `docs` | iterable | A stream of documents or spans. |
|
||||
| `batch_size` | int | The number of documents to accumulate into a working set. |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
|
||||
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
|
||||
|
@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID.
|
|||
|
||||
## Matcher.add {#add tag="method" new="2"}
|
||||
|
||||
Add a rule to the matcher, consisting of an ID key, one or more patterns, and a
|
||||
callback function to act on the matches. The callback function will receive the
|
||||
arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for
|
||||
the given ID, the patterns will be extended. An `on_match` callback will be
|
||||
overwritten.
|
||||
Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
|
||||
optional callback function to act on the matches. The callback function will
|
||||
receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
|
||||
exists for the given ID, the patterns will be extended. An `on_match` callback
|
||||
will be overwritten.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -142,11 +218,12 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
|||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
|
||||
|
||||
## Matcher.remove {#remove tag="method" new="2"}
|
||||
|
||||
|
|
|
@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `labels_morph` | dict | |
|
||||
| `labels_pos` | dict | |
|
||||
| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
|
||||
| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
|
||||
|
||||
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -9,7 +9,8 @@ new: 2
|
|||
The `PhraseMatcher` lets you efficiently match large terminology lists. While
|
||||
the [`Matcher`](/api/matcher) lets you match sequences based on lists of token
|
||||
descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc`
|
||||
objects.
|
||||
objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for
|
||||
examples.
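For example, a small terminology list can be matched like this (a short sketch using a blank English pipeline):

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
# Patterns are Doc objects; nlp.make_doc only runs the tokenizer, which is fast.
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("NAMES", patterns)
doc = nlp("German Chancellor Angela Merkel met Barack Obama.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```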
|
||||
|
||||
## PhraseMatcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
|
|
@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters.
|
|||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- |
|
||||
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
|
||||
| `set_morphology` | bool | Whether to set morphological features. | `False` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
|
||||
|
@ -59,9 +59,9 @@ shortcut for this and instantiate the component using its string name and
|
|||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `set_morphology` | bool | Whether to set morphological features. |
|
||||
|
|
|
@ -9,6 +9,12 @@ api_string_name: textcat
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
The text categorizer predicts **categories over a whole document**. It can learn
|
||||
one or more labels, and the labels can be mutually exclusive (i.e. one true
|
||||
label per document) or non-mutually exclusive (i.e. zero or more labels may be
|
||||
true per document). The multi-label setting is controlled by the model instance
|
||||
that's provided.
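For example, the component can be added to a blank pipeline and given its labels before training (a minimal sketch with made-up label names):

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
```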
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
The default config is defined by the pipeline component factory and describes
|
||||
|
@ -30,9 +36,9 @@ architectures and their arguments and hyperparameters.
|
|||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- |
|
||||
| `labels` | `Iterable[str]` | The labels to use. | `[]` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
|
||||
| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- |
|
||||
| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
|
||||
|
@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and
|
|||
| _keyword-only_ | | |
|
||||
| `labels` | `Iterable[str]` | The labels to use. |
|
||||
|
||||
<!-- TODO move to config page
|
||||
### Architectures {#architectures new="2.1"}
|
||||
|
||||
Text classification models can be used to solve a wide variety of problems.
|
||||
Differences in text length, number of labels, difficulty, and runtime
|
||||
performance constraints mean that no single algorithm performs well on all types
|
||||
of problems. To handle a wider variety of problems, the `TextCategorizer` object
|
||||
allows configuration of its model architecture, using the `architecture` keyword
|
||||
argument.
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `"ensemble"` | **Default:** Stacked ensemble of a bag-of-words model and a neural network model. The neural network uses a CNN with mean pooling and attention. The "ngram_size" and "attr" arguments can be used to configure the feature extraction for the bag-of-words model. |
|
||||
| `"simple_cnn"` | A neural network model where token vectors are calculated using a CNN. The vectors are mean pooled and used as features in a feed-forward network. This architecture is usually less accurate than the ensemble, but runs faster. |
|
||||
| `"bow"` | An ngram "bag-of-words" model. This architecture should run much faster than the others, but may not be as accurate, especially if texts are short. The features extracted can be controlled using the keyword arguments `ngram_size` and `attr`. For instance, `ngram_size=3` and `attr="lower"` would give lower-cased unigram, trigram and bigram features. 2, 3 or 4 are usually good choices of ngram size. |
|
||||
-->
|
||||
|
||||
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
|
|
|
@ -8,7 +8,20 @@ api_string_name: tok2vec
|
|||
api_trainable: true
|
||||
---
|
||||
|
||||
<!-- TODO: intro describing component -->
|
||||
Apply a "token-to-vector" model and set its outputs in the `Doc.tensor`
|
||||
attribute. This is mostly useful to **share a single subnetwork** between
|
||||
multiple components, e.g. to have one embedding and CNN network shared between a
|
||||
[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and
|
||||
[`EntityRecognizer`](/api/entityrecognizer).
|
||||
|
||||
In order to use the `Tok2Vec` predictions, subsequent components should use the
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec
|
||||
subnetwork of their model. This layer will read data from the `doc.tensor`
|
||||
attribute during prediction. During training, the `Tok2Vec` component will save
|
||||
its prediction and backprop callback for each batch, so that the subsequent
|
||||
components can backpropagate to the shared weights. This implementation is used
|
||||
because it allows us to avoid relying on object identity within the models to
|
||||
achieve the parameter sharing.
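As an illustration, a tagger could be wired to a shared `tok2vec` component roughly as in the following config sketch. This is a hedged example rather than a complete, validated config: the `width` value is a placeholder and the remaining model settings are omitted.

```ini
[components.tok2vec]
factory = "tok2vec"

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = 96
```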
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
||||
|
@ -28,8 +41,8 @@ architectures and their arguments and hyperparameters.
|
|||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
|
||||
| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
|
||||
|
@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and
|
|||
|
||||
## Tok2Vec.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order. Both
|
||||
Apply the pipe to one document and add context-sensitive embeddings to the
|
||||
`Doc.tensor` attribute, allowing them to be used as features by downstream
|
||||
components. The document is modified in place, and returned. This usually
|
||||
happens under the hood when the `nlp` object is called on a text and all
|
||||
pipeline components are applied to the `Doc` in order. Both
|
||||
[`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the
|
||||
[`predict`](/api/tok2vec#predict) and
|
||||
[`set_annotations`](/api/tok2vec#set_annotations) methods.
|
||||
|
|
|
@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and
|
|||
|
||||
## Batchers {#batchers source="spacy/gold/batchers.py" new="3"}
|
||||
|
||||
<!-- TODO: intro and also describe signature of functions -->
|
||||
<!-- TODO: intro -->
|
||||
|
||||
#### batch_by_words.v1 {#batch_by_words tag="registered function"}
|
||||
|
||||
|
@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
> get_length = null
|
||||
> ```
|
||||
|
||||
<!-- TODO: complete table -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `tolerance` | float | |
|
||||
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `seqs` | `Iterable[Any]` | The sequences to minibatch. |
|
||||
| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `tolerance` | float | What percentage of the size to allow batches to exceed. |
|
||||
| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
|
||||
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
|
@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
> get_length = null
|
||||
> ```
|
||||
|
||||
<!-- TODO: complete table -->
|
||||
Create a batcher that creates batches of the specified size.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
|
||||
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
|
||||
|
||||
<!-- TODO: -->
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [training.batcher]
|
||||
> @batchers = "batch_by_words.v1"
|
||||
> @batchers = "batch_by_padded.v1"
|
||||
> size = 100
|
||||
> buffer = TODO:
|
||||
> buffer = 256
|
||||
> discard_oversize = false
|
||||
> get_length = null
|
||||
> ```
|
||||
|
||||
Minibatch a sequence by the size of padded batches that would result, with
|
||||
sequences binned by length within a window. The padded size is defined as the
|
||||
maximum length of sequences within the batch multiplied by the number of
|
||||
sequences in the batch.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `buffer` | int | |
|
||||
| `discard_oversize` | bool | Discard items that are longer than the specified batch length. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. |
|
||||
| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
|
||||
## Training data and alignment {#gold source="spacy/gold"}
|
||||
|
||||
|
|
|
@ -25,8 +25,15 @@ work out-of-the-box.
|
|||
|
||||
</Infobox>
|
||||
|
||||
This pipeline component lets you use transformer models in your pipeline. The
|
||||
component assigns the output of the transformer to the Doc's extension
|
||||
This pipeline component lets you use transformer models in your pipeline.
|
||||
It supports all models that are available via the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers) library.
|
||||
Usually you will connect subsequent components to the shared transformer using
|
||||
the [TransformerListener](/api/architectures#TransformerListener) layer. This
|
||||
works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and
|
||||
[Tok2VecListener](/api/architectures#Tok2VecListener) sublayer.
|
||||
|
||||
The component assigns the output of the transformer to the `Doc`'s extension
|
||||
attributes. We also calculate an alignment between the word-piece tokens and the
|
||||
spaCy tokenization, so that we can use the last hidden states to set the
|
||||
`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
|
||||
|
@ -54,10 +61,10 @@ architectures and their arguments and hyperparameters.
|
|||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
|
||||
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
|
||||
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
|
||||
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) |
|
||||
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
|
||||
|
@ -86,15 +93,19 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p
|
|||
> trf = Transformer(nlp.vocab, model)
|
||||
> ```
|
||||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#create_pipe).
|
||||
Construct a `Transformer` component. One or more subsequent spaCy components can
|
||||
use the transformer outputs as features in their models, with gradients
|
||||
backpropagated to the single shared weights. The activations from the
|
||||
transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension
|
||||
attribute. You can also provide a callback to set additional annotations. In
|
||||
your application, you would normally use a shortcut for this and instantiate the
|
||||
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. |
|
||||
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no additional annotations are set. |
|
||||
| _keyword-only_ | | |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
|
||||

## Transformer.set_annotations {#set_annotations tag="method"}

Assign the extracted features to the `Doc` objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `annotation_setter`
callback is then called, if provided.
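
As a rough illustration of the `annotation_setter` hook, the callback below
copies the per-document transformer output into a custom extension attribute.
This is only a sketch: the extension name `custom_attr` is made up, and it
assumes the `FullTransformerBatch` exposes its per-`Doc` split as `doc_data`.

```python
from spacy.tokens import Doc

Doc.set_extension("custom_attr", default=None)

def custom_annotation_setter(docs, trf_data):
    # trf_data is the FullTransformerBatch for the whole batch of docs;
    # doc_data is assumed to yield one TransformerData per Doc
    for doc, data in zip(docs, trf_data.doc_data):
        doc._.custom_attr = data
```
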

## Transformer.update {#update tag="method"}

Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec)
component, the `Transformer` component is unusual in that it does not receive
"gold standard" annotations to calculate a weight update. The optimal output of
the transformer data is unknown: it's a hidden layer inside the network that is
updated by backpropagating from output layers.

The `Transformer` component therefore does **not** perform a weight update
during its own `update` method. Instead, it runs its transformer model and
communicates the output and the backpropagation callback to any **downstream
components** that have been connected to it via the
[TransformerListener](/api/architectures#TransformerListener) sublayer. If there
are multiple listeners, the last layer will actually backprop to the transformer
and call the optimizer, while the others simply increment the gradients.

| Name | Type | Description |
| ----------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used; the reference `Doc` is ignored. |
| _keyword-only_ | | |
| `drop` | float | The dropout rate. |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
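
A minimal usage sketch of the call pattern described above, assuming a pipeline
that contains a component named `"transformer"` and that `examples` and
`optimizer` have been prepared elsewhere; `sgd` and `losses` are the standard
arguments shared by trainable pipeline components:

```python
trf = nlp.get_pipe("transformer")
losses = {}
# Runs the transformer and hands the output plus backprop callback to any
# connected listener components; no weight update happens here by itself.
trf.update(examples, drop=0.1, sgd=optimizer, losses=losses)
```
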

Split a `TransformerData` object that represents a batch into a list with one
`TransformerData` per `Doc`.

| Name | Type | Description |
| ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | |

## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}

Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a list of [`Span`](/api/span) objects for each doc, to be processed by
the transformer. This is used to manage long documents, by cutting them into
smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the `Doc` if they are not relevant.

Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You
can also register custom span getters using the `@spacy.registry.span_getters`
decorator.

> #### Example
>
> ```python
> @spacy.registry.span_getters("sent_spans.v1")
> def configure_get_sent_spans() -> Callable:
>     def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]:
>         return [list(doc.sents) for doc in docs]
>
>     return get_sent_spans
> ```

| Name | Type | Description |
| ----------- | ------------------ | ---------------------------------------------- |
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
| **RETURNS** | `List[List[Span]]` | The spans to be processed by the transformer. |

The following built-in functions are available:

### doc_spans.v1 {#doc_spans tag="registered function"}

> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "doc_spans.v1"
> ```

Create a span getter that uses the whole document as its spans. This is the best
approach if your [`Doc`](/api/doc) objects already refer to relatively short
texts.

### sent_spans.v1 {#sent_spans tag="registered function"}

> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "sent_spans.v1"
> ```

Create a span getter that uses sentence boundary markers to extract the spans.
This requires sentence boundaries to be set (e.g. by the
[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches,
depending on the sentence lengths. However, it does provide the transformer with
more meaningful windows to attend over.

### strided_spans.v1 {#strided_spans tag="registered function"}

> #### Example config
>
> ```ini
> [transformer.model.get_spans]
> @span_getters = "strided_spans.v1"
> window = 128
> stride = 96
> ```

Create a span getter for strided spans. If you set the `window` and `stride` to
the same value, the spans will cover each token once. Setting `stride` lower
than `window` will allow for an overlap, so that some tokens are counted twice.
This can be desirable, because it allows all tokens to have both a left and
right context.

| Name | Type | Description |
| -------- | ---- | ---------------- |
| `window` | int | The window size. |
| `stride` | int | The stride size. |
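
To make the `window`/`stride` interaction concrete, here is a small standalone
sketch of how strided spans are produced. This is an illustration only, not the
registered implementation:

```python
def illustrate_strided_spans(doc, window=128, stride=96):
    # Slices of the Doc of length `window`, starting every `stride` tokens.
    # With stride < window, consecutive spans overlap by (window - stride) tokens.
    spans = []
    start = 0
    while start < len(doc):
        spans.append(doc[start : start + window])
        if start + window >= len(doc):
            break
        start += stride
    return spans
```
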
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

The central data structures in spaCy are the [`Language`](/api/language) class,
the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class
is used to process a text and turn it into a `Doc` object. It's typically stored
as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and
all their annotations. By centralizing strings, word vectors and lexical
attributes in the `Vocab`, we avoid storing multiple copies of this data. This
saves memory, and ensures there's a **single source of truth**.

Text annotations are also designed to allow a single source of truth: the `Doc`
object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are
**views that point into it**. The `Doc` object is constructed by the
[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components
of the pipeline. The `Language` object coordinates these components. It takes
raw text and sends it through the pipeline, returning an **annotated document**.
It also orchestrates training and serialization.

<!-- TODO: update graphic -->


### Container objects {#architecture-containers}

| Name | Description |
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. |
| [`Doc`](/api/doc) | A container for accessing linguistic annotations. |
| [`Span`](/api/span) | A slice from a `Doc` object. |
| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. |
| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. |
| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). |
### Processing pipeline {#architecture-pipeline}

The processing pipeline consists of one or more **pipeline components** that are
called on the `Doc` in order. The tokenizer runs before the components. Pipeline
components can be added using [`Language.add_pipe`](/api/language#add_pipe).
They can contain a statistical model and trained weights, or only make
rule-based modifications to the `Doc`. spaCy provides a range of built-in
components for different language processing tasks and also allows adding
[custom components](/usage/processing-pipelines#custom-components).



| Name | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------- |
| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. |
| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. |
| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. |
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. |
| [`Tagger`](/api/tagger) | Predict part-of-speech tags. |
| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. |
| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. |
| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. |
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. |
| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. |
| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. |
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. |
### Matchers {#architecture-matchers}

Matchers help you find and extract information from [`Doc`](/api/doc) objects
based on match patterns describing the sequences you're looking for. A matcher
operates on a `Doc` and gives you access to the matched tokens **in context**.

| Name | Description |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |
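
As a quick, hedged illustration of the rule-based matching described above (the
pattern and the match key `"GREETING"` are made up for this example):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# Match the token "hello" (case-insensitive) followed by a punctuation token
matcher.add("GREETING", [[{"LOWER": "hello"}, {"IS_PUNCT": True}]])

doc = nlp("Hello, world!")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)  # "Hello,"
```
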
### Other classes {#architecture-other}

| Name | Description |
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |

The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
remaining substring. The special case rules also have precedence over the
punctuation splitting.

```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
```

```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```

6. An optional boolean function `url_match`, which is similar to `token_match`
   except that prefixes and suffixes are removed before applying the match.
<Infobox title="Important note: token match in spaCy v2.2" variant="warning">
|
||||
|
||||
In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match`
|
||||
above and there was no match pattern applied before prefixes and suffixes were
|
||||
analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its
|
||||
behavior in v2.2.1 and earlier with precedence over prefixes and suffixes.
|
||||
|
||||
The `url_match` is introduced in v2.3.0 to handle cases like URLs where the
|
||||
tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a
|
||||
URL) before applying the match.
|
||||
|
||||
</Infobox>
|
||||
|
||||

You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is
to use `re.compile()` to build a regular expression object, and pass its
`.search()` and `.finditer()` methods:
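
A hedged sketch of that pattern: compile your own prefix, suffix and infix
expressions and pass their bound methods to the [`Tokenizer`](/api/tokenizer).
The regular expressions below are deliberately tiny placeholders rather than
spaCy's real defaults.

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

special_cases = {":)": [{"ORTH": ":)"}]}
prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"',.!?]$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

def custom_tokenizer(nlp):
    # Build a Tokenizer that uses the compiled expressions' bound methods
    return Tokenizer(nlp.vocab, rules=special_cases,
                     prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     url_match=simple_url_re.match)

nlp = spacy.blank("en")
nlp.tokenizer = custom_tokenizer(nlp)
print([t.text for t in nlp("hello-world. :)")])
```
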
<Infobox title="Important note" variant="warning">
|
||||
|
||||
If you're using a statistical model, writing to the `nlp.Defaults` or
|
||||
`English.Defaults` directly won't work, since the regular expressions are read
|
||||
from the model and will be compiled when you load it. If you modify
|
||||
`nlp.Defaults`, you'll only see the effect if you call
|
||||
[`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the tokenizer
|
||||
loaded from a statistical model, you should modify `nlp.tokenizer` directly.
|
||||
If you're using a statistical model, writing to the
|
||||
[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't
|
||||
work, since the regular expressions are read from the model and will be compiled
|
||||
when you load it. If you modify `nlp.Defaults`, you'll only see the effect if
|
||||
you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the
|
||||
tokenizer loaded from a statistical model, you should modify `nlp.tokenizer`
|
||||
directly.
|
||||
|
||||
</Infobox>
|
||||
|
||||

and language-specific definitions such as
[`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py)
for German.

### Hooking a custom tokenizer into the pipeline {#custom-tokenizer}

The tokenizer is the first component of the processing pipeline and the only one
that can't be replaced by writing to `nlp.pipeline`. This is because it has a
different signature from all the other components: it takes a text and returns a
[`Doc`](/api/doc), whereas all other components expect to already receive a
tokenized `Doc`.


To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a
custom function that takes a text, and returns a [`Doc`](/api/doc).

> #### Creating a Doc
>
> Constructing a [`Doc`](/api/doc) object manually requires at least two
> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in
> a list of `spaces` values indicating whether the token at this position is
> followed by a space (default `True`). See the section on
> [pre-tokenized text](#own-annotations) for more info.
>
> ```python
> words = ["Let", "'s", "go", "!"]
> spaces = [False, True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ```

```python
nlp = spacy.blank("en")
nlp.tokenizer = my_tokenizer
```

| Argument | Type | Description |
| ----------- | ----------------- | ------------------------- |
| `text` | str | The raw text to tokenize. |
| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
<Infobox title="Important note: using a custom tokenizer" variant="warning">
|
||||
#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}
|
||||
|
||||
In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc`
|
||||
keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This
|
||||
was unnecessarily complicated. Since spaCy v2.0, you can write to
|
||||
`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a
|
||||
function and use `nlp.vocab`.
|
||||
|
||||
```diff
|
||||
- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer)
|
||||
- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory)
|
||||
|
||||
+ nlp.tokenizer = my_tokenizer
|
||||
+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
### Example: A custom whitespace tokenizer {#custom-tokenizer-example}
|
||||
|
||||
To construct the tokenizer, we usually want attributes of the `nlp` pipeline.
|
||||
Specifically, we want the tokenizer to hold a reference to the vocabulary
|
||||
object. Let's say we have the following class as our tokenizer:
|
||||
Here's an example of the most basic whitespace tokenizer. It takes the shared
|
||||
vocab, so it can construct `Doc` objects. When it's called on a text, it returns
|
||||
a `Doc` object consisting of the text split on single space characters. We can
|
||||
then overwrite the `nlp.tokenizer` attribute with an instance of our custom
|
||||
tokenizer.
|
||||
|
||||

```python
### {executable="true"}
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Without an explicit spaces list, each word defaults to a trailing space
        return Doc(self.vocab, words=words)

nlp = spacy.blank("en")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought. It wasn't a dream.")
print([token.text for token in doc])
```

#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2}

You can use the same approach to plug in any other third-party tokenizers. Your
custom callable just needs to return a `Doc` object with the tokens produced by
your tokenizer. In this example, the wrapper uses the **BERT word piece
tokenizer**, provided by the
[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens
available in the `Doc` object returned by spaCy now match the exact word pieces
produced by the tokenizer.

> #### 💡 Tip: spacy-transformers
>
> If you're working with transformer models like BERT, check out the
> [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
> extension package and [documentation](/usage/transformers). It includes a
> pipeline component for using pretrained transformer weights and **training
> transformer models** in spaCy, as well as helpful utilities for aligning word
> pieces to linguistic tokenization.

```python
### Custom BERT word piece tokenizer
from tokenizers import BertWordPieceTokenizer
from spacy.tokens import Doc
import spacy

class BertTokenizer:
    def __init__(self, vocab, vocab_file, lowercase=True):
        self.vocab = vocab
        self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)

    def __call__(self, text):
        tokens = self._tokenizer.encode(text)
        words = []
        spaces = []
        for i, (text, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
            words.append(text)
            if i < len(tokens.tokens) - 1:
                # If next start != current end we assume a space in between
                next_start, next_end = tokens.offsets[i + 1]
                spaces.append(next_start > end)
            else:
                spaces.append(True)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank("en")
nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
print(doc.text, [token.text for token in doc])
# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
#  ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
```

<Infobox title="Important note on tokenization and models" variant="warning">

Keep in mind that your model's result may be less accurate if the tokenization
during training differs from the tokenization at runtime. So if you modify a
pretrained model's tokenization afterwards, it may produce very different
predictions. You should therefore train your model with the **same tokenizer**
it will be using at runtime. See the docs on
[training with custom tokenization](#custom-tokenizer-training) for details.

</Infobox>
#### Training with custom tokenization {#custom-tokenizer-training new="3"}

spaCy's [training config](/usage/training#config) describes the settings,
hyperparameters, pipeline and tokenizer used for constructing and training the
model. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train).

> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "whitespace_tokenizer"
> ```

```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
    def create_tokenizer(nlp):
        return WhitespaceTokenizer(nlp.vocab)

    return create_tokenizer
```

Registered functions can also take arguments that are then passed in from the
config. This allows you to quickly change and keep track of different settings.
Here, the registered function called `bert_word_piece_tokenizer` takes two
arguments: the path to a vocabulary file and whether to lowercase the text. The
Python type hints `str` and `bool` ensure that the received values have the
correct type.

> #### config.cfg
>
> ```ini
> [nlp.tokenizer]
> @tokenizers = "bert_word_piece_tokenizer"
> vocab_file = "bert-base-uncased-vocab.txt"
> lowercase = true
> ```

```python
### functions.py {highlight="1"}
@spacy.registry.tokenizers("bert_word_piece_tokenizer")
def create_bert_word_piece_tokenizer(vocab_file: str, lowercase: bool):
    def create_tokenizer(nlp):
        return BertTokenizer(nlp.vocab, vocab_file, lowercase)

    return create_tokenizer
```

To avoid hard-coding local paths into your config file, you can also set the
vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
[override](/usage/training#config-overrides) when you run
[`spacy train`](/api/cli#train). For more details on using registered functions,
see the docs in [training with custom code](/usage/training#custom-code).

<Infobox variant="warning">

Remember that a registered function should always be a function that spaCy
**calls to create something**, not the "something" itself. In this case, it
**creates a function** that takes the `nlp` object and returns a callable that
takes a text and returns a `Doc`.

</Infobox>
#### Using pre-tokenized text {#own-annotations}

spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
part-of-speech tags, etc. The most common situation is that you have
**pre-defined tokenization**. If you have a list of strings, you can create a
[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
boolean values, indicating whether each word is followed by a space.

> #### ✏️ Things to try
>
> 1. Change a boolean value in the list of `spaces`. You should see it reflected
>    in the `doc.text` and whether the token is followed by a space.
> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
>    now followed by a space.
> 3. Copy-paste a random sentence from the internet and manually construct a
>    `Doc` with `words` and `spaces` so that the `doc.text` matches the original
>    input text.

```python
### {executable="true"}
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```

If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
will assume that all words are followed by a space.

```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English

nlp = English()
bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"])
good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
                  spaces=[False, True, False, False])

print(bad_spaces.text)   # 'Hello , world !'
print(good_spaces.text)  # 'Hello, world!'
```

Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set
the part-of-speech tags, syntactic dependencies, named entities and other
attributes. For details, see the respective usage pages.
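
For instance, a minimal sketch of writing annotations onto a manually
constructed `Doc` (the values assigned here are purely illustrative):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["Hello", "world"], spaces=[True, False])
# Assign coarse-grained part-of-speech tags directly to the tokens
doc[0].pos_ = "INTJ"
doc[1].pos_ = "NOUN"
print([(token.text, token.pos_) for token in doc])
```
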

#### Aligning tokenization {#aligning-tokenization}

spaCy's tokenization is non-destructive and uses language-specific rules
optimized for compatibility with treebank annotations. Other tools and resources

added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is
called on a text, it will find matches in the `doc` and add them as entities to
`doc.ents`, using the specified pattern label as the entity label. If any
matches were to overlap, the pattern matching most tokens takes priority. If
they also happen to be equally long, then the match occurring first in the `Doc`
is chosen.

menu:
  - ['New Features', 'features']
  - ['Backwards Incompatibilities', 'incompat']
  - ['Migrating from v2.x', 'migrating']
  - ['Migrating plugins', 'plugins']
---

## Summary {#summary}

## New Features {#features}

### New training workflow and config system {#features-training}

### Transformer-based pipelines {#features-transformers}

### Custom models using any framework {#features-custom-models}

### Manage end-to-end workflows with projects {#features-projects}

### New built-in pipeline components {#features-pipeline-components}

| Name | Description |
| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. |
| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. |
| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. |
| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. |
| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |

### New and improved pipeline component APIs {#features-components}

- `Language.factory` and `Language.component` for registering components (see
  the sketch below)
- `Language.analyze_pipes`
- Adding components from other models
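
As a rough sketch of the decorator-based registration listed above (the
component name `custom_cleaner` is made up for this illustration):

```python
import spacy
from spacy.language import Language

@Language.component("custom_cleaner")
def custom_cleaner(doc):
    # A stateless, rule-based component: receives and returns the Doc
    return doc

nlp = spacy.blank("en")
# Components are now added by their registered string name
nlp.add_pipe("custom_cleaner", last=True)
print(nlp.pipe_names)  # ['custom_cleaner']
```
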

### Type hints and type-based data validation {#features-types}

spaCy v3.0 officially drops support for Python 2 and now requires **Python
3.6+**. This also means that the code base can take full advantage of
[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
API that's implemented in pure Python (as opposed to Cython) now comes with type
hints. The new version of spaCy's machine learning library
[Thinc](https://thinc.ai) also features extensive
[type support](https://thinc.ai/docs/usage-type-checking/), including custom
types for models and arrays, and a custom `mypy` plugin that can be used to
type-check model definitions.

For data validation, spaCy v3.0 adopts
[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
lets you register **custom functions with typed arguments**, reference them
in your config and see validation errors if the argument values don't match.
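
For example, a registered function with typed arguments might look like the
hedged sketch below. The registry name `"my_examples.v1"` and the use of the
`misc` registry are purely illustrative; when the function is referenced from a
config block, the supplied values are validated against the type hints.

```python
from typing import List
import spacy

@spacy.registry.misc("my_examples.v1")
def make_label_filter(labels: List[str]):
    def filter_labels(candidates: List[str]) -> List[str]:
        # Keep only the labels this function was configured with
        return [label for label in candidates if label in labels]

    return filter_labels
```
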

### CLI

| Name | Description |
| --------------------------------------- | -------------------------------------------------------------------------------------------------------- |
| [`init config`](/api/cli#init-config) | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config. |
| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors. |
| [`project`](/api/cli#project) | Subcommand for cloning and running [spaCy projects](/usage/projects). |

## Backwards Incompatibilities {#incompat}

As always, we've tried to keep the breaking changes to a minimum and focus on
changes that were necessary to support the new features, fix problems or improve
usability. The following section lists the relevant changes to the user-facing
API. For specific examples of how to rewrite your code, check out the
[migration guide](#migrating).

### Compatibility {#incompat-compat}

- spaCy now requires **Python 3.6+**.

### API changes {#incompat-api}

- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
  the component factory instead of the component function.
- **Custom pipeline components** now need to be decorated with the
  [`@Language.component`](/api/language#component) or
  [`@Language.factory`](/api/language#factory) decorator.
- [`Language.update`](/api/language#update) now takes a batch of
  [`Example`](/api/example) objects instead of raw texts and annotations, or
  `Doc` and `GoldParse` objects.
- The `Language.disable_pipes` contextmanager has been replaced by
  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
  disable or enable components (see the sketch below).
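
A small hedged sketch of the `select_pipes` call mentioned above, assuming an
`nlp` pipeline that actually contains components named "parser" and "ner":

```python
# Temporarily run the pipeline without the parser and the entity recognizer
with nlp.select_pipes(disable=["parser", "ner"]):
    doc = nlp("Only the remaining components are applied here.")
```
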

### Removed or renamed API {#incompat-removed}

| Removed | Replacement |
| -------------------------------------------------------- | ----------------------------------------------------- |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated |

### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}

The following deprecated methods, attributes and arguments were removed in v3.0.
Most of them have been **deprecated for a while** and many would previously
raise errors. Many of them were also mostly internals. If you've been working

```diff
python -m spacy package ./model ./packages
- python setup.py sdist
```

#### Migration notes for plugin maintainers {#migrating-plugins}

Thanks to everyone who's been contributing to the spaCy ecosystem by developing
and maintaining one of the many awesome [plugins and extensions](/universe).
We've tried to make it as easy as possible for you to upgrade your packages for
spaCy v3. The most common use case for plugins is providing pipeline components
and extension attributes. When migrating your plugin, double-check the
following:

- Use the [`@Language.factory`](/api/language#factory) decorator to register
  your component and assign it a name. This allows users to refer to your

import Link from './link'
import GitHubCode from './github'
import classes from '../styles/code.module.sass'

const WRAP_THRESHOLD = 16

export default props => (
  <Pre>