diff --git a/spacy/about.py b/spacy/about.py index 03de62539..eb4d2128c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a5" +__version__ = "3.0.0a6" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7202ccacf..ce0eb27a0 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -35,7 +35,7 @@ def pretrain_cli( config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False), code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"), - epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."), + epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."), use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), # fmt: on ): diff --git a/spacy/gold/batchers.py b/spacy/gold/batchers.py index 57c6b4b3a..ec1f35815 100644 --- a/spacy/gold/batchers.py +++ b/spacy/gold/batchers.py @@ -1,4 +1,4 @@ -from typing import Union, Iterator, Iterable, Sequence, TypeVar, List, Callable +from typing import Union, Iterable, Sequence, TypeVar, List, Callable from typing import Optional, Any from functools import partial import itertools @@ -19,6 +19,22 @@ def configure_minibatch_by_padded_size( discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that uses the `batch_by_padded_size` strategy. + + The padded size is defined as the maximum length of sequences within the + batch multiplied by the number of sequences in the batch. + + size (int or Iterable[int]): The largest padded size to batch sequences into. + Can be a single integer, or a sequence, allowing for variable batch sizes. + buffer (int): The number of sequences to accumulate before sorting by length. + A larger buffer will result in more even sizing, but if the buffer is + very large, the iteration order will be less random, which can result + in suboptimal training. + discard_oversize (bool): Whether to discard sequences that are by themselves + longer than the largest padded batch size. + get_length (Callable or None): Function to get the length of a sequence item. + The `len` function is used by default. + """ # Avoid displacing optional values from the underlying function. optionals = {"get_length": get_length} if get_length is not None else {} return partial( @@ -38,6 +54,16 @@ def configure_minibatch_by_words( discard_oversize: bool, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that uses the "minibatch by words" strategy. + + size (int or Iterable[int]): The target number of words per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + tolerance (float): What percentage of the size to allow batches to exceed. 
+ discard_oversize (bool): Whether to discard sequences that by themselves + exceed the tolerated size. + get_length (Callable or None): Function to get the length of a sequence + item. The `len` function is used by default. + """ optionals = {"get_length": get_length} if get_length is not None else {} return partial( minibatch_by_words, size=size, discard_oversize=discard_oversize, **optionals @@ -48,22 +74,43 @@ def configure_minibatch_by_words( def configure_minibatch( size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None ) -> BatcherT: + """Create a batcher that creates batches of the specified size. + + size (int or Iterable[int]): The target number of items per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + """ optionals = {"get_length": get_length} if get_length is not None else {} return partial(minibatch, size=size, **optionals) def minibatch_by_padded_size( - docs: Iterator["Doc"], + seqs: Iterable[ItemT], size: Sizing, buffer: int = 256, discard_oversize: bool = False, get_length: Callable = len, -) -> Iterator[Iterator["Doc"]]: +) -> Iterable[List[ItemT]]: + """Minibatch a sequence by the size of padded batches that would result, + with sequences binned by length within a window. + + The padded size is defined as the maximum length of sequences within the + batch multiplied by the number of sequences in the batch. + + size (int): The largest padded size to batch sequences into. + buffer (int): The number of sequences to accumulate before sorting by length. + A larger buffer will result in more even sizing, but if the buffer is + very large, the iteration order will be less random, which can result + in suboptimal training. + discard_oversize (bool): Whether to discard sequences that are by themselves + longer than the largest padded batch size. + get_length (Callable or None): Function to get the length of a sequence item. + The `len` function is used by default. + """ if isinstance(size, int): size_ = itertools.repeat(size) else: size_ = size - for outer_batch in minibatch(docs, size=buffer): + for outer_batch in minibatch(seqs, size=buffer): outer_batch = list(outer_batch) target_size = next(size_) for indices in _batch_by_length(outer_batch, target_size, get_length): @@ -76,12 +123,24 @@ def minibatch_by_padded_size( def minibatch_by_words( - docs, size, tolerance=0.2, discard_oversize=False, get_length=len -): + seqs: Iterable[ItemT], + size: Sizing, + tolerance=0.2, + discard_oversize=False, + get_length=len, +) -> Iterable[List[ItemT]]: """Create minibatches of roughly a given number of words. If any examples are longer than the specified batch length, they will appear in a batch by themselves, or be discarded if discard_oversize=True. - The argument 'docs' can be a list of strings, Docs or Examples. + + seqs (Iterable[Sequence]): The sequences to minibatch. + size (int or Iterable[int]): The target number of words per batch. + Can be a single integer, or a sequence, allowing for variable batch sizes. + tolerance (float): What percentage of the size to allow batches to exceed. + discard_oversize (bool): Whether to discard sequences that by themselves + exceed the tolerated size. + get_length (Callable or None): Function to get the length of a sequence + item. The `len` function is used by default. 
""" if isinstance(size, int): size_ = itertools.repeat(size) @@ -95,20 +154,20 @@ def minibatch_by_words( overflow = [] batch_size = 0 overflow_size = 0 - for doc in docs: - n_words = get_length(doc) + for seq in seqs: + n_words = get_length(seq) # if the current example exceeds the maximum batch size, it is returned separately # but only if discard_oversize=False. if n_words > target_size + tol_size: if not discard_oversize: - yield [doc] + yield [seq] # add the example to the current batch if there's no overflow yet and it still fits elif overflow_size == 0 and (batch_size + n_words) <= target_size: - batch.append(doc) + batch.append(seq) batch_size += n_words # add the example to the overflow buffer if it fits in the tolerance margin elif (batch_size + overflow_size + n_words) <= (target_size + tol_size): - overflow.append(doc) + overflow.append(seq) overflow_size += n_words # yield the previous batch and start a new one. The new one gets the overflow examples. else: @@ -122,11 +181,11 @@ def minibatch_by_words( overflow_size = 0 # this example still fits if (batch_size + n_words) <= target_size: - batch.append(doc) + batch.append(seq) batch_size += n_words # this example fits in overflow elif (batch_size + n_words) <= (target_size + tol_size): - overflow.append(doc) + overflow.append(seq) overflow_size += n_words # this example does not fit with the previous overflow: start another new batch else: @@ -134,7 +193,7 @@ def minibatch_by_words( yield batch target_size = next(size_) tol_size = target_size * tolerance - batch = [doc] + batch = [seq] batch_size = n_words batch.extend(overflow) if batch: diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index b8bef39b9..be389f117 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -1,5 +1,3 @@ -from typing import Optional - from ...pipeline import Lemmatizer from ...tokens import Token diff --git a/spacy/language.py b/spacy/language.py index 96661915a..85aac15ef 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -27,7 +27,6 @@ from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc -from .lookups import load_lookups from .tokenizer import Tokenizer from .errors import Errors, Warnings from .schemas import ConfigSchema @@ -1439,10 +1438,7 @@ class Language: or lang_cls is not cls ): raise ValueError(Errors.E943.format(value=type(lang_cls))) - nlp = lang_cls( - vocab=vocab, - create_tokenizer=create_tokenizer, - ) + nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer) if after_creation is not None: nlp = after_creation(nlp) if not isinstance(nlp, cls): diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 716af9909..e0a54e6f1 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -68,11 +68,11 @@ cdef class DependencyMatcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. 
""" - return self._normalize_key(key) in self._patterns + return self.has_key(key) - def validateInput(self, pattern, key): + def validate_input(self, pattern, key): idx = 0 - visitedNodes = {} + visited_nodes = {} for relation in pattern: if "PATTERN" not in relation or "SPEC" not in relation: raise ValueError(Errors.E098.format(key=key)) @@ -83,7 +83,7 @@ cdef class DependencyMatcher: and "NBOR_NAME" not in relation["SPEC"] ): raise ValueError(Errors.E099.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True else: if not( "NODE_NAME" in relation["SPEC"] @@ -92,22 +92,28 @@ cdef class DependencyMatcher: ): raise ValueError(Errors.E100.format(key=key)) if ( - relation["SPEC"]["NODE_NAME"] in visitedNodes - or relation["SPEC"]["NBOR_NAME"] not in visitedNodes + relation["SPEC"]["NODE_NAME"] in visited_nodes + or relation["SPEC"]["NBOR_NAME"] not in visited_nodes ): raise ValueError(Errors.E101.format(key=key)) - visitedNodes[relation["SPEC"]["NODE_NAME"]] = True - visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True + visited_nodes[relation["SPEC"]["NODE_NAME"]] = True + visited_nodes[relation["SPEC"]["NBOR_NAME"]] = True idx = idx + 1 def add(self, key, patterns, *_patterns, on_match=None): + """Add a new matcher rule to the matcher. + + key (str): The match ID. + patterns (list): The patterns to add for the given key. + on_match (callable): Optional callback executed on match. + """ if patterns is None or hasattr(patterns, "__call__"): # old API on_match = patterns patterns = _patterns for pattern in patterns: if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) - self.validateInput(pattern,key) + self.validate_input(pattern,key) key = self._normalize_key(key) _patterns = [] for pattern in patterns: @@ -187,8 +193,7 @@ cdef class DependencyMatcher: key (string or int): The key to check. RETURNS (bool): Whether the matcher has the rule. """ - key = self._normalize_key(key) - return key in self._patterns + return self._normalize_key(key) in self._patterns def get(self, key, default=None): """Retrieve the pattern stored for a key. @@ -202,6 +207,13 @@ cdef class DependencyMatcher: return (self._callbacks[key], self._patterns[key]) def __call__(self, Doc doc): + """Find all token sequences matching the supplied pattern. + + doclike (Doc or Span): The document to match over. + RETURNS (list): A list of `(key, start, end)` tuples, + describing the matches. A match tuple describes a span + `doc[start:end]`. The `label_id` and `key` are both integers. 
+ """ matched_key_trees = [] matches = self.token_matcher(doc) for key in list(self._patterns.keys()): @@ -241,25 +253,25 @@ cdef class DependencyMatcher: on_match(self, doc, i, matched_key_trees) return matched_key_trees - def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees): + def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visited_nodes,matched_trees): cdef bool isValid; if(patternLength == len(id_to_position.keys())): isValid = True for node in range(patternLength): if(node in tree): for idx, (relop,nbor) in enumerate(tree[node]): - computed_nbors = numpy.asarray(_node_operator_map[visitedNodes[node]][relop]) + computed_nbors = numpy.asarray(_node_operator_map[visited_nodes[node]][relop]) isNbor = False for computed_nbor in computed_nbors: - if(computed_nbor.i == visitedNodes[nbor]): + if(computed_nbor.i == visited_nodes[nbor]): isNbor = True isValid = isValid & isNbor if(isValid): - matched_trees.append(visitedNodes) + matched_trees.append(visited_nodes) return allPatternNodes = numpy.asarray(id_to_position[patternLength]) for patternNode in allPatternNodes: - self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visitedNodes+[patternNode],matched_trees) + self.recurse(tree,id_to_position,_node_operator_map,patternLength+1,visited_nodes+[patternNode],matched_trees) # Given a node and an edge operator, to return the list of nodes # from the doc that belong to node+operator. This is used to store diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index a0f3f1655..16ab73735 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -70,7 +70,7 @@ cdef class Matcher: key (str): The match ID. RETURNS (bool): Whether the matcher contains rules for this match ID. """ - return self._normalize_key(key) in self._patterns + return self.has_key(key) def add(self, key, patterns, *, on_match=None, greedy: str=None): """Add a match-rule to the matcher. A match-rule consists of: an ID @@ -162,8 +162,7 @@ cdef class Matcher: key (string or int): The key to check. RETURNS (bool): Whether the matcher has the rule. """ - key = self._normalize_key(key) - return key in self._patterns + return self._normalize_key(key) in self._patterns def get(self, key, default=None): """Retrieve the pattern stored for a key. @@ -179,7 +178,7 @@ cdef class Matcher: def pipe(self, docs, batch_size=1000, return_matches=False, as_tuples=False): """Match a stream of documents, yielding them in turn. - docs (iterable): A stream of documents. + docs (Iterable[Union[Doc, Span]]): A stream of documents or spans. batch_size (int): Number of documents to accumulate into a working set. return_matches (bool): Yield the match lists along with the docs, making results (doc, matches) tuples. diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index f85b5626a..801229af5 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -37,7 +37,6 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] default_config={ "moves": None, "update_with_oracle_cut_size": 100, - "multitasks": [], "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, @@ -51,17 +50,52 @@ def make_parser( model: Model, moves: Optional[list], update_with_oracle_cut_size: int, - multitasks: Iterable, learn_tokens: bool, min_action_freq: int ): + """Create a transition-based DependencyParser component. 
The dependency parser
+    jointly learns sentence segmentation and labelled dependency parsing, and can
+    optionally learn to merge tokens that had been over-segmented by the tokenizer.
+
+    The parser uses a variant of the non-monotonic arc-eager transition-system
+    described by Honnibal and Johnson (2015), with the addition of a "break"
+    transition to perform the sentence segmentation. Nivre's pseudo-projective
+    dependency transformation is used to allow the parser to predict
+    non-projective parses.
+
+    The parser is trained using an imitation learning objective. It follows the
+    actions predicted by the current weights, and at each state, determines
+    which actions are compatible with the optimal parse that could be reached
+    from the current state. The weights are updated such that the scores
+    assigned to the set of optimal actions are increased, while scores assigned
+    to other actions are decreased. Note that more than one action may be
+    optimal for a given state.
+
+    model (Model): The model for the transition-based parser. The model needs
+        to have a specific substructure of named components --- see the
+        spacy.ml.tb_framework.TransitionModel for details.
+    moves (List[str]): A list of transition names. Inferred from the data if not
+        provided.
+    update_with_oracle_cut_size (int):
+        During training, cut long sequences into shorter segments by creating
+        intermediate states based on the gold-standard history. The model is
+        not very sensitive to this parameter, so you usually won't need to change
+        it. 100 is a good default.
+    learn_tokens (bool): Whether to learn to merge subtokens that are split
+        relative to the gold standard. Experimental.
+    min_action_freq (int): The minimum frequency of labelled actions to retain.
+        Rarer labelled actions have their label backed-off to "dep". While this
+        primarily affects the label accuracy, it can also affect the attachment
+        structure, as the labels are used to represent the pseudo-projectivity
+        transformation.
+    """
     return DependencyParser(
         nlp.vocab,
         model,
         name,
         moves=moves,
         update_with_oracle_cut_size=update_with_oracle_cut_size,
-        multitasks=multitasks,
+        multitasks=[],
         learn_tokens=learn_tokens,
         min_action_freq=min_action_freq
     )
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 840070c23..080273f57 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -62,6 +62,16 @@ def make_entity_linker(
     incl_prior: bool,
     incl_context: bool,
 ):
+    """Construct an EntityLinker component.
+
+    model (Model[List[Doc], Floats2d]): A model that learns document vector
+        representations. Given a batch of Doc objects, it should return a single
+        array, with one row per item in the batch.
+    kb (KnowledgeBase): The knowledge-base to link entities to.
+    labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
+    incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
+    incl_context (bool): Whether or not to include the local context in the model.
+    """
     return EntityLinker(
         nlp.vocab,
         model,
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 06c9f9a25..efc494181 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -75,8 +75,8 @@ class Morphologizer(Tagger):
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
         name (str): The component instance name, used to add entries to the
             losses during training.
- labels_morph (dict): TODO: - labels_pos (dict): TODO: + labels_morph (dict): Mapping of morph + POS tags to morph labels. + labels_pos (dict): Mapping of morph + POS tags to POS tags. DOCS: https://spacy.io/api/morphologizer#init """ diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index d13152a4f..a3bc3d920 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -35,9 +35,6 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"] default_config={ "moves": None, "update_with_oracle_cut_size": 100, - "multitasks": [], - "learn_tokens": False, - "min_action_freq": 30, "model": DEFAULT_NER_MODEL, }, scores=["ents_p", "ents_r", "ents_f", "ents_per_type"], @@ -50,19 +47,40 @@ def make_ner( model: Model, moves: Optional[list], update_with_oracle_cut_size: int, - multitasks: Iterable, - learn_tokens: bool, - min_action_freq: int ): + """Create a transition-based EntityRecognizer component. The entity recognizer + identifies non-overlapping labelled spans of tokens. + + The transition-based algorithm used encodes certain assumptions that are + effective for "traditional" named entity recognition tasks, but may not be + a good fit for every span identification problem. Specifically, the loss + function optimizes for whole entity accuracy, so if your inter-annotator + agreement on boundary tokens is low, the component will likely perform poorly + on your problem. The transition-based algorithm also assumes that the most + decisive information about your entities will be close to their initial tokens. + If your entities are long and characterised by tokens in their middle, the + component will likely do poorly on your task. + + model (Model): The model for the transition-based parser. The model needs + to have a specific substructure of named components --- see the + spacy.ml.tb_framework.TransitionModel for details. + moves (list[str]): A list of transition names. Inferred from the data if not + provided. + update_with_oracle_cut_size (int): + During training, cut long sequences into shorter segments by creating + intermediate states based on the gold-standard history. The model is + not very sensitive to this parameter, so you usually won't need to change + it. 100 is a good default. + """ return EntityRecognizer( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, - multitasks=multitasks, - learn_tokens=learn_tokens, - min_action_freq=min_action_freq + multitasks=[], + min_action_freq=1, + learn_tokens=False, ) @@ -74,9 +92,11 @@ cdef class EntityRecognizer(Parser): TransitionSystem = BiluoPushDown def add_multitask_objective(self, mt_component): + """Register another component as a multi-task objective. Experimental.""" self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): + """Setup multi-task objective components. Experimental and internal.""" # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? 
for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index aa0399b33..9be562b61 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -1,8 +1,9 @@ # cython: infer_types=True, profile=True, binding=True +from typing import List import numpy import srsly - from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config +from thinc.types import Floats2d import warnings from ..tokens.doc cimport Doc @@ -42,7 +43,14 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model): +def make_tagger(nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]]): + """Construct a part-of-speech tagger component. + + model (Model[List[Doc], List[Floats2d]]): A model instance that predicts + the tag probabilities. The output vectors should match the number of tags + in size, and be normalized as probabilities (all scores between 0 and 1, + with the rows summing to 1). + """ return Tagger(nlp.vocab, model, name) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 06b72f8c7..d632825bd 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -1,5 +1,6 @@ from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config +from thinc.types import Floats2d import numpy from .pipe import Pipe @@ -69,8 +70,22 @@ subword_features = true default_score_weights={"cats_score": 1.0}, ) def make_textcat( - nlp: Language, name: str, model: Model, labels: Iterable[str] + nlp: Language, + name: str, + model: Model[List[Doc], List[Floats2d]], + labels: Iterable[str], ) -> "TextCategorizer": + """Create a TextCategorizer compoment. The text categorizer predicts categories + over a whole document. It can learn one or more labels, and the labels can + be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive + (i.e. zero or more labels may be true per doc). The multi-label setting is + controlled by the model instance that's provided. + + model (Model[List[Doc], List[Floats2d]]): A model instance that predicts + scores for each category. + labels (list): A list of categories to learn. If empty, the model infers the + categories from the data. + """ return TextCategorizer(nlp.vocab, model, name, labels=labels) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index db6843e8f..c9f0a99e9 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -32,11 +32,28 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec": class Tok2Vec(Pipe): + """Apply a "token-to-vector" model and set its outputs in the doc.tensor + attribute. This is mostly useful to share a single subnetwork between multiple + components, e.g. to have one embedding and CNN network shared between a + parser, tagger and NER. + + In order to use the `Tok2Vec` predictions, subsequent components should use + the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This + layer will read data from the `doc.tensor` attribute during prediction. + During training, the `Tok2Vec` component will save its prediction and backprop + callback for each batch, so that the subsequent components can backpropagate + to the shared weights. 
This implementation is used because it allows us to + avoid relying on object identity within the models to achieve the parameter + sharing. + """ + def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None: """Initialize a tok2vec component. vocab (Vocab): The shared vocabulary. - model (thinc.api.Model): The Thinc Model powering the pipeline component. + model (thinc.api.Model[List[Doc], List[Floats2d]]): + The Thinc Model powering the pipeline component. It should take + a list of Doc objects as input, and output a list of 2d float arrays. name (str): The component instance name. DOCS: https://spacy.io/api/tok2vec#init @@ -48,9 +65,18 @@ class Tok2Vec(Pipe): self.cfg = {} def add_listener(self, listener: "Tok2VecListener") -> None: + """Add a listener for a downstream component. Usually internals.""" self.listeners.append(listener) def find_listeners(self, model: Model) -> None: + """Walk over a model, looking for layers that are Tok2vecListener + subclasses that have an upstream_name that matches this component. + Listeners can also set their upstream_name attribute to the wildcard + string '*' to match any `Tok2Vec`. + + You're unlikely to ever need multiple `Tok2Vec` components, so it's + fine to leave your listeners upstream_name on '*'. + """ for node in model.walk(): if isinstance(node, Tok2VecListener) and node.upstream_name in ( "*", @@ -59,7 +85,8 @@ class Tok2Vec(Pipe): self.add_listener(node) def __call__(self, doc: Doc) -> Doc: - """Add context-sensitive embeddings to the Doc.tensor attribute. + """Add context-sensitive embeddings to the Doc.tensor attribute, allowing + them to be used as features by downstream components. docs (Doc): The Doc to preocess. RETURNS (Doc): The processed Doc. @@ -205,11 +232,27 @@ class Tok2Vec(Pipe): class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, for instance from a component earlier in the pipeline. + + The Tok2VecListener layer is used as a sublayer within a component such + as a parser, NER or text categorizer. Usually you'll have multiple listeners + connecting to a single upstream Tok2Vec component, that's earlier in the + pipeline. The Tok2VecListener layers act as proxies, passing the predictions + from the Tok2Vec component into downstream components, and communicating + gradients back upstream. """ name = "tok2vec-listener" def __init__(self, upstream_name: str, width: int) -> None: + """ + upstream_name (str): A string to identify the 'upstream' Tok2Vec component + to communicate with. The upstream name should either be the wildcard + string '*', or the name of the `Tok2Vec` component. You'll almost + never have multiple upstream Tok2Vec components, so the wildcard + string will almost always be fine. + width (int): + The width of the vectors produced by the upstream tok2vec component. + """ Model.__init__(self, name=self.name, forward=forward, dims={"nO": width}) self.upstream_name = upstream_name self._batch_id = None @@ -217,15 +260,25 @@ class Tok2VecListener(Model): self._backprop = None @classmethod - def get_batch_id(cls, inputs) -> int: + def get_batch_id(cls, inputs: List[Doc]) -> int: + """Calculate a content-sensitive hash of the batch of documents, to check + whether the next batch of documents is unexpected. + """ return sum(sum(token.orth for token in doc) for doc in inputs) def receive(self, batch_id: int, outputs, backprop) -> None: + """Store a batch of training predictions and a backprop callback. 
The + predictions and callback are produced by the upstream Tok2Vec component, + and later will be used when the listener's component's model is called. + """ self._batch_id = batch_id self._outputs = outputs self._backprop = backprop def verify_inputs(self, inputs) -> bool: + """Check that the batch of Doc objects matches the ones we have a + prediction for. + """ if self._batch_id is None and self._outputs is None: raise ValueError(Errors.E954) else: @@ -237,6 +290,7 @@ class Tok2VecListener(Model): def forward(model: Tok2VecListener, inputs, is_train: bool): + """Supply the outputs from the upstream Tok2Vec component.""" if is_train: model.verify_inputs(inputs) return model._outputs, model._backprop diff --git a/spacy/scorer.py b/spacy/scorer.py index 4a81d39d0..d77881ad0 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -426,7 +426,7 @@ class Scorer: f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } if len(labels) == 2 and not multi_label and positive_label: - positive_label_f = results[f"{attr}_f_per_type"][positive_label]['f'] + positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] results[f"{attr}_score"] = positive_label_f results[f"{attr}_score_desc"] = f"F ({positive_label})" elif not multi_label: diff --git a/spacy/tests/morphology/test_morph_pickle.py b/spacy/tests/morphology/test_morph_pickle.py index 0758a6c01..d9b0e3476 100644 --- a/spacy/tests/morphology/test_morph_pickle.py +++ b/spacy/tests/morphology/test_morph_pickle.py @@ -15,5 +15,7 @@ def morphology(): def test_morphology_pickle_roundtrip(morphology): b = pickle.dumps(morphology) reloaded_morphology = pickle.loads(b) - assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2" - assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4" + feat = reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) + assert feat == "Feat1=Val1|Feat2=Val2" + feat = reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) + assert feat == "Feat3=Val3|Feat4=Val4" diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index dbeb0a9cb..0ffe74273 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -144,10 +144,7 @@ def test_accept_blocked_token(): # 1. test normal behaviour nlp1 = English() doc1 = nlp1("I live in New York") - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner1 = nlp1.create_pipe("ner", config=config) assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] @@ -166,10 +163,7 @@ def test_accept_blocked_token(): # 2. 
test blocking behaviour nlp2 = English() doc2 = nlp2("I live in New York") - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity @@ -224,10 +218,7 @@ def test_overwrite_token(): assert [token.ent_iob_ for token in doc] == ["O", "O", "O", "O", "O"] assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] # Check that a new ner can overwrite O - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp.create_pipe("ner", config=config) ner2.moves.add_action(5, "") ner2.add_label("GPE") diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 644fa0f01..8a70fdeeb 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,8 +1,7 @@ import pytest - from spacy import util, registry from spacy.lang.en import English -from spacy.lookups import Lookups, load_lookups +from spacy.lookups import Lookups from ..util import make_tempdir diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 5f27a0afa..1af4a5121 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,10 +1,8 @@ import pytest - from spacy import util from spacy.gold import Example from spacy.lang.en import English from spacy.language import Language -from spacy.symbols import POS, NOUN from ..util import make_tempdir diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 363a16a11..17add7391 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -117,9 +117,7 @@ def test_overfitting_IO(): assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1) # Test scoring - scores = nlp.evaluate( - train_examples, scorer_cfg={"positive_label": "POSITIVE"} - ) + scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"}) assert scores["cats_micro_f"] == 1.0 assert scores["cats_score"] == 1.0 assert "cats_score_desc" in scores diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index b642ca229..5c93ea3c8 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,11 +1,9 @@ import pytest import random - from spacy import util from spacy.gold import Example from spacy.matcher import Matcher from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.lang.en import English from spacy.lookups import Lookups diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 0ac895546..d6a4600e3 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -6,8 +6,7 @@ from spacy.lang.en import English from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer -from spacy.lookups import Lookups -from spacy.symbols import ORTH, LEMMA, POS, VERB +from spacy.symbols import ORTH, LEMMA, POS def test_issue1061(): diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 83afb11f3..4988575ea 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -271,10 +271,7 @@ def test_issue1963(en_tokenizer): 
@pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): nlp = Language() - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner = nlp.create_pipe("ner", config=config) example = Example.from_dict( Doc(ner.vocab, words=["word"]), diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index e42779ad7..de554a5ec 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -157,7 +157,11 @@ def test_issue3540(en_vocab): with doc.retokenize() as retokenizer: heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]} + attrs = { + "POS": ["PROPN", "PROPN"], + "LEMMA": ["New", "York"], + "DEP": ["pobj", "compound"], + } retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) gold_text = ["I", "live", "in", "New", "York", "right", "now"] diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index e1d03eaf5..423015106 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -138,10 +138,7 @@ def test_issue4042_bug2(): if not output_dir.exists(): output_dir.mkdir() ner1.to_disk(output_dir) - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner2 = nlp1.create_pipe("ner", config=config) ner2.from_disk(output_dir) assert len(ner2.labels) == 2 @@ -303,10 +300,7 @@ def test_issue4313(): beam_width = 16 beam_density = 0.0001 nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - } + config = {} ner = nlp.create_pipe("ner", config=config) ner.add_label("SOME_LABEL") ner.begin_training([]) diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py index 0b3b4a9fc..96d4e1ca4 100644 --- a/spacy/tests/regression/test_issue4501-5000.py +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -185,20 +185,16 @@ def test_issue4725_1(): vocab = Vocab(vectors_name="test_vocab_add_vector") nlp = English(vocab=vocab) config = { - "learn_tokens": False, - "min_action_freq": 342, "update_with_oracle_cut_size": 111, } ner = nlp.create_pipe("ner", config=config) with make_tempdir() as tmp_path: with (tmp_path / "ner.pkl").open("wb") as file_: pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 assert ner.cfg["update_with_oracle_cut_size"] == 111 with (tmp_path / "ner.pkl").open("rb") as file_: ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 assert ner2.cfg["update_with_oracle_cut_size"] == 111 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 6865cd1e5..ebc804235 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -236,3 +236,33 @@ def test_language_from_config_before_after_init_invalid(): config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}} with pytest.raises(ValueError): English.from_config(config) + + +def test_language_custom_tokenizer(): + """Test that a fully custom tokenizer can be plugged in via the registry.""" + name = "test_language_custom_tokenizer" + + class CustomTokenizer: + """Dummy "tokenizer" that splits on spaces and adds prefix to each word.""" + + def __init__(self, nlp, prefix): + self.vocab = nlp.vocab + self.prefix = prefix + + def __call__(self, text): + words = [f"{self.prefix}{word}" for word in text.split(" ")] + return Doc(self.vocab, 
words=words) + + @registry.tokenizers(name) + def custom_create_tokenizer(prefix: str = "_"): + def create_tokenizer(nlp): + return CustomTokenizer(nlp, prefix=prefix) + + return create_tokenizer + + config = {"nlp": {"tokenizer": {"@tokenizers": name}}} + nlp = English.from_config(config) + doc = nlp("hello world") + assert [t.text for t in doc] == ["_hello", "_world"] + doc = list(nlp.pipe(["hello world"]))[0] + assert [t.text for t in doc] == ["_hello", "_world"] diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 8d28a78c3..8b07102ce 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -3,7 +3,7 @@ title: Model Architectures teaser: Pre-defined model architectures included with the core library source: spacy/ml/models menu: - - ['Tok2Vec', 'tok2vec'] + - ['Tok2Vec', 'tok2vec-arch'] - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] - ['Tagging', 'tagger'] @@ -70,6 +70,47 @@ blog post for background. | `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. | | `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. | +### spacy.Tok2VecListener.v1 {#Tok2VecListener} + +> #### Example config +> +> ```ini +> [components.tok2vec] +> factory = "tok2vec" +> +> [components.tok2vec.model] +> @architectures = "spacy.HashEmbedCNN.v1" +> width = 342 +> +> [components.tagger] +> factory = "tagger" +> +> [components.tagger.model] +> @architectures = "spacy.Tagger.v1" +> +> [components.tagger.model.tok2vec] +> @architectures = "spacy.Tok2VecListener.v1" +> width = ${components.tok2vec.model:width} +> ``` + +A listener is used as a sublayer within a component such as a +[`DependencyParser`](/api/dependencyparser), +[`EntityRecognizer`](/api/entityrecognizer)or +[`TextCategorizer`](/api/textcategorizer). Usually you'll have multiple +listeners connecting to a single upstream [`Tok2Vec`](/api/tok2vec) component +that's earlier in the pipeline. The listener layers act as **proxies**, passing +the predictions from the `Tok2Vec` component into downstream components, and +communicating gradients back upstream. + +Instead of defining its own `Tok2Vec` instance, a model architecture like +[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec` +argument that connects to the shared `tok2vec` component in the pipeline. + +| Name | Type | Description | +| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. | +| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. | + ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed} @@ -195,7 +236,7 @@ and residual connections. 
> depth = 4 > ``` -Encode context using bidirectonal LSTM layers. Requires +Encode context using bidirectional LSTM layers. Requires [PyTorch](https://pytorch.org). | Name | Type | Description | @@ -237,8 +278,6 @@ architectures into your training config. ### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener} - - > #### Example Config > > ```ini @@ -250,10 +289,41 @@ architectures into your training config. > @layers = "reduce_mean.v1" > ``` -| Name | Type | Description | -| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- | -| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. | -| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. | +Create a `TransformerListener` layer, which will connect to a +[`Transformer`](/api/transformer) component earlier in the pipeline. The layer +takes a list of [`Doc`](/api/doc) objects as input, and produces a list of +2-dimensional arrays as output, with each array having one row per token. Most +spaCy models expect a sublayer with this signature, making it easy to connect +them to a transformer model via this sublayer. Transformer models usually +operate over wordpieces, which usually don't align one-to-one against spaCy +tokens. The layer therefore requires a reduction operation in order to calculate +a single token vector given zero or more wordpiece vectors. + +| Name | Type | Description | +| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. | +| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. | + +### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer} + +> #### Example Config +> +> ```ini +> # TODO: +> ``` + +Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does +**not** allow multiple components to share the transformer weights, and does +**not** allow the transformer to set annotations into the [`Doc`](/api/doc) +object, but it's a **simpler solution** if you only need the transformer within +one component. 
+
+| Name | Type | Description |
+| ------------------ | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
+| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
+| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types). A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
+| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
 
 ## Parser & NER architectures {#parser}
 
@@ -417,20 +487,18 @@ network has an internal CNN Tok2Vec layer and uses attention.
 > nO = null
 > ```
 
-| Name | Type | Description |
-| -------------------- | ----- | --------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
-| `width` | int | Output dimension of the feature encoding step. |
-| `embed_size` | int | Input dimension of the feature encoding step. |
-| `conv_depth` | int | Depth of the Tok2Vec layer. |
-| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `dropout` | float | The dropout rate. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| -------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
+| `width` | int | Output dimension of the feature encoding step. |
+| `embed_size` | int | Input dimension of the feature encoding step. |
+| `conv_depth` | int | Depth of the Tok2Vec layer.
|
+| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. |
+| `dropout` | float | The dropout rate. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
 
 ### spacy.TextCatCNN.v1 {#TextCatCNN}
 
 > #### Example Config
 >
 > ```ini
 > [model]
 > @architectures = "spacy.TextCatCNN.v1"
 > exclusive_classes = false
 > nO = null
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v1"
 > pretrained_vectors = null
 > width = 96
 > depth = 4
 > embed_size = 2000
 > window_size = 1
 > maxout_pieces = 3
 > subword_features = true
 > ```
 
 A neural network model where token vectors are calculated using a CNN. The
 vectors are mean pooled and used as features in a feed-forward network. This
 architecture is usually less accurate than the ensemble, but runs faster.
 
-| Name | Type | Description |
-| ------------------- | ------------------------------------------ | --------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
 
 ### spacy.TextCatBOW.v1 {#TextCatBOW}
 
 > #### Example Config
 >
 > ```ini
 > [model]
 > @architectures = "spacy.TextCatBOW.v1"
 > exclusive_classes = false
 > ngram_size = 1
 > no_output_layer = false
 > nO = null
 > ```
 
 An ngram "bag-of-words" model. This architecture should run much faster than
 the others, but may not be as accurate, especially if texts are short.
 
-| Name | Type | Description |
-| ------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
-| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
-| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
-| `nO` | int | Output dimension, determined by the number of different labels. |
-
-If the `nO` dimension is not set, the TextCategorizer component will set it when
-`begin_training` is called.
+| Name | Type | Description |
+| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `exclusive_classes` | bool | Whether or not categories are mutually exclusive.
|
+| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. |
+| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`). |
+| `nO` | int | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
+
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
@@ -558,8 +624,6 @@ A function that creates a default, empty `KnowledgeBase` from a
 A function that takes as input a [`KnowledgeBase`](/api/kb) and a
 [`Span`](/api/span) object denoting a named entity, and returns a list of
-plausible [`Candidate` objects](/api/kb/#candidate_init).
-
-The default `CandidateGenerator` simply uses the text of a mention to find its
-potential aliases in the Knowledgebase. Note that this function is
-case-dependent.
+plausible [`Candidate` objects](/api/kb/#candidate_init). The default
+`CandidateGenerator` simply uses the text of a mention to find its potential
+aliases in the `KnowledgeBase`. Note that this function is case-dependent.
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 377b2456f..c4a774cd0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -601,9 +601,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
 
 ## Pretrain {#pretrain new="2.1" tag="experimental"}
 
-
-
-Pre-train the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
+Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
 components on [raw text](/api/data-formats#pretrain), using an approximate
 language-modeling objective. Specifically, we load pretrained vectors, and train
 a component like a CNN, BiLSTM, etc to predict vectors which match the
@@ -611,7 +609,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your model. This technique may be especially helpful if you have little
-labelled data.
+labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
+for more info.
 
 
 
@@ -634,8 +633,8 @@ $ python -m spacy pretrain [texts_loc] [output_dir] [config_path]
 | `output_dir` | positional | Directory to write models to on each epoch. |
 | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
 | `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. |
-| `--resume-path`, `-r` | option | TODO: |
-| `--epoch-resume`, `-er` | option | TODO: |
+| `--resume-path`, `-r` | option | Path to pretrained weights from which to resume pretraining. |
+| `--epoch-resume`, `-er` | option | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. |
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`.
| | **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. | diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c0a87756d..af7cb26de 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -20,9 +20,9 @@ Config files define the training process and model pipeline and can be passed to [`spacy train`](/api/cli#train). They use [Thinc's configuration system](https://thinc.ai/docs/usage-config) under the hood. For details on how to use training configs, see the -[usage documentation](/usage/training#config). - - +[usage documentation](/usage/training#config). To get started with a blank +config or fill a partial config with all defaults, you can use the +[`init config`](/api/cli#init-config) command. > #### What does the @ mean? > @@ -52,8 +52,6 @@ your config and check that it's valid, you can run the - - ### nlp {#config-nlp tag="section"} > #### Example @@ -154,8 +152,6 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/training#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). - - | Name | Type | Description | Default | | ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- | | `max_epochs` | int | Maximum number of epochs. | `1000` | diff --git a/website/docs/api/dependencymatcher.md b/website/docs/api/dependencymatcher.md index 3638575df..4f192783f 100644 --- a/website/docs/api/dependencymatcher.md +++ b/website/docs/api/dependencymatcher.md @@ -5,4 +5,194 @@ tag: class source: spacy/matcher/dependencymatcher.pyx --- -TODO: write +The `DependencyMatcher` follows the same API as the [`Matcher`](/api/matcher) +and [`PhraseMatcher`](/api/phrasematcher) and lets you match on dependency trees +using the +[Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). +It requires a pretrained [`DependencyParser`](/api/parser) or other component +that sets the `Token.dep` attribute. + +## Pattern format {#patterns} + +> ```json +> ### Example +> [ +> { +> "SPEC": {"NODE_NAME": "founded"}, +> "PATTERN": {"ORTH": "founded"} +> }, +> { +> "SPEC": { +> "NODE_NAME": "founder", +> "NBOR_RELOP": ">", +> "NBOR_NAME": "founded" +> }, +> "PATTERN": {"DEP": "nsubj"} +> }, +> { +> "SPEC": { +> "NODE_NAME": "object", +> "NBOR_RELOP": ">", +> "NBOR_NAME": "founded" +> }, +> "PATTERN": {"DEP": "dobj"} +> } +> ] +> ``` + +A pattern added to the `DependencyMatcher` consists of a list of dictionaries, +with each dictionary describing a node to match. Each pattern should have the +following top-level keys: + +| Name | Type | Description | +| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- | +| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). | +| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. 
|
+
+The `SPEC` includes the following fields:
+
+| Name | Type | Description |
+| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
+| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
+| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
+
+## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
+
+Create a rule-based `DependencyMatcher`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+> matcher = DependencyMatcher(nlp.vocab)
+> ```
+
+| Name | Type | Description |
+| ------- | ------- | --------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
+
+## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
+
+Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
+
+> #### Example
+>
+> ```python
+> from spacy.matcher import DependencyMatcher
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Founder", [pattern])
+> doc = nlp("Bill Gates founded Microsoft.")
+> matches = matcher(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
+| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
+
+## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
+
+Get the number of rules (edges) added to the dependency matcher. Note that this
+only returns the number of rules (identical with the number of IDs), not the
+number of individual patterns.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert len(matcher) == 0
+> pattern = [
+> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
+> {"SPEC": {"NODE_NAME": "START_ENTITY", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
+> ]
+> matcher.add("Rule", [pattern])
+> assert len(matcher) == 1
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | -------------------- |
+| **RETURNS** | int | The number of rules. |
+
+## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
+
+Check whether the matcher contains rules for a match ID.
+
+> #### Example
+>
+> ```python
+> matcher = DependencyMatcher(nlp.vocab)
+> assert "Rule" not in matcher
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+
+## DependencyMatcher.add {#add tag="method"}
+
+Add a rule to the matcher, consisting of an ID key, one or more patterns, and an
+optional callback function to act on the matches. The callback function will
+receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already
+exists for the given ID, the patterns will be extended. An `on_match` callback
+will be overwritten.
+
+> #### Example
+>
+> ```python
+> def on_match(matcher, doc, id, matches):
+>     print('Matched!', matches)
+>
+> matcher = DependencyMatcher(nlp.vocab)
+> matcher.add("TEST_PATTERNS", patterns)
+> ```
+
+| Name | Type | Description |
+| -------------- | ------------------ | -------------------------------------------------------------------------------------------------- |
+| `match_id` | str | An ID for the thing you're matching. |
+| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a node to match. |
+| _keyword-only_ | | |
+| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
+
+## DependencyMatcher.remove {#remove tag="method"}
+
+Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
+exist.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern])
+> assert "Rule" in matcher
+> matcher.remove("Rule")
+> assert "Rule" not in matcher
+> ```
+
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
+
+## DependencyMatcher.get {#get tag="method"}
+
+Retrieve the pattern stored for a key. Returns the rule as an
+`(on_match, patterns)` tuple containing the callback and available patterns.
+
+> #### Example
+>
+> ```python
+> matcher.add("Rule", [pattern], on_match=on_match)
+> on_match, patterns = matcher.get("Rule")
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index e56e85e64..6c9222781 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -8,6 +8,23 @@ api_string_name: parser
 api_trainable: true
 ---

+A transition-based dependency parser component. The dependency parser jointly
+learns sentence segmentation and labelled dependency parsing, and can optionally
+learn to merge tokens that had been over-segmented by the tokenizer. The parser
+uses a variant of the **non-monotonic arc-eager transition-system** described by
+[Honnibal and Johnson (2015)](https://www.aclweb.org/anthology/D15-1162/), with
+the addition of a "break" transition to perform the sentence segmentation.
+[Nivre (2005)](https://www.aclweb.org/anthology/P05-1013/)'s **pseudo-projective
+dependency transformation** is used to allow the parser to predict
+non-projective parses.
+
+The parser is trained using an **imitation learning objective**. It follows the
+actions predicted by the current weights, and at each state, determines which
+actions are compatible with the optimal parse that could be reached from the
+current state. The weights are updated such that the scores assigned to the set
+of optimal actions are increased, while scores assigned to other actions are
+decreased. Note that more than one action may be optimal for a given state.
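+
+For example, the labelled dependencies and sentence boundaries predicted by the
+component can be inspected on a processed `Doc` (a minimal sketch, assuming a
+pretrained pipeline such as `en_core_web_sm` is installed):
+
+```python
+import spacy
+
+nlp = spacy.load("en_core_web_sm")
+doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
+for token in doc:
+    # Each token is assigned a syntactic head and a dependency label
+    print(token.text, token.dep_, token.head.text)
+# The parser also sets the sentence boundaries
+print([sent.text for sent in doc.sents])
+```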
+ ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -23,18 +40,21 @@ architectures and their arguments and hyperparameters. > from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL > config = { > "moves": None, -> # TODO: rest +> "update_with_oracle_cut_size": 100, +> "learn_tokens": False, +> "min_action_freq": 30, > "model": DEFAULT_PARSER_MODEL, > } > nlp.add_pipe("parser", config=config) > ``` - - -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Type | Description | Default | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | +| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` | +| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx @@ -61,19 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. 
| -| `moves` | list | | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| Name | Type | Description | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| _keyword-only_ | | | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. | +| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | +| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | ## DependencyParser.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 0ab17f953..a6368e62b 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -8,6 +8,18 @@ api_string_name: ner api_trainable: true --- +A transition-based named entity recognition component. The entity recognizer +identifies **non-overlapping labelled spans** of tokens. The transition-based +algorithm used encodes certain assumptions that are effective for "traditional" +named entity recognition tasks, but may not be a good fit for every span +identification problem. Specifically, the loss function optimizes for **whole +entity accuracy**, so if your inter-annotator agreement on boundary tokens is +low, the component will likely perform poorly on your problem. The +transition-based algorithm also assumes that the most decisive information about +your entities will be close to their initial tokens. If your entities are long +and characterized by tokens in their middle, the component will likely not be a +good fit for your task. + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -23,18 +35,17 @@ architectures and their arguments and hyperparameters. 
> from spacy.pipeline.ner import DEFAULT_NER_MODEL > config = { > "moves": None, -> # TODO: rest +> "update_with_oracle_cut_size": 100, > "model": DEFAULT_NER_MODEL, > } > nlp.add_pipe("ner", config=config) > ``` - - -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | -| `moves` | list | | `None` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | +| Setting | Type | Description | Default | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx @@ -61,19 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - -| Name | Type | Description | -| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `moves` | list | | -| _keyword-only_ | | | -| `update_with_oracle_cut_size` | int | | -| `multitasks` | `Iterable` | | -| `learn_tokens` | bool | | -| `min_action_freq` | int | | +| Name | Type | Description | +| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | +| _keyword-only_ | | | +| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. 
| ## EntityRecognizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 7464a029e..79782fd72 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -242,6 +242,21 @@ a batch of [Example](/api/example) objects. Update the models in the pipeline. + + +The `Language.update` method now takes a batch of [`Example`](/api/example) +objects instead of the raw texts and annotations or `Doc` and `GoldParse` +objects. An [`Example`](/api/example) streamlines how data is passed around. It +stores two `Doc` objects: one for holding the gold-standard reference data, and +one for holding the predictions of the pipeline. + +For most use cases, you shouldn't have to write your own training scripts +anymore. Instead, you can use [`spacy train`](/api/cli#train) with a config file +and custom registered functions if needed. See the +[training documentation](/usage/training) for details. + + + > #### Example > > ```python @@ -253,7 +268,7 @@ Update the models in the pipeline. | Name | Type | Description | | --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | -| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | _keyword-only_ | | | | `drop` | float | The dropout rate. | | `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 6a6bb1244..f1242d193 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -9,6 +9,28 @@ api_string_name: lemmatizer api_trainable: false --- +Component for assigning base forms to tokens using rules based on part-of-speech +tags, or lookup tables. Functionality to train the component is coming soon. +Different [`Language`](/api/language) subclasses can implement their own +lemmatizer components via +[language-specific factories](/usage/processing-pipelines#factories-language). +The default data used is provided by the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) +extension package. + + + +As of v3.0, the `Lemmatizer` is a **standalone pipeline component** that can be +added to your pipeline, and not a hidden part of the vocab that runs behind the +scenes. This makes it easier to customize how lemmas should be assigned in your +pipeline. + +If the lemmatization mode is set to `"rule"` and requires part-of-speech tags to +be assigned, make sure a [`Tagger`](/api/tagger) or another component assigning +tags is available in the pipeline and runs _before_ the lemmatizer. + + + ## Config and implementation The default config is defined by the pipeline component factory and describes @@ -29,7 +51,7 @@ lemmatizers, see the | Setting | Type | Description | Default | | ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | +| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. 
| `"lookup"` | | `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | | `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | | `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | @@ -55,15 +77,15 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| -------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | [`Vocab`](/api/vocab) | The vocab. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| mode | str | The lemmatizer mode, e.g. "lookup" or "rule". Defaults to "lookup". | -| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". Defaults to `None`. | -| overwrite | bool | Whether to overwrite existing lemmas. | +| Name | Type | Description | +| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | [`Vocab`](/api/vocab) | The vocab. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. | +| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. | +| overwrite | bool | Whether to overwrite existing lemmas. | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 925c9ad2e..b481f1972 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -5,6 +5,82 @@ tag: class source: spacy/matcher/matcher.pyx --- +The `Matcher` lets you find words and phrases using rules describing their token +attributes. Rules can refer to token annotations (like the text or +part-of-speech tags), as well as lexical attributes like `Token.is_punct`. +Applying the matcher to a [`Doc`](/api/doc) gives you access to the matched +tokens in context. For in-depth examples and workflows for combining rules and +statistical models, see the [usage guide](/usage/rule-based-matching) on +rule-based matching. + +## Pattern format {#patterns} + +> ```json +> ### Example +> [ +> {"LOWER": "i"}, +> {"LEMMA": {"IN": ["like", "love"]}}, +> {"POS": "NOUN", "OP": "+"} +> ] +> ``` + +A pattern added to the `Matcher` consists of a list of dictionaries. Each +dictionary describes **one token** and its attributes. The available token +pattern keys correspond to a number of +[`Token` attributes](/api/token#attributes). 
The supported attributes for +rule-based matching are: + +| Attribute | Type |  Description | +| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | +| `ORTH` | str | The exact verbatim text of a token. | +| `TEXT` 2.1 | str | The exact verbatim text of a token. | +| `LOWER` | str | The lowercase form of the token text. | +|  `LENGTH` | int | The length of the token text. | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | +|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | +| `ENT_TYPE` | str | The token's entity label. | +| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | +| `OP` | str | Operator or quantifier to determine how often to match a token pattern. | + +Operators and quantifiers define **how often** a token pattern should be +matched: + +> ```json +> ### Example +> [ +> {"POS": "ADJ", "OP": "*"}, +> {"POS": "NOUN", "OP": "+"} +> ] +> ``` + +| OP | Description | +| --- | ---------------------------------------------------------------- | +| `!` | Negate the pattern, by requiring it to match exactly 0 times. | +| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. | +| `+` | Require the pattern to match 1 or more times. | +| `*` | Allow the pattern to match zero or more times. | + +Token patterns can also map to a **dictionary of properties** instead of a +single value to indicate whether the expected value is a member of a list or how +it compares to another value. + +> ```json +> ### Example +> [ +> {"LEMMA": {"IN": ["like", "love", "enjoy"]}}, +> {"POS": "PROPN", "LENGTH": {">=": 10}}, +> ] +> ``` + +| Attribute | Type | Description | +| -------------------------- | ---------- | --------------------------------------------------------------------------------- | +| `IN` | any | Attribute value is member of a list. | +| `NOT_IN` | any | Attribute value is _not_ member of a list. | +| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | + ## Matcher.\_\_init\_\_ {#init tag="method"} Create the rule-based `Matcher`. If `validate=True` is set, all patterns added @@ -60,7 +136,7 @@ Match a stream of documents, yielding them in turn. | Name | Type | Description | | --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A stream of documents. | +| `docs` | iterable | A stream of documents or spans. | | `batch_size` | int | The number of documents to accumulate into a working set. | | `return_matches` 2.1 | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. | | `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. 
If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. | @@ -105,11 +181,11 @@ Check whether the matcher contains rules for a match ID. ## Matcher.add {#add tag="method" new="2"} -Add a rule to the matcher, consisting of an ID key, one or more patterns, and a -callback function to act on the matches. The callback function will receive the -arguments `matcher`, `doc`, `i` and `matches`. If a pattern already exists for -the given ID, the patterns will be extended. An `on_match` callback will be -overwritten. +Add a rule to the matcher, consisting of an ID key, one or more patterns, and an +optional callback function to act on the matches. The callback function will +receive the arguments `matcher`, `doc`, `i` and `matches`. If a pattern already +exists for the given ID, the patterns will be extended. An `on_match` callback +will be overwritten. > #### Example > @@ -141,12 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] -| Name | Type | Description | -| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | -| `match_id` | str | An ID for the thing you're matching. | -| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | -| _keyword-only_ | | | -| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| Name | Type | Description | +| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | +| `match_id` | str | An ID for the thing you're matching. | +| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | +| _keyword-only_ | | | +| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | +| `greedy` 3 | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. | ## Matcher.remove {#remove tag="method" new="2"} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index bfe5c3c77..942440234 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -63,16 +63,14 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). - - | Name | Type | Description | | -------------- | ------- | ------------------------------------------------------------------------------------------- | | `vocab` | `Vocab` | The shared vocabulary. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `labels_morph` | dict | | -| `labels_pos` | dict | | +| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. | +| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. 
| ## Morphologizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 866aca096..71c7a463b 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -9,7 +9,8 @@ new: 2 The `PhraseMatcher` lets you efficiently match large terminology lists. While the [`Matcher`](/api/matcher) lets you match sequences based on lists of token descriptions, the `PhraseMatcher` accepts match patterns in the form of `Doc` -objects. +objects. See the [usage guide](/usage/rule-based-matching#phrasematcher) for +examples. ## PhraseMatcher.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index d9b8f4caf..233171779 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tagger", config=config) > ``` -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | -------------------------------------- | ----------------------------------- | -| `set_morphology` | bool | Whether to set morphological features. | `False` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | +| Setting | Type | Description | Default | +| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | +| `set_morphology` | bool | Whether to set morphological features. | `False` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx @@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#add_pipe). -| Name | Type | Description | -| ---------------- | ------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| _keyword-only_ | | | -| `set_morphology` | bool | Whether to set morphological features. | +| Name | Type | Description | +| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). 
| +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| _keyword-only_ | | | +| `set_morphology` | bool | Whether to set morphological features. | ## Tagger.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index 1efd5831c..5af540828 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -9,6 +9,12 @@ api_string_name: textcat api_trainable: true --- +The text categorizer predicts **categories over a whole document**. It can learn +one or more labels, and the labels can be mutually exclusive (i.e. one true +label per document) or non-mutually exclusive (i.e. zero or more labels may be +true per document). The multi-label setting is controlled by the model instance +that's provided. + ## Config and implementation {#config} The default config is defined by the pipeline component factory and describes @@ -29,10 +35,10 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("textcat", config=config) > ``` -| Setting | Type | Description | Default | -| -------- | ------------------------------------------ | ------------------ | ----------------------------------------------------- | -| `labels` | `Iterable[str]` | The labels to use. | `[]` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) | +| Setting | Type | Description | Default | +| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- | +| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py @@ -67,23 +73,6 @@ shortcut for this and instantiate the component using its string name and | _keyword-only_ | | | | `labels` | `Iterable[str]` | The labels to use. | - - ## TextCategorizer.\_\_call\_\_ {#call tag="method"} Apply the pipe to one document. The document is modified in place, and returned. diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index f810793ce..dce595023 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -8,7 +8,20 @@ api_string_name: tok2vec api_trainable: true --- - +Apply a "token-to-vector" model and set its outputs in the `Doc.tensor` +attribute. This is mostly useful to **share a single subnetwork** between +multiple components, e.g. to have one embedding and CNN network shared between a +[`DependencyParser`](/api/dependencyparser), [`Tagger`](/api/tagger) and +[`EntityRecognizer`](/api/entityrecognizer). + +In order to use the `Tok2Vec` predictions, subsequent components should use the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer as the tok2vec +subnetwork of their model. This layer will read data from the `doc.tensor` +attribute during prediction. During training, the `Tok2Vec` component will save +its prediction and backprop callback for each batch, so that the subsequent +components can backpropagate to the shared weights. 
This implementation is used +because it allows us to avoid relying on object identity within the models to +achieve the parameter sharing. ## Config and implementation {#config} @@ -27,9 +40,9 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("tok2vec", config=config) > ``` -| Setting | Type | Description | Default | -| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | +| Setting | Type | Description | Default | +| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py @@ -64,9 +77,11 @@ shortcut for this and instantiate the component using its string name and ## Tok2Vec.\_\_call\_\_ {#call tag="method"} -Apply the pipe to one document. The document is modified in place, and returned. -This usually happens under the hood when the `nlp` object is called on a text -and all pipeline components are applied to the `Doc` in order. Both +Apply the pipe to one document and add context-sensitive embeddings to the +`Doc.tensor` attribute, allowing them to be used as features by downstream +components. The document is modified in place, and returned. This usually +happens under the hood when the `nlp` object is called on a text and all +pipeline components are applied to the `Doc` in order. Both [`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the [`predict`](/api/tok2vec#predict) and [`set_annotations`](/api/tok2vec#set_annotations) methods. diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index b63a4adba..0b3167901 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -340,7 +340,7 @@ See the [`Transformer`](/api/transformer) API reference and ## Batchers {#batchers source="spacy/gold/batchers.py" new="3"} - + #### batch_by_words.v1 {#batch_by_words tag="registered function"} @@ -361,19 +361,16 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument > get_length = null > ``` - - -| Name | Type | Description | -| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `tolerance` | float | | -| `discard_oversize` | bool | Discard items that are longer than the specified batch length. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Type | Description | +| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `seqs` | `Iterable[Any]` | The sequences to minibatch. 
| +| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `tolerance` | float | What percentage of the size to allow batches to exceed. | +| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | #### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} - - > #### Example config > > ```ini @@ -383,34 +380,37 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument > get_length = null > ``` - +Create a batcher that creates batches of the specified size. -| Name | Type | Description | -| ------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +| Name | Type | Description | +| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | #### batch_by_padded.v1 {#batch_by_padded tag="registered function"} - - > #### Example config > > ```ini > [training.batcher] -> @batchers = "batch_by_words.v1" +> @batchers = "batch_by_padded.v1" > size = 100 -> buffer = TODO: +> buffer = 256 > discard_oversize = false > get_length = null > ``` -| Name | Type | Description | -| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `size` | `Iterable[int]` / int | The batch size. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | -| `buffer` | int | | -| `discard_oversize` | bool | Discard items that are longer than the specified batch length. | -| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence and returns its length. Defaults to the built-in `len()` if not set. | +Minibatch a sequence by the size of padded batches that would result, with +sequences binned by length within a window. The padded size is defined as the +maximum length of sequences within the batch multiplied by the number of +sequences in the batch. 
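+
+As a rough sketch of what this strategy does (assuming the underlying helper in
+`spacy.gold.batchers` is called directly rather than via the config):
+
+```python
+from spacy.gold.batchers import minibatch_by_padded_size
+
+# Toy "sequences" of different lengths; get_length defaults to len()
+seqs = ["a", "bb", "ccc", "dddd", "eeeee", "ffffff"]
+for batch in minibatch_by_padded_size(seqs, size=8, buffer=6):
+    # Padded size = longest sequence in the batch * number of sequences
+    print([len(seq) for seq in batch])
+```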
+ +| Name | Type | Description | +| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | +| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. | +| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. | +| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | ## Training data and alignment {#gold source="spacy/gold"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 6b6be6bd0..57f06cd9e 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -25,8 +25,15 @@ work out-of-the-box. -This pipeline component lets you use transformer models in your pipeline. The -component assigns the output of the transformer to the Doc's extension +This pipeline component lets you use transformer models in your pipeline. +Supports all models that are available via the +[HuggingFace `transformers`](https://huggingface.co/transformers) library. +Usually you will connect subsequent components to the shared transformer using +the [TransformerListener](/api/architectures#TransformerListener) layer. This +works similarly to spaCy's [Tok2Vec](/api/tok2vec) component and +[Tok2VecListener](/api/architectures/Tok2VecListener) sublayer. + +The component assigns the output of the transformer to the `Doc`'s extension attributes. We also calculate an alignment between the word-piece tokens and the spaCy tokenization, so that we can use the last hidden states to set the `Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy @@ -53,11 +60,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. 
| [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -86,18 +93,22 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p > trf = Transformer(nlp.vocab, model) > ``` -Create a new pipeline instance. In your application, you would normally use a -shortcut for this and instantiate the component using its string name and -[`nlp.add_pipe`](/api/language#create_pipe). +Construct a `Transformer` component. One or more subsequent spaCy components can +use the transformer outputs as features in its model, with gradients +backpropagated to the single shared weights. The activations from the +transformer are saved in the [`Doc._.trf_data`](#custom-attributes) extension +attribute. You can also provide a callback to set additional annotations. In +your application, you would normally use a shortcut for this and instantiate the +component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. | -| _keyword-only_ | | | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. 
| +| Name | Type | Description | +| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. | +| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | ## Transformer.\_\_call\_\_ {#call tag="method"} @@ -184,7 +195,10 @@ Apply the pipeline's model to a batch of docs, without modifying them. ## Transformer.set_annotations {#set_annotations tag="method"} -Modify a batch of documents, using pre-computed scores. +Assign the extracted features to the Doc objects. By default, the +[`TransformerData`](/api/transformer#transformerdata) object is written to the +[`Doc._.trf_data`](#custom-attributes) attribute. Your annotation_setter +callback is then called, if provided. > #### Example > @@ -201,8 +215,19 @@ Modify a batch of documents, using pre-computed scores. ## Transformer.update {#update tag="method"} -Learn from a batch of documents and gold-standard information, updating the -pipe's model. Delegates to [`predict`](/api/transformer#predict). +Prepare for an update to the transformer. Like the [`Tok2Vec`](/api/tok2vec) +component, the `Transformer` component is unusual in that it does not receive +"gold standard" annotations to calculate a weight update. The optimal output of +the transformer data is unknown – it's a hidden layer inside the network that is +updated by backpropagating from output layers. + +The `Transformer` component therefore does **not** perform a weight update +during its own `update` method. Instead, it runs its transformer model and +communicates the output and the backpropagation callback to any **downstream +components** that have been connected to it via the +[TransformerListener](/api/architectures#TransformerListener) sublayer. If there +are multiple listeners, the last layer will actually backprop to the transformer +and call the optimizer, while the others simply increment the gradients. > #### Example > @@ -212,15 +237,15 @@ pipe's model. Delegates to [`predict`](/api/transformer#predict). > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. 
| -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | -| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | -| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Type | Description | +| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -394,21 +419,23 @@ Split a `TransformerData` object that represents a batch into a list with one | ----------- | ----------------------- | ----------- | | **RETURNS** | `List[TransformerData]` | | -## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} - - +## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} Span getters are functions that take a batch of [`Doc`](/api/doc) objects and return a lists of [`Span`](/api/span) objects for each doc, to be processed by -the transformer. The returned spans can overlap. Span getters can be referenced -in the config's `[components.transformer.model.get_spans]` block to customize -the sequences processed by the transformer. You can also register custom span -getters using the `@registry.span_getters` decorator. +the transformer. This is used to manage long documents, by cutting them into +smaller sequences before running the transformer. The spans are allowed to +overlap, and you can also omit sections of the Doc if they are not relevant. + +Span getters can be referenced in the `[components.transformer.model.get_spans]` +block of the config to customize the sequences processed by the transformer. You +can also register custom span getters using the `@spacy.registry.span_getters` +decorator. > #### Example > > ```python -> @registry.span_getters("sent_spans.v1") +> @spacy.registry.span_getters("sent_spans.v1") > def configure_get_sent_spans() -> Callable: > def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]: > return [list(doc.sents) for doc in docs] @@ -421,15 +448,55 @@ getters using the `@registry.span_getters` decorator. | `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | | **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. 
| -The following built-in functions are available: +### doc_spans.v1 {#doc_spans tag="registered function"} - +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "doc_spans.v1" +> ``` -| Name | Description | -| ------------------ | ------------------------------------------------------------------ | -| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | -| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. | -| `strided_spans.v1` | | +Create a span getter that uses the whole document as its spans. This is the best +approach if your [`Doc`](/api/doc) objects already refer to relatively short +texts. + +### sent_spans.v1 {#sent_spans tag="registered function"} + +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "sent_spans.v1" +> ``` + +Create a span getter that uses sentence boundary markers to extract the spans. +This requires sentence boundaries to be set (e.g. by the +[`Sentencizer`](/api/sentencizer)), and may result in somewhat uneven batches, +depending on the sentence lengths. However, it does provide the transformer with +more meaningful windows to attend over. + +### strided_spans.v1 {#strided_spans tag="registered function"} + +> #### Example config +> +> ```ini +> [transformer.model.get_spans] +> @span_getters = "strided_spans.v1" +> window = 128 +> stride = 96 +> ``` + +Create a span getter for strided spans. If you set the `window` and `stride` to +the same value, the spans will cover each token once. Setting `stride` lower +than `window` will allow for an overlap, so that some tokens are counted twice. +This can be desirable, because it allows all tokens to have both a left and +right context. + +| Name | Type | Description | +| --------- | ---- | ---------------- | +|  `window` | int | The window size. | +| `stride` | int | The stride size. | ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index d321b7cb9..42ba44dc8 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -1,54 +1,88 @@ -The central data structures in spaCy are the `Doc` and the `Vocab`. The `Doc` -object owns the **sequence of tokens** and all their annotations. The `Vocab` -object owns a set of **look-up tables** that make common information available -across documents. By centralizing strings, word vectors and lexical attributes, -we avoid storing multiple copies of this data. This saves memory, and ensures -there's a **single source of truth**. +The central data structures in spaCy are the [`Language`](/api/language) class, +the [`Vocab`](/api/vocab) and the [`Doc`](/api/doc) object. The `Language` class +is used to process a text and turn it into a `Doc` object. It's typically stored +as a variable called `nlp`. The `Doc` object owns the **sequence of tokens** and +all their annotations. By centralizing strings, word vectors and lexical +attributes in the `Vocab`, we avoid storing multiple copies of this data. This +saves memory, and ensures there's a **single source of truth**. Text annotations are also designed to allow a single source of truth: the `Doc` -object owns the data, and `Span` and `Token` are **views that point into it**. 
-The `Doc` object is constructed by the `Tokenizer`, and then **modified in -place** by the components of the pipeline. The `Language` object coordinates -these components. It takes raw text and sends it through the pipeline, returning -an **annotated document**. It also orchestrates training and serialization. +object owns the data, and [`Span`](/api/span) and [`Token`](/api/token) are +**views that point into it**. The `Doc` object is constructed by the +[`Tokenizer`](/api/tokenizer), and then **modified in place** by the components +of the pipeline. The `Language` object coordinates these components. It takes +raw text and sends it through the pipeline, returning an **annotated document**. +It also orchestrates training and serialization. - + ![Library architecture](../../images/architecture.svg) ### Container objects {#architecture-containers} -| Name | Description | -| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [`Doc`](/api/doc) | A container for accessing linguistic annotations. | -| [`Span`](/api/span) | A slice from a `Doc` object. | -| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. | -| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. | -| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. | +| Name | Description | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`Language`](/api/language) | Processing class that turns text into `Doc` objects. Different languages implement their own subclasses of it. The variable is typically called `nlp`. | +| [`Doc`](/api/doc) | A container for accessing linguistic annotations. | +| [`Span`](/api/span) | A slice from a `Doc` object. | +| [`Token`](/api/token) | An individual token — i.e. a word, punctuation symbol, whitespace, etc. | +| [`Lexeme`](/api/lexeme) | An entry in the vocabulary. It's a word type with no context, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse etc. | +| [`Example`](/api/example) | A collection of training annotations, containing two `Doc` objects: the reference data and the predictions. | +| [`DocBin`](/api/docbin) | A collection of `Doc` objects for efficient binary serialization. Also used for [training data](/api/data-formats#binary-training). | ### Processing pipeline {#architecture-pipeline} -| Name | Description | -| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. | -| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. | -| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | -| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | -| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. 
| -| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. | -| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. | -| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. | -| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | -| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | -| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | -| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. | -| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +The processing pipeline consists of one or more **pipeline components** that are +called on the `Doc` in order. The tokenizer runs before the components. Pipeline +components can be added using [`Language.add_pipe`](/api/language#add_pipe). +They can contain a statistical model and trained weights, or only make +rule-based modifications to the `Doc`. spaCy provides a range of built-in +components for different language processing tasks and also allows adding +[custom components](/usage/processing-pipelines#custom-components). + +![The processing pipeline](../../images/pipeline.svg) + +| Name | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------- | +| [`Tokenizer`](/api/tokenizer) | Segment raw text and create `Doc` objects from the words. | +| [`Tok2Vec`](/api/tok2vec) | Apply a "token-to-vector" model and set its outputs. | +| [`Transformer`](/api/transformer) | Use a transformer model and set its outputs. | +| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | +| [`Morphologizer`](/api/morphologizer) | Predict morphological features and coarse-grained part-of-speech tags. | +| [`Tagger`](/api/tagger) | Predict part-of-speech tags. | +| [`AttributeRuler`](/api/attributeruler) | Set token attributes using matcher rules. | +| [`DependencyParser`](/api/dependencyparser) | Predict syntactic dependencies. | +| [`EntityRecognizer`](/api/entityrecognizer) | Predict named entities, e.g. persons or products. | +| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | +| [`EntityLinker`](/api/entitylinker) | Disambiguate named entities to nodes in a knowledge base. | +| [`TextCategorizer`](/api/textcategorizer) | Predict categories or labels over the whole document. | +| [`Sentencizer`](/api/sentencizer) | Implement rule-based sentence boundary detection that doesn't require the dependency parse. | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Predict sentence boundaries. | +| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | +| [`Pipe`](/api/pipe) | Base class that all trainable pipeline components inherit from. | + +### Matchers {#architecture-matchers} + +Matchers help you find and extract information from [`Doc`](/api/doc) objects +based on match patterns describing the sequences you're looking for. A matcher +operates on a `Doc` and gives you access to the matched tokens **in context**. 
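+
+As a minimal sketch (the pattern and its name here are arbitrary examples, not
+a built-in rule set), a `Matcher` rule can be added and applied like this:
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab)
+# One pattern: the lowercase form "hello", followed by a punctuation token
+matcher.add("HELLO_PUNCT", [[{"LOWER": "hello"}, {"IS_PUNCT": True}]])
+
+doc = nlp("Hello, world!")
+for match_id, start, end in matcher(doc):
+    # Each match gives you the matched tokens as a slice of the original Doc
+    print(nlp.vocab.strings[match_id], doc[start:end].text)
+```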
+
+| Name                                          | Description                                                                                                                                                                         |
+| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| [`Matcher`](/api/matcher)                     | Match sequences of tokens, based on pattern rules, similar to regular expressions.                                                                                                  |
+| [`PhraseMatcher`](/api/phrasematcher)         | Match sequences of tokens based on phrases.                                                                                                                                         |
+| [`DependencyMatcher`](/api/dependencymatcher) | Match sequences of tokens based on dependency trees using the [Semgrex syntax](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html). |

### Other classes {#architecture-other}

-| Name                              | Description                                                                    |
-| --------------------------------- | ------------------------------------------------------------------------------ |
-| [`Vocab`](/api/vocab)             | A lookup table for the vocabulary that allows you to access `Lexeme` objects.  |
-| [`StringStore`](/api/stringstore) | Map strings to and from hash values.                                           |
-| [`Vectors`](/api/vectors)         | Container class for vector data keyed by string.                               |
-| [`Example`](/api/example)         | Collection for training annotations.                                           |
+| Name                                  | Description                                                                                                        |
+| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| [`Vocab`](/api/vocab)                 | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects.                |
+| [`StringStore`](/api/stringstore)     | Map strings to and from hash values.                                                                               |
+| [`Vectors`](/api/vectors)             | Container class for vector data keyed by string.                                                                   |
+| [`Lookups`](/api/lookups)             | Container for convenient access to large lookup tables and dictionaries.                                          |
+| [`Morphology`](/api/morphology)       | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag.  |
+| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis.                                                                                          |
+| [`KnowledgeBase`](/api/kb)            | Storage for entities and aliases of a knowledge base for entity linking.                                           |
+| [`Scorer`](/api/scorer)               | Compute evaluation scores.                                                                                          |
+| [`Corpus`](/api/corpus)               | Class for managing annotated corpora for training and evaluation data.                                             |
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5ad59482f..589cef44c 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -750,16 +750,13 @@ print([w.text for w in nlp("gimme that")]) # ['gim', 'me', 'that']

The special case doesn't have to match an entire whitespace-delimited substring.
The tokenizer will incrementally split off punctuation, and keep looking up the
-remaining substring:
+remaining substring. The special case rules also have precedence over the
+punctuation splitting.

```python
assert "gimme" not in [w.text for w in nlp("gimme!")]
assert "gimme" not in [w.text for w in nlp('("...gimme...?")')]
-```

-The special case rules have precedence over the punctuation splitting:
-
-```python
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])
assert len(nlp("...gimme...?")) == 1
```
@@ -813,19 +810,6 @@ domain. There are six things you may need to define:

6. An optional boolean function `url_match`, which is similar to `token_match`
   except that prefixes and suffixes are removed before applying the match.
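
For example, these settings might be combined by building a `Tokenizer` from the
language defaults and swapping in your own `url_match` (the regular expression
below is only a simplified placeholder, not spaCy's built-in URL pattern):

```python
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex

nlp = spacy.blank("en")
# Re-use the language defaults for prefixes, suffixes, infixes and exceptions
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
infix_re = compile_infix_regex(nlp.Defaults.infixes)
simple_url_re = re.compile(r"^https?://\S+$")  # placeholder URL pattern

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    rules=nlp.Defaults.tokenizer_exceptions,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    url_match=simple_url_re.match,
)
# The trailing "!" is split off as a suffix before url_match is applied
print([t.text for t in nlp("Check out https://example.com!")])
```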
- - -In spaCy v2.2.2-v2.2.4, the `token_match` was equivalent to the `url_match` -above and there was no match pattern applied before prefixes and suffixes were -analyzed. As of spaCy v2.3.0, the `token_match` has been reverted to its -behavior in v2.2.1 and earlier with precedence over prefixes and suffixes. - -The `url_match` is introduced in v2.3.0 to handle cases like URLs where the -tokenizer should remove prefixes and suffixes (e.g., a comma at the end of a -URL) before applying the match. - - - You shouldn't usually need to create a `Tokenizer` subclass. Standard usage is to use `re.compile()` to build a regular expression object, and pass its `.search()` and `.finditer()` methods: @@ -905,12 +889,13 @@ function that behaves the same way. -If you're using a statistical model, writing to the `nlp.Defaults` or -`English.Defaults` directly won't work, since the regular expressions are read -from the model and will be compiled when you load it. If you modify -`nlp.Defaults`, you'll only see the effect if you call -[`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the tokenizer -loaded from a statistical model, you should modify `nlp.tokenizer` directly. +If you're using a statistical model, writing to the +[`nlp.Defaults`](/api/language#defaults) or `English.Defaults` directly won't +work, since the regular expressions are read from the model and will be compiled +when you load it. If you modify `nlp.Defaults`, you'll only see the effect if +you call [`spacy.blank`](/api/top-level#spacy.blank). If you want to modify the +tokenizer loaded from a statistical model, you should modify `nlp.tokenizer` +directly. @@ -961,51 +946,50 @@ and language-specific definitions such as [`lang/de/punctuation.py`](https://github.com/explosion/spaCy/blob/master/spacy/lang/de/punctuation.py) for German. -### Hooking an arbitrary tokenizer into the pipeline {#custom-tokenizer} +### Hooking a custom tokenizer into the pipeline {#custom-tokenizer} The tokenizer is the first component of the processing pipeline and the only one that can't be replaced by writing to `nlp.pipeline`. This is because it has a different signature from all the other components: it takes a text and returns a -`Doc`, whereas all other components expect to already receive a tokenized `Doc`. +[`Doc`](/api/doc), whereas all other components expect to already receive a +tokenized `Doc`. ![The processing pipeline](../images/pipeline.svg) To overwrite the existing tokenizer, you need to replace `nlp.tokenizer` with a -custom function that takes a text, and returns a `Doc`. +custom function that takes a text, and returns a [`Doc`](/api/doc). + +> #### Creating a Doc +> +> Constructing a [`Doc`](/api/doc) object manually requires at least two +> arguments: the shared `Vocab` and a list of words. Optionally, you can pass in +> a list of `spaces` values indicating whether the token at this position is +> followed by a space (default `True`). See the section on +> [pre-tokenized text](#own-annotations) for more info. +> +> ```python +> words = ["Let", "'s", "go", "!"] +> spaces = [False, True, False, False] +> doc = Doc(nlp.vocab, words=words, spaces=spaces) +> ``` ```python -nlp = spacy.load("en_core_web_sm") +nlp = spacy.blank("en") nlp.tokenizer = my_tokenizer ``` -| Argument | Type | Description | -| ----------- | ----- | ------------------------- | -| `text` | str | The raw text to tokenize. | -| **RETURNS** | `Doc` | The tokenized document. 
| +| Argument | Type | Description | +| ----------- | ----------------- | ------------------------- | +| `text` | str | The raw text to tokenize. | +| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. | - +#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example} -In spaCy v1.x, you had to add a custom tokenizer by passing it to the `make_doc` -keyword argument, or by passing a tokenizer "factory" to `create_make_doc`. This -was unnecessarily complicated. Since spaCy v2.0, you can write to -`nlp.tokenizer` instead. If your tokenizer needs the vocab, you can write a -function and use `nlp.vocab`. - -```diff -- nlp = spacy.load("en_core_web_sm", make_doc=my_tokenizer) -- nlp = spacy.load("en_core_web_sm", create_make_doc=my_tokenizer_factory) - -+ nlp.tokenizer = my_tokenizer -+ nlp.tokenizer = my_tokenizer_factory(nlp.vocab) -``` - - - -### Example: A custom whitespace tokenizer {#custom-tokenizer-example} - -To construct the tokenizer, we usually want attributes of the `nlp` pipeline. -Specifically, we want the tokenizer to hold a reference to the vocabulary -object. Let's say we have the following class as our tokenizer: +Here's an example of the most basic whitespace tokenizer. It takes the shared +vocab, so it can construct `Doc` objects. When it's called on a text, it returns +a `Doc` object consisting of the text split on single space characters. We can +then overwrite the `nlp.tokenizer` attribute with an instance of our custom +tokenizer. ```python ### {executable="true"} @@ -1017,68 +1001,189 @@ class WhitespaceTokenizer: self.vocab = vocab def __call__(self, text): - words = text.split(' ') - # All tokens 'own' a subsequent space character in this tokenizer - spaces = [True] * len(words) - return Doc(self.vocab, words=words, spaces=spaces) + words = text.split(" ") + return Doc(self.vocab, words=words) -nlp = spacy.load("en_core_web_sm") +nlp = spacy.blank("en") nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) doc = nlp("What's happened to me? he thought. It wasn't a dream.") -print([t.text for t in doc]) +print([token.text for token in doc]) ``` -As you can see, we need a `Vocab` instance to construct this — but we won't have -it until we get back the loaded `nlp` object. The simplest solution is to build -the tokenizer in two steps. This also means that you can reuse the "tokenizer -factory" and initialize it with different instances of `Vocab`. +#### Example 2: Third-party tokenizers (BERT word pieces) {#custom-tokenizer-example2} -### Bringing your own annotations {#own-annotations} +You can use the same approach to plug in any other third-party tokenizers. Your +custom callable just needs to return a `Doc` object with the tokens produced by +your tokenizer. In this example, the wrapper uses the **BERT word piece +tokenizer**, provided by the +[`tokenizers`](https://github.com/huggingface/tokenizers) library. The tokens +available in the `Doc` object returned by spaCy now match the exact word pieces +produced by the tokenizer. -spaCy generally assumes by default that your data is raw text. However, +> #### 💡 Tip: spacy-transformers +> +> If you're working with transformer models like BERT, check out the +> [`spacy-transformers`](https://github.com/explosion/spacy-transformers) +> extension package and [documentation](/usage/transformers). It includes a +> pipeline component for using pretrained transformer weights and **training +> transformer models** in spaCy, as well as helpful utilities for aligning word +> pieces to linguistic tokenization. 
+
+```python
+### Custom BERT word piece tokenizer
+from tokenizers import BertWordPieceTokenizer
+from spacy.tokens import Doc
+import spacy
+
+class BertTokenizer:
+    def __init__(self, vocab, vocab_file, lowercase=True):
+        self.vocab = vocab
+        self._tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=lowercase)
+
+    def __call__(self, text):
+        tokens = self._tokenizer.encode(text)
+        words = []
+        spaces = []
+        for i, (word, (start, end)) in enumerate(zip(tokens.tokens, tokens.offsets)):
+            words.append(word)
+            if i < len(tokens.tokens) - 1:
+                # If next start != current end we assume a space in between
+                next_start, next_end = tokens.offsets[i + 1]
+                spaces.append(next_start > end)
+            else:
+                spaces.append(True)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+nlp = spacy.blank("en")
+nlp.tokenizer = BertTokenizer(nlp.vocab, "bert-base-uncased-vocab.txt")
+doc = nlp("Justin Drew Bieber is a Canadian singer, songwriter, and actor.")
+print(doc.text, [token.text for token in doc])
+# [CLS]justin drew bi##eber is a canadian singer, songwriter, and actor.[SEP]
+# ['[CLS]', 'justin', 'drew', 'bi', '##eber', 'is', 'a', 'canadian', 'singer',
+#  ',', 'songwriter', ',', 'and', 'actor', '.', '[SEP]']
+```
+
+
+
+Keep in mind that your model's result may be less accurate if the tokenization
+during training differs from the tokenization at runtime. So if you modify a
+pretrained model's tokenization afterwards, it may produce very different
+predictions. You should therefore train your model with the **same tokenizer**
+it will be using at runtime. See the docs on
+[training with custom tokenization](#custom-tokenizer-training) for details.
+
+
+
+#### Training with custom tokenization {#custom-tokenizer-training new="3"}
+
+spaCy's [training config](/usage/training#config) describes the settings,
+hyperparameters, pipeline and tokenizer used for constructing and training the
+model. The `[nlp.tokenizer]` block refers to a **registered function** that
+takes the `nlp` object and returns a tokenizer. Here, we're registering a
+function called `whitespace_tokenizer` in the
+[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+construct your tokenizer during training, you can pass in your Python file by
+setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "whitespace_tokenizer"
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("whitespace_tokenizer")
+def create_whitespace_tokenizer():
+    def create_tokenizer(nlp):
+        return WhitespaceTokenizer(nlp.vocab)
+
+    return create_tokenizer
+```
+
+Registered functions can also take arguments that are then passed in from the
+config. This allows you to quickly change and keep track of different settings.
+Here, the registered function called `bert_word_piece_tokenizer` takes two
+arguments: the path to a vocabulary file and whether to lowercase the text. The
+Python type hints `str` and `bool` ensure that the received values have the
+correct type.
+
+> #### config.cfg
+>
+> ```ini
+> [nlp.tokenizer]
+> @tokenizers = "bert_word_piece_tokenizer"
+> vocab_file = "bert-base-uncased-vocab.txt"
+> lowercase = true
+> ```
+
+```python
+### functions.py {highlight="1"}
+@spacy.registry.tokenizers("bert_word_piece_tokenizer")
+def create_bert_word_piece_tokenizer(vocab_file: str, lowercase: bool):
+    def create_tokenizer(nlp):
+        return BertTokenizer(nlp.vocab, vocab_file, lowercase)
+
+    return create_tokenizer
+```
+
+To avoid hard-coding local paths into your config file, you can also set the
+vocab path on the CLI by using the `--nlp.tokenizer.vocab_file`
+[override](/usage/training#config-overrides) when you run
+[`spacy train`](/api/cli#train). For more details on using registered functions,
+see the docs in [training with custom code](/usage/training#custom-code).
+
+
+
+Remember that a registered function should always be a function that spaCy
+**calls to create something**, not the "something" itself. In this case, it
+**creates a function** that takes the `nlp` object and returns a callable that
+takes a text and returns a `Doc`.
+
+
+
+#### Using pre-tokenized text {#own-annotations}
+
+spaCy generally assumes by default that your data is **raw text**. However,
sometimes your data is partially annotated, e.g. with pre-existing tokenization,
-part-of-speech tags, etc. The most common situation is that you have pre-defined
-tokenization. If you have a list of strings, you can create a `Doc` object
-directly. Optionally, you can also specify a list of boolean values, indicating
-whether each word has a subsequent space.
+part-of-speech tags, etc. The most common situation is that you have
+**pre-defined tokenization**. If you have a list of strings, you can create a
+[`Doc`](/api/doc) object directly. Optionally, you can also specify a list of
+boolean values, indicating whether each word is followed by a space.
+
+> #### ✏️ Things to try
+>
+> 1. Change a boolean value in the list of `spaces`. You should see it reflected
+>    in the `doc.text` and whether the token is followed by a space.
+> 2. Remove `spaces=spaces` from the `Doc`. You should see that every token is
+>    now followed by a space.
+> 3. Copy-paste a random sentence from the internet and manually construct a
+>    `Doc` with `words` and `spaces` so that the `doc.text` matches the original
+>    input text.

```python
### {executable="true"}
import spacy
from spacy.tokens import Doc
-from spacy.lang.en import English

-nlp = English()
-doc = Doc(nlp.vocab, words=["Hello", ",", "world", "!"],
-          spaces=[False, True, False, False])
+nlp = spacy.blank("en")
+words = ["Hello", ",", "world", "!"]
+spaces = [False, True, False, False]
+doc = Doc(nlp.vocab, words=words, spaces=spaces)
+print(doc.text)
print([(t.text, t.text_with_ws, t.whitespace_) for t in doc])
```

-If provided, the spaces list must be the same length as the words list. The
+If provided, the spaces list must be the **same length** as the words list. The
spaces list affects the `doc.text`, `span.text`, `token.idx`, `span.start_char`
and `span.end_char` attributes. If you don't provide a `spaces` sequence, spaCy
-will assume that all words are whitespace delimited.
+will assume that all words are followed by a space. Once you have a
+[`Doc`](/api/doc) object, you can write to its attributes to set the
+part-of-speech tags, syntactic dependencies, named entities and other
+attributes.
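+
+For example, a named entity can be added to a manually constructed `Doc` by
+assigning a `Span` to `doc.ents` (the example text and labels are arbitrary,
+this is only a sketch of the workflow):
+
+```python
+import spacy
+from spacy.tokens import Doc, Span
+
+nlp = spacy.blank("en")
+words = ["Apple", "is", "opening", "an", "office", "in", "Berlin", "."]
+spaces = [True, True, True, True, True, True, False, False]
+doc = Doc(nlp.vocab, words=words, spaces=spaces)
+# Mark "Apple" as an ORG entity and "Berlin" as a GPE entity
+doc.ents = [Span(doc, 0, 1, label="ORG"), Span(doc, 6, 7, label="GPE")]
+print([(ent.text, ent.label_) for ent in doc.ents])
+```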
-```python -### {executable="true"} -import spacy -from spacy.tokens import Doc -from spacy.lang.en import English - -nlp = English() -bad_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"]) -good_spaces = Doc(nlp.vocab, words=["Hello", ",", "world", "!"], - spaces=[False, True, False, False]) - -print(bad_spaces.text) # 'Hello , world !' -print(good_spaces.text) # 'Hello, world!' -``` - -Once you have a [`Doc`](/api/doc) object, you can write to its attributes to set -the part-of-speech tags, syntactic dependencies, named entities and other -attributes. For details, see the respective usage pages. - -### Aligning tokenization {#aligning-tokenization} +#### Aligning tokenization {#aligning-tokenization} spaCy's tokenization is non-destructive and uses language-specific rules optimized for compatibility with treebank annotations. Other tools and resources diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 2e07eff48..d7c3d49f8 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -979,8 +979,8 @@ added via [`nlp.add_pipe`](/api/language#add_pipe). When the `nlp` object is called on a text, it will find matches in the `doc` and add them as entities to the `doc.ents`, using the specified pattern label as the entity label. If any matches were to overlap, the pattern matching most tokens takes priority. If -they also happen to be equally long, then the match occuring first in the Doc is -chosen. +they also happen to be equally long, then the match occurring first in the `Doc` +is chosen. ```python ### {executable="true"} diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index d20d87863..36f934e96 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -6,25 +6,97 @@ menu: - ['New Features', 'features'] - ['Backwards Incompatibilities', 'incompat'] - ['Migrating from v2.x', 'migrating'] - - ['Migrating plugins', 'plugins'] --- ## Summary {#summary} ## New Features {#features} +### New training workflow and config system {#features-training} + +### Transformer-based pipelines {#features-transformers} + +### Custom models using any framework {#feautres-custom-models} + +### Manage end-to-end workflows with projects {#features-projects} + +### New built-in pipeline components {#features-pipeline-components} + +| Name | Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation. | +| [`Morphologizer`](/api/morphologizer) | Trainable component to predict morphological features. | +| [`Lemmatizer`](/api/lemmatizer) | Standalone component for rule-based and lookup lemmatization. | +| [`AttributeRuler`](/api/attributeruler) | Component for setting token attributes using match patterns. | +| [`Transformer`](/api/transformer) | Component for using [transformer models](/usage/transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). 
|
+
+### New and improved pipeline component APIs {#features-components}
+
+- `Language.factory`, `Language.component`
+- `Language.analyze_pipes`
+- Adding components from other models
+
+### Type hints and type-based data validation {#features-types}
+
+spaCy v3.0 officially drops support for Python 2 and now requires **Python
+3.6+**. This also means that the code base can take full advantage of
+[type hints](https://docs.python.org/3/library/typing.html). spaCy's user-facing
+API that's implemented in pure Python (as opposed to Cython) now comes with type
+hints. The new version of spaCy's machine learning library
+[Thinc](https://thinc.ai) also features extensive
+[type support](https://thinc.ai/docs/usage-type-checking/), including custom
+types for models and arrays, and a custom `mypy` plugin that can be used to
+type-check model definitions.
+
+For data validation, spaCy v3.0 adopts
+[`pydantic`](https://github.com/samuelcolvin/pydantic). It also powers the data
+validation of Thinc's [config system](https://thinc.ai/docs/usage-config), which
+lets you register **custom functions with typed arguments**, reference them
+in your config and see validation errors if the argument values don't match.
+
+### CLI
+
+| Name                                    | Description                                                                                               |
+| --------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
+| [`init config`](/api/cli#init-config)   | Initialize a [training config](/usage/training) file for a blank language or auto-fill a partial config.  |
+| [`debug config`](/api/cli#debug-config) | Debug a [training config](/usage/training) file and show validation errors.                               |
+| [`project`](/api/cli#project)           | Subcommand for cloning and running [spaCy projects](/usage/projects).                                     |
+
 ## Backwards Incompatibilities {#incompat}

-### Removed or renamed objects, methods, attributes and arguments {#incompat-removed}
+As always, we've tried to keep the breaking changes to a minimum and focus on
+changes that were necessary to support the new features, fix problems or improve
+usability. The following section lists the relevant changes to the user-facing
+API. For specific examples of how to rewrite your code, check out the
+[migration guide](#migrating).

-| Removed                                                   | Replacement                               |
-| ---------------------------------------------------------- | ----------------------------------------- |
-| `GoldParse`                                                | [`Example`](/api/example)                 |
-| `GoldCorpus`                                               | [`Corpus`](/api/corpus)                   |
-| `spacy debug-data`                                         | [`spacy debug data`](/api/cli#debug-data) |
-| `spacy link`, `util.set_data_path`, `util.get_data_path`  | not needed, model symlinks are deprecated |
+### Compatibility {#incompat-compat}

-### Removed deprecated methods, attributes and arguments {#incompat-removed-deprecated}
+- spaCy now requires **Python 3.6+**.
+
+### API changes {#incompat-api}
+
+- [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
+  the component factory instead of the component function.
+- **Custom pipeline components** now need to be decorated with the
+  [`@Language.component`](/api/language#component) or
+  [`@Language.factory`](/api/language#factory) decorator.
+- [`Language.update`](/api/language#update) now takes a batch of
+  [`Example`](/api/example) objects instead of raw texts and annotations, or
+  `Doc` and `GoldParse` objects.
+- The `Language.disable_pipes` contextmanager has been replaced by
+  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
+  disable or enable components, as shown in the sketch below.
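+
+As a sketch of the new component API (the component and its name are only an
+illustration, not a built-in), a stateless custom component is now registered
+and added by its string name:
+
+```python
+import spacy
+from spacy.language import Language
+
+@Language.component("my_component")
+def my_component(doc):
+    # Modify the Doc in place and return it
+    return doc
+
+nlp = spacy.blank("en")
+nlp.add_pipe("my_component")  # v3: add the component by its string name
+
+# Temporarily run only the tokenizer and this component
+with nlp.select_pipes(enable=["my_component"]):
+    doc = nlp("This text only runs through the enabled components.")
+```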
+ +### Removed or renamed API {#incompat-removed} + +| Removed | Replacement | +| -------------------------------------------------------- | ----------------------------------------------------- | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes) | +| `GoldParse` | [`Example`](/api/example) | +| `GoldCorpus` | [`Corpus`](/api/corpus) | +| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | +| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, model symlinks are deprecated | The following deprecated methods, attributes and arguments were removed in v3.0. Most of them have been **deprecated for a while** and many would previously @@ -214,17 +286,14 @@ python -m spacy package ./model ./packages - python setup.py sdist ``` -## Migration notes for plugin maintainers {#plugins} +#### Migration notes for plugin maintainers {#migrating-plugins} Thanks to everyone who's been contributing to the spaCy ecosystem by developing and maintaining one of the many awesome [plugins and extensions](/universe). -We've tried to keep breaking changes to a minimum and make it as easy as -possible for you to upgrade your packages for spaCy v3. - -### Custom pipeline components - -The most common use case for plugins is providing pipeline components and -extension attributes. +We've tried to make it as easy as possible for you to upgrade your packages for +spaCy v3. The most common use case for plugins is providing pipeline components +and extension attributes. When migrating your plugin, double-check the +following: - Use the [`@Language.factory`](/api/language#factory) decorator to register your component and assign it a name. This allows users to refer to your diff --git a/website/src/components/code.js b/website/src/components/code.js index a51986634..952014ed5 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -11,7 +11,7 @@ import Link from './link' import GitHubCode from './github' import classes from '../styles/code.module.sass' -const WRAP_THRESHOLD = 15 +const WRAP_THRESHOLD = 16 export default props => (