diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index ac5987aa4..11ad564ec 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -16,7 +16,7 @@ from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc from spacy.gold import Example from spacy.util import compounding, minibatch, minibatch_by_words -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy from collections import defaultdict diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 95c2f28bd..0e0d4d4c3 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -20,20 +20,20 @@ seed = 0 accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. -scores = ["speed", "tags_acc", "uas", "las", "ents_f"] -score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} -# These settings are invalid for the transformer models. +eval_batch_size = 128 +score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} init_tok2vec = null discard_oversize = false -omit_extra_lookups = false batch_by = "words" -use_gpu = -1 raw_text = null tag_map = null +vectors = null +base_model = null +morph_rules = null [training.batch_size] @schedules = "compounding.v1" -start = 1000 +start = 100 stop = 1000 compound = 1.001 @@ -46,74 +46,79 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 1e-8 -#learn_rate = 0.001 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.001 +learn_rate = 0.001 [nlp] lang = "en" -base_model = null -vectors = null +load_vocab_data = false +pipeline = ["tok2vec", "ner", "tagger", "parser"] -[nlp.pipeline] +[nlp.tokenizer] +@tokenizers = "spacy.Tokenizer.v1" -[nlp.pipeline.tok2vec] +[nlp.lemmatizer] +@lemmatizers = "spacy.Lemmatizer.v1" + +[components] + +[components.tok2vec] factory = "tok2vec" - -[nlp.pipeline.ner] +[components.ner] factory = "ner" learn_tokens = false min_action_freq = 1 -[nlp.pipeline.tagger] +[components.tagger] factory = "tagger" -[nlp.pipeline.parser] +[components.parser] factory = "parser" learn_tokens = false min_action_freq = 30 -[nlp.pipeline.tagger.model] +[components.tagger.model] @architectures = "spacy.Tagger.v1" -[nlp.pipeline.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} -[nlp.pipeline.parser.model] +[components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 128 maxout_pieces = 2 use_upper = true -[nlp.pipeline.parser.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} -[nlp.pipeline.ner.model] +[components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 3 hidden_width = 128 maxout_pieces = 2 use_upper = true -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} 
-[nlp.pipeline.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 128 +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +rows = 2000 +also_embed_subwords = true +also_use_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 depth = 4 window_size = 1 -embed_size = 7000 maxout_pieces = 3 -subword_features = true -dropout = ${training:dropout} diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index d694ceac8..eed76cb7b 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -9,11 +9,11 @@ max_epochs = 100 orth_variant_level = 0.0 gold_preproc = true max_length = 0 -scores = ["tag_acc", "dep_uas", "dep_las"] +scores = ["tag_acc", "dep_uas", "dep_las", "speed"] score_weights = {"dep_las": 0.8, "tag_acc": 0.2} limit = 0 seed = 0 -accumulate_gradient = 2 +accumulate_gradient = 1 discard_oversize = false raw_text = null tag_map = null @@ -22,7 +22,7 @@ base_model = null eval_batch_size = 128 use_pytorch_for_gpu_memory = false -batch_by = "padded" +batch_by = "words" [training.batch_size] @schedules = "compounding.v1" @@ -64,8 +64,8 @@ min_action_freq = 1 @architectures = "spacy.Tagger.v1" [components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${components.tok2vec.model:width} +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" @@ -74,16 +74,22 @@ hidden_width = 64 maxout_pieces = 3 [components.parser.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${components.tok2vec.model:width} +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} [components.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${training:vectors} +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +rows = 2000 +also_embed_subwords = true +also_use_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" width = 96 depth = 4 window_size = 1 -embed_size = 2000 maxout_pieces = 3 -subword_features = true -dropout = null diff --git a/examples/training/conllu.py b/examples/training/conllu.py index ecc07ccf2..a398b0ae0 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -13,7 +13,7 @@ import spacy import spacy.util from spacy.tokens import Token, Doc from spacy.gold import Example -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from collections import defaultdict from spacy.matcher import Matcher diff --git a/setup.py b/setup.py index 6d962ab59..af4cd0ec6 100755 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.kb", + "spacy.ml.parser_model", "spacy.morphology", "spacy.pipeline.dep_parser", "spacy.pipeline.morphologizer", @@ -40,14 +41,14 @@ MOD_NAMES = [ "spacy.pipeline.sentencizer", "spacy.pipeline.senter", "spacy.pipeline.tagger", - "spacy.syntax.stateclass", - "spacy.syntax._state", + 
"spacy.pipeline.transition_parser", + "spacy.pipeline._parser_internals.arc_eager", + "spacy.pipeline._parser_internals.ner", + "spacy.pipeline._parser_internals.nonproj", + "spacy.pipeline._parser_internals._state", + "spacy.pipeline._parser_internals.stateclass", + "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", - "spacy.syntax.nn_parser", - "spacy.syntax._parser_model", - "spacy.syntax.nonproj", - "spacy.syntax.transition_system", - "spacy.syntax.arc_eager", "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", @@ -57,7 +58,6 @@ MOD_NAMES = [ "spacy.matcher.matcher", "spacy.matcher.phrasematcher", "spacy.matcher.dependencymatcher", - "spacy.syntax.ner", "spacy.symbols", "spacy.vectors", ] diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1ffceeca1..fa6f7a7d5 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -10,7 +10,7 @@ from thinc.api import Config from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli from ..gold import Corpus, Example -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj from ..language import Language from .. import util diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 83281543a..ee1be57a3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -67,10 +67,7 @@ def evaluate( corpus = Corpus(data_path, data_path) nlp = util.load_model(model) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) - begin = timer() scores = nlp.evaluate(dev_dataset, verbose=False) - end = timer() - nwords = sum(len(ex.predicted) for ex in dev_dataset) metrics = { "TOK": "token_acc", "TAG": "tag_acc", @@ -82,17 +79,21 @@ def evaluate( "NER P": "ents_p", "NER R": "ents_r", "NER F": "ents_f", - "Textcat": "cats_score", - "Sent P": "sents_p", - "Sent R": "sents_r", - "Sent F": "sents_f", + "TEXTCAT": "cats_score", + "SENT P": "sents_p", + "SENT R": "sents_r", + "SENT F": "sents_f", + "SPEED": "speed", } results = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - results[metric] = f"{scores[key]*100:.2f}" + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} msg.table(results, title="Results") diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 1bd28cb7e..e42935e2f 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -11,7 +11,6 @@ from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum - # TODO: find a solution for caches # CACHES = [ # Path.home() / ".torch", diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 44597c73e..b0bc145ff 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,5 +1,4 @@ from typing import Optional, Dict, Any, Tuple, Union, Callable, List -from timeit import default_timer as timer import srsly import tqdm from pathlib import Path @@ -81,16 +80,20 @@ def train( msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") config = Config().from_disk(config_path) + if config.get("training", {}).get("seed") is not None: + fix_random_seed(config["training"]["seed"]) with show_validation_error(): nlp, config = util.load_model_from_config(config, overrides=config_overrides) if 
config["training"]["base_model"]: - base_nlp = util.load_model(config["training"]["base_model"]) # TODO: do something to check base_nlp against regular nlp described in config? - nlp = base_nlp + # If everything matches it will look something like: + # base_nlp = util.load_model(config["training"]["base_model"]) + # nlp = base_nlp + raise NotImplementedError("base_model not supported yet.") + if config["training"]["vectors"] is not None: + util.load_vectors_into_model(nlp, config["training"]["vectors"]) verify_config(nlp) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) if config["training"]["use_pytorch_for_gpu_memory"]: # It feels kind of weird to not have a default for this. use_pytorch_for_gpu_memory() @@ -243,19 +246,16 @@ def create_evaluation_callback( ) -> Callable[[], Tuple[float, Dict[str, float]]]: def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = corpus.dev_dataset( - nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True + nlp, gold_preproc=cfg["gold_preproc"] ) dev_examples = list(dev_examples) n_words = sum(len(ex.predicted) for ex in dev_examples) batch_size = cfg["eval_batch_size"] - start_time = timer() if optimizer.averages: with nlp.use_params(optimizer.averages): scores = nlp.evaluate(dev_examples, batch_size=batch_size) else: scores = nlp.evaluate(dev_examples, batch_size=batch_size) - end_time = timer() - wps = n_words / (end_time - start_time) # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] try: @@ -264,7 +264,6 @@ def create_evaluation_callback( keys = list(scores.keys()) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) raise KeyError(err) - scores["speed"] = wps return weighted_score, scores return evaluate @@ -446,7 +445,7 @@ def update_meta( training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] ) -> None: nlp.meta["performance"] = {} - for metric in training["scores_weights"]: + for metric in training["score_weights"]: nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] diff --git a/spacy/errors.py b/spacy/errors.py index a10e5d9bd..3fe53d6db 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -432,12 +432,12 @@ class Errors: "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") E170 = ("Cannot apply transition {name}: invalid for the current state.") - E171 = ("Matcher.add received invalid on_match callback argument: expected " + E171 = ("Matcher.add received invalid 'on_match' callback argument: expected " "callable or None, but got: {arg_type}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E177 = ("Ill-formed IOB input detected: {tag}") - E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you " + E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you " "accidentally passed a single pattern to Matcher.add instead of a " "list of patterns? If you only want to add one pattern, make sure " "to wrap it in a list. 
For example: matcher.add('{key}', [pattern])") @@ -483,6 +483,10 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E947 = ("Matcher.add received invalid 'greedy' argument: expected " + "a string value from {expected} but got: '{arg}'") + E948 = ("Matcher.add received invalid 'patterns' argument: expected " + "a List, but got: {arg_type}") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 427c00caa..d23f70bee 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,7 +1,15 @@ +from typing import Union, List, Iterable, Iterator, TYPE_CHECKING +from pathlib import Path import random + from .. import util from .example import Example from ..tokens import DocBin, Doc +from ..vocab import Vocab + +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from ..language import Language # noqa: F401 class Corpus: @@ -11,20 +19,23 @@ class Corpus: DOCS: https://spacy.io/api/corpus """ - def __init__(self, train_loc, dev_loc, limit=0): + def __init__( + self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0 + ) -> None: """Create a Corpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. - limit (int): Max. number of examples returned - RETURNS (Corpus): The newly created object. + limit (int): Max. number of examples returned. + + DOCS: https://spacy.io/api/corpus#init """ self.train_loc = train_loc self.dev_loc = dev_loc self.limit = limit @staticmethod - def walk_corpus(path): + def walk_corpus(path: Union[str, Path]) -> List[Path]: path = util.ensure_path(path) if not path.is_dir(): return [path] @@ -43,7 +54,9 @@ class Corpus: locs.append(path) return locs - def _make_example(self, nlp, reference, gold_preproc): + def _make_example( + self, nlp: "Language", reference: Doc, gold_preproc: bool + ) -> Example: if gold_preproc or reference.has_unknown_spaces: return Example( Doc( @@ -56,7 +69,9 @@ class Corpus: else: return Example(nlp.make_doc(reference.text), reference) - def make_examples(self, nlp, reference_docs, max_length=0): + def make_examples( + self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0 + ) -> Iterator[Example]: for reference in reference_docs: if len(reference) == 0: continue @@ -69,7 +84,9 @@ class Corpus: elif max_length == 0 or len(ref_sent) < max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) - def make_examples_gold_preproc(self, nlp, reference_docs): + def make_examples_gold_preproc( + self, nlp: "Language", reference_docs: Iterable[Doc] + ) -> Iterator[Example]: for reference in reference_docs: if reference.is_sentenced: ref_sents = [sent.as_doc() for sent in reference.sents] @@ -80,7 +97,9 @@ class Corpus: if len(eg.x): yield eg - def read_docbin(self, vocab, locs): + def read_docbin( + self, vocab: Vocab, locs: Iterable[Union[str, Path]] + ) -> Iterator[Doc]: """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -96,8 +115,14 @@ class Corpus: if self.limit >= 1 and i >= self.limit: break - def count_train(self, nlp): - """Returns count of words in train examples""" + def count_train(self, nlp: "Language") -> int: + """Returns count of words in train examples. 
+ + nlp (Language): The current nlp. object. + RETURNS (int): The word count. + + DOCS: https://spacy.io/api/corpus#count_train + """ n = 0 i = 0 for example in self.train_dataset(nlp): @@ -108,8 +133,25 @@ class Corpus: return n def train_dataset( - self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs - ): + self, + nlp: "Language", + *, + shuffle: bool = True, + gold_preproc: bool = False, + max_length: int = 0 + ) -> Iterator[Example]: + """Yield examples from the training data. + + nlp (Language): The current nlp object. + shuffle (bool): Whether to shuffle the examples. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + max_length (int): Maximum document length. Longer documents will be + split into sentences, if sentence boundaries are available. 0 for + no limit. + YIELDS (Example): The examples. + + DOCS: https://spacy.io/api/corpus#train_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) @@ -120,7 +162,17 @@ class Corpus: random.shuffle(examples) yield from examples - def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs): + def dev_dataset( + self, nlp: "Language", *, gold_preproc: bool = False + ) -> Iterator[Example]: + """Yield examples from the development data. + + nlp (Language): The current nlp object. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + YIELDS (Example): The examples. + + DOCS: https://spacy.io/api/corpus#dev_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 9101cefce..84d9f1622 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -10,7 +10,7 @@ from .align import Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags from ..errors import Errors, Warnings -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): diff --git a/spacy/language.py b/spacy/language.py index 79fceec95..e415869b3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -14,13 +14,14 @@ from thinc.api import get_current_ops, Config, require_gpu, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle +from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer -from .util import link_vectors_to_models, create_default_optimizer, registry +from .util import create_default_optimizer, registry from .util import SimpleFrozenDict, combine_score_weights from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -36,6 +37,7 @@ from . import util from . import about +# TODO: integrate pipeline analyis ENABLE_PIPELINE_ANALYSIS = False # This is the base config will all settings (training etc.) DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" @@ -43,6 +45,11 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH) class BaseDefaults: + """Language data defaults, available via Language.Defaults. 
Can be + overwritten by language subclasses by defining their own subclasses of + Language.Defaults. + """ + config: Config = Config() tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES @@ -58,6 +65,10 @@ class BaseDefaults: @registry.tokenizers("spacy.Tokenizer.v1") def create_tokenizer() -> Callable[["Language"], Tokenizer]: + """Registered function to create a tokenizer. Returns a factory that takes + the nlp object and returns a Tokenizer instance using the language detaults. + """ + def tokenizer_factory(nlp: "Language") -> Tokenizer: prefixes = nlp.Defaults.prefixes suffixes = nlp.Defaults.suffixes @@ -80,6 +91,11 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.lemmatizers("spacy.Lemmatizer.v1") def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]: + """Registered function to create a lemmatizer. Returns a factory that takes + the nlp object and returns a Lemmatizer instance with data loaded in from + spacy-lookups-data, if the package is installed. + """ + # TODO: Will be replaced when the lemmatizer becomes a pipeline component tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] def lemmatizer_factory(nlp: "Language") -> "Lemmatizer": @@ -116,7 +132,7 @@ class Language: create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, create_lemmatizer: Optional[Callable[["Language"], Callable]] = None, **kwargs, - ): + ) -> None: """Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created. @@ -134,7 +150,8 @@ class Language: returns a tokenizer. create_lemmatizer (Callable): Function that takes the nlp object and returns a lemmatizer. - RETURNS (Language): The newly constructed object. + + DOCS: https://spacy.io/api/language#init """ # We're only calling this to import all factories provided via entry # points. The factory decorator applied to these functions takes care @@ -189,6 +206,13 @@ class Language: @property def meta(self) -> Dict[str, Any]: + """Custom meta data of the language class. If a model is loaded, this + includes details from the model's meta.json. + + RETURNS (Dict[str, Any]): The meta. + + DOCS: https://spacy.io/api/language#meta + """ spacy_version = util.get_model_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) @@ -221,6 +245,13 @@ class Language: @property def config(self) -> Config: + """Trainable config for the current language instance. Includes the + current pipeline components, as well as default training config. + + RETURNS (thinc.api.Config): The config. + + DOCS: https://spacy.io/api/language#config + """ self._config.setdefault("nlp", {}) self._config.setdefault("training", {}) self._config["nlp"]["lang"] = self.lang @@ -382,6 +413,8 @@ class Language: select the best model. Weights should sum to 1.0 per component and will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. + + DOCS: https://spacy.io/api/language#factory """ if not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="factory")) @@ -460,6 +493,8 @@ class Language: select the best model. Weights should sum to 1.0 per component and will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. 
+ + DOCS: https://spacy.io/api/language#component """ if name is not None and not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="component")) @@ -504,6 +539,7 @@ class Language: self, factory_name: str, name: Optional[str] = None, + *, config: Optional[Dict[str, Any]] = SimpleFrozenDict(), overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(), validate: bool = True, @@ -521,6 +557,8 @@ class Language: validate (bool): Whether to validate the component config against the arguments and types expected by the factory. RETURNS (Callable[[Doc], Doc]): The pipeline component. + + DOCS: https://spacy.io/api/language#create_pipe """ name = name if name is not None else factory_name if not isinstance(config, dict): @@ -692,6 +730,7 @@ class Language: self, name: str, factory_name: str, + *, config: Dict[str, Any] = SimpleFrozenDict(), validate: bool = True, ) -> None: @@ -761,6 +800,7 @@ class Language: def __call__( self, text: str, + *, disable: Iterable[str] = tuple(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Doc: @@ -770,8 +810,8 @@ class Language: text (str): The text to be processed. disable (list): Names of the pipeline components to disable. - component_cfg (dict): An optional dictionary with extra keyword arguments - for specific components. + component_cfg (Dict[str, dict]): An optional dictionary with extra + keyword arguments for specific components. RETURNS (Doc): A container for accessing the annotations. DOCS: https://spacy.io/api/language#call @@ -811,6 +851,7 @@ class Language: def select_pipes( self, + *, disable: Optional[Union[str, Iterable[str]]] = None, enable: Optional[Union[str, Iterable[str]]] = None, ) -> "DisabledPipes": @@ -853,7 +894,7 @@ class Language: def update( self, examples: Iterable[Example], - dummy: Optional[Any] = None, + _: Optional[Any] = None, *, drop: float = 0.0, sgd: Optional[Optimizer] = None, @@ -863,7 +904,7 @@ class Language: """Update the models in the pipeline. examples (Iterable[Example]): A batch of examples - dummy: Should not be set - serves to catch backwards-incompatible scripts. + _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. 
@@ -873,7 +914,7 @@ class Language: DOCS: https://spacy.io/api/language#update """ - if dummy is not None: + if _ is not None: raise ValueError(Errors.E989) if losses is None: losses = {} @@ -890,12 +931,10 @@ class Language: raise TypeError( Errors.E978.format(name="language", method="update", types=wrong_types) ) - if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() sgd = self._optimizer - if component_cfg is None: component_cfg = {} for i, (name, proc) in enumerate(self.pipeline): @@ -915,6 +954,7 @@ class Language: def rehearse( self, examples: Iterable[Example], + *, sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, @@ -937,8 +977,9 @@ class Language: >>> nlp.update(labelled_batch) >>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) + + DOCS: https://spacy.io/api/language#rehearse """ - # TODO: document if len(examples) == 0: return if not isinstance(examples, IterableInstance): @@ -983,17 +1024,18 @@ class Language: def begin_training( self, - get_examples: Optional[Callable] = None, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, sgd: Optional[Optimizer] = None, device: int = -1, ) -> Optimizer: - """Allocate models, pre-process training data and acquire a trainer and - optimizer. Used as a contextmanager. + """Initialize the pipe for training, using data examples if available. - get_examples (function): Function returning example training data. - TODO: document format change since 3.0. - sgd (Optional[Optimizer]): An optimizer. - RETURNS: An optimizer. + get_examples (Callable[[], Iterable[Example]]): Optional function that + returns gold-standard Example objects. + sgd (thinc.api.Optimizer): Optional optimizer. Will be created with + create_optimizer if it doesn't exist. + RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://spacy.io/api/language#begin_training """ @@ -1009,7 +1051,6 @@ class Language: if self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - link_vectors_to_models(self.vocab) if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd @@ -1022,25 +1063,26 @@ class Language: return self._optimizer def resume_training( - self, sgd: Optional[Optimizer] = None, device: int = -1 + self, *, sgd: Optional[Optimizer] = None, device: int = -1 ) -> Optimizer: """Continue training a pretrained model. Create and return an optimizer, and initialize "rehearsal" for any pipeline component that has a .rehearse() method. Rehearsal is used to prevent - models from "forgetting" their initialised "knowledge". To perform + models from "forgetting" their initialized "knowledge". To perform rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects. sgd (Optional[Optimizer]): An optimizer. RETURNS (Optimizer): The optimizer. + + DOCS: https://spacy.io/api/language#resume_training """ if device >= 0: # TODO: do we need this here? 
require_gpu(device) ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - link_vectors_to_models(self.vocab) if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd @@ -1052,11 +1094,12 @@ class Language: def evaluate( self, examples: Iterable[Example], + *, verbose: bool = False, batch_size: int = 256, scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, - ) -> Scorer: + ) -> Dict[str, Union[float, dict]]: """Evaluate a model's pipeline components. examples (Iterable[Example]): `Example` objects. @@ -1088,7 +1131,14 @@ class Language: kwargs.setdefault("verbose", verbose) kwargs.setdefault("nlp", self) scorer = Scorer(**kwargs) - docs = list(eg.predicted for eg in examples) + texts = [eg.reference.text for eg in examples] + docs = [eg.predicted for eg in examples] + start_time = timer() + # tokenize the texts only for timing purposes + if not hasattr(self.tokenizer, "pipe"): + _ = [self.tokenizer(text) for text in texts] + else: + _ = list(self.tokenizer.pipe(texts)) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) @@ -1096,11 +1146,18 @@ class Language: docs = _pipe(docs, pipe, kwargs) else: docs = pipe.pipe(docs, **kwargs) + # iterate over the final generator + if len(self.pipeline): + docs = list(docs) + end_time = timer() for i, (doc, eg) in enumerate(zip(docs, examples)): if verbose: print(doc) eg.predicted = doc - return scorer.score(examples) + results = scorer.score(examples) + n_words = sum(len(eg.predicted) for eg in examples) + results["speed"] = n_words / (end_time - start_time) + return results @contextmanager def use_params(self, params: dict): @@ -1112,7 +1169,9 @@ class Language: EXAMPLE: >>> with nlp.use_params(optimizer.averages): - >>> nlp.to_disk('/tmp/checkpoint') + >>> nlp.to_disk("/tmp/checkpoint") + + DOCS: https://spacy.io/api/language#use_params """ contexts = [ pipe.use_params(params) @@ -1136,6 +1195,7 @@ class Language: def pipe( self, texts: Iterable[str], + *, as_tuples: bool = False, batch_size: int = 1000, disable: Iterable[str] = tuple(), @@ -1305,6 +1365,16 @@ class Language: """Create the nlp object from a loaded config. Will set up the tokenizer and language data, add pipeline components etc. If no config is provided, the default config of the given language is used. + + config (Dict[str, Any] / Config): The loaded config. + disable (Iterable[str]): List of pipeline component names to disable. + auto_fill (bool): Automatically fill in missing values in config based + on defaults and function argument annotations. + validate (bool): Validate the component config and arguments against + the types expected by the factory. + RETURNS (Language): The initialized Language class. + + DOCS: https://spacy.io/api/language#from_config """ if auto_fill: config = util.deep_merge_configs(config, cls.default_config) @@ -1338,6 +1408,10 @@ class Language: nlp = cls( create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer, ) + # Note that we don't load vectors here, instead they get loaded explicitly + # inside stuff like the spacy train function. If we loaded them here, + # then we would load them twice at runtime: once when we make from config, + # and then again when we load from disk. 
pipeline = config.get("components", {}) for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: @@ -1362,7 +1436,9 @@ class Language: nlp.resolved = resolved return nlp - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the current state to a directory. If a model is loaded, this will include the model. @@ -1391,7 +1467,7 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1418,7 +1494,6 @@ class Language: _fix_pretrained_vectors_name(self) path = util.ensure_path(path) - deserializers = {} if Path(path / "config.cfg").exists(): deserializers["config.cfg"] = lambda p: self.config.from_disk(p) @@ -1443,7 +1518,7 @@ class Language: self._link_components() return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the current state to a binary string. exclude (list): Names of components or serialization fields to exclude. @@ -1465,7 +1540,7 @@ class Language: return util.to_bytes(serializers, exclude) def from_bytes( - self, bytes_data: bytes, exclude: Iterable[str] = tuple() + self, bytes_data: bytes, *, exclude: Iterable[str] = tuple() ) -> "Language": """Load state from a binary string. @@ -1509,6 +1584,12 @@ class Language: @dataclass class FactoryMeta: + """Dataclass containing information about a component and its defaults + provided by the @Language.component or @Language.factory decorator. It's + created whenever a component is defined and stored on the Language class for + each component instance and factory instance. + """ + factory: str default_config: Optional[Dict[str, Any]] = None # noqa: E704 assigns: Iterable[str] = tuple() @@ -1539,8 +1620,6 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None: nlp.vocab.vectors.name = vectors_name else: raise ValueError(Errors.E092) - if nlp.vocab.vectors.size != 0: - link_vectors_to_models(nlp.vocab) for name, proc in nlp.pipeline: if not hasattr(proc, "cfg"): continue @@ -1551,7 +1630,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None: class DisabledPipes(list): """Manager for temporary pipeline disabling.""" - def __init__(self, nlp: Language, names: List[str]): + def __init__(self, nlp: Language, names: List[str]) -> None: self.nlp = nlp self.names = names # Important! Not deep copy -- we just want the container (but we also diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1cfb681f4..adba79686 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,6 @@ class Lemmatizer: lookups (Lookups): The lookups object containing the (optional) tables "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". - RETURNS (Lemmatizer): The newly constructed object. """ self.lookups = lookups if lookups is not None else Lookups() self.is_base_form = is_base_form diff --git a/spacy/lookups.py b/spacy/lookups.py index bf71ba877..7862b9805 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -52,8 +52,6 @@ class Lookups: def __init__(self) -> None: """Initialize the Lookups object. - RETURNS (Lookups): The newly created object. 
- DOCS: https://spacy.io/api/lookups#init """ self._tables = {} @@ -202,7 +200,6 @@ class Table(OrderedDict): data (dict): The dictionary. name (str): Optional table name for reference. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict """ @@ -215,7 +212,6 @@ class Table(OrderedDict): name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.init """ diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ddeeedd06..716af9909 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -36,7 +36,6 @@ cdef class DependencyMatcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (DependencyMatcher): The newly constructed object. """ size = 20 # TODO: make matcher work with validation diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 689734079..e1f6bc773 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -66,6 +66,7 @@ cdef class Matcher: cdef public object validate cdef public object _patterns cdef public object _callbacks + cdef public object _filter cdef public object _extensions cdef public object _extra_predicates cdef public object _seen_attrs diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6c8ee4204..325c81369 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,6 +1,9 @@ # cython: infer_types=True, cython: profile=True +from typing import List + from libcpp.vector cimport vector from libc.stdint cimport int32_t +from libc.string cimport memset, memcmp from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 @@ -37,11 +40,11 @@ cdef class Matcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (Matcher): The newly constructed object. """ self._extra_predicates = [] self._patterns = {} self._callbacks = {} + self._filter = {} self._extensions = {} self._seen_attrs = set() self.vocab = vocab @@ -69,7 +72,7 @@ cdef class Matcher: """ return self._normalize_key(key) in self._patterns - def add(self, key, patterns, *_patterns, on_match=None): + def add(self, key, patterns, *, on_match=None, greedy: str=None): """Add a match-rule to the matcher. A match-rule consists of: an ID key, an on_match callback, and one or more patterns. @@ -87,11 +90,10 @@ cdef class Matcher: '+': Require the pattern to match 1 or more times. '*': Allow the pattern to zero or more times. - The + and * operators are usually interpretted "greedily", i.e. longer - matches are returned where possible. However, if you specify two '+' - and '*' patterns in a row and their matches overlap, the first - operator will behave non-greedily. This quirk in the semantics makes - the matcher more efficient, by avoiding the need for back-tracking. + The + and * operators return all possible matches (not just the greedy + ones). However, the "greedy" argument can filter the final matches + by returning a non-overlapping set per key, either taking preference to + the first greedy match ("FIRST"), or the longest ("LONGEST"). As of spaCy v2.2.2, Matcher.add supports the future API, which makes the patterns the second argument and a list (instead of a variable @@ -101,16 +103,15 @@ cdef class Matcher: key (str): The match ID. 
patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. - *_patterns (list): For backwards compatibility: list of patterns to add - as variable arguments. Will be ignored if a list of patterns is - provided as the second argument. + greedy (str): Optional filter: "FIRST" or "LONGEST". """ errors = {} if on_match is not None and not hasattr(on_match, "__call__"): raise ValueError(Errors.E171.format(arg_type=type(on_match))) - if patterns is None or hasattr(patterns, "__call__"): # old API - on_match = patterns - patterns = _patterns + if patterns is None or not isinstance(patterns, List): # old API + raise ValueError(Errors.E948.format(arg_type=type(patterns))) + if greedy is not None and greedy not in ["FIRST", "LONGEST"]: + raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy)) for i, pattern in enumerate(patterns): if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) @@ -133,6 +134,7 @@ cdef class Matcher: raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) self._callbacks[key] = on_match + self._filter[key] = greedy self._patterns[key].extend(patterns) def remove(self, key): @@ -218,6 +220,7 @@ cdef class Matcher: length = doclike.end - doclike.start else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) + cdef Pool tmp_pool = Pool() if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) @@ -225,11 +228,42 @@ cdef class Matcher: raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) - for i, (key, start, end) in enumerate(matches): + final_matches = [] + pairs_by_id = {} + # For each key, either add all matches, or only the filtered, non-overlapping ones + for (key, start, end) in matches: + span_filter = self._filter.get(key) + if span_filter is not None: + pairs = pairs_by_id.get(key, []) + pairs.append((start,end)) + pairs_by_id[key] = pairs + else: + final_matches.append((key, start, end)) + matched = tmp_pool.alloc(length, sizeof(char)) + empty = tmp_pool.alloc(length, sizeof(char)) + for key, pairs in pairs_by_id.items(): + memset(matched, 0, length * sizeof(matched[0])) + span_filter = self._filter.get(key) + if span_filter == "FIRST": + sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start + elif span_filter == "LONGEST": + sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length + else: + raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter)) + for (start, end) in sorted_pairs: + assert 0 <= start < end # Defend against segfaults + span_len = end-start + # If no tokens in the span have matched + if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0: + final_matches.append((key, start, end)) + # Mark tokens that have matched + memset(&matched[start], 1, span_len * sizeof(matched[0])) + # perform the callbacks on the filtered set of results + for i, (key, start, end) in enumerate(final_matches): on_match = self._callbacks.get(key, None) if on_match is not None: - on_match(self, doc, i, matches) - return matches + on_match(self, doc, i, final_matches) + return final_matches def _normalize_key(self, key): if isinstance(key, basestring): @@ -240,9 +274,9 @@ cdef class Matcher: def unpickle_matcher(vocab, 
patterns, callbacks): matcher = Matcher(vocab) - for key, specs in patterns.items(): + for key, pattern in patterns.items(): callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) + matcher.add(key, pattern, on_match=callback) return matcher diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index a2141dc02..060c4d37f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -32,7 +32,6 @@ cdef class PhraseMatcher: vocab (Vocab): The shared vocabulary. attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. - RETURNS (PhraseMatcher): The newly constructed object. DOCS: https://spacy.io/api/phrasematcher#init """ diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index 57fbf73b3..ab0cb85c7 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -1,16 +1,18 @@ +from typing import List from thinc.api import Model +from thinc.types import Floats2d +from ..tokens import Doc -def CharacterEmbed(nM, nC): +def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: # nM: Number of dimensions per character. nC: Number of characters. - nO = nM * nC if (nM is not None and nC is not None) else None return Model( "charembed", forward, init=init, - dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, + dims={"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256}, params={"E": None}, - ).initialize() + ) def init(model, X=None, Y=None): diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index e5f4af2fb..53200c165 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -5,11 +5,11 @@ from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_ from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued from thinc.api import Relu, residual, expand_window, FeatureExtractor -from ..spacy_vectors import SpacyVectors from ... 
import util from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams +from ..staticvectors import StaticVectors @registry.architectures.register("spacy.TextCatCNN.v1") @@ -102,13 +102,7 @@ def build_text_classifier( ) if pretrained_vectors: - nlp = util.load_model(pretrained_vectors) - vectors = nlp.vocab.vectors - vector_dim = vectors.data.shape[1] - - static_vectors = SpacyVectors(vectors) >> with_array( - Linear(width, vector_dim) - ) + static_vectors = StaticVectors(width) vector_layer = trained_vectors | static_vectors vectors_width = width * 2 else: @@ -159,16 +153,11 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): - nlp = util.load_model(pretrained_vectors) - vectors = nlp.vocab.vectors - vector_dim = vectors.data.shape[1] - # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): model = ( - SpacyVectors(vectors) + StaticVectors(width) >> list2ragged() - >> with_ragged(0, Linear(width, vector_dim)) >> ParametricAttention(width) >> reduce_sum() >> residual(Relu(width, width)) ** 2 diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1766fa80e..1460b3005 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,223 +1,140 @@ -from thinc.api import chain, clone, concatenate, with_array, uniqued -from thinc.api import Model, noop, with_padded, Maxout, expand_window -from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM -from thinc.api import residual, LayerNorm, FeatureExtractor, Mish +from typing import Optional, List +from thinc.api import chain, clone, concatenate, with_array, with_padded +from thinc.api import Model, noop, list2ragged, ragged2list +from thinc.api import FeatureExtractor, HashEmbed +from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM +from thinc.types import Floats2d +from ...tokens import Doc from ... 
import util from ...util import registry from ...ml import _character_embed +from ..staticvectors import StaticVectors from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE -@registry.architectures.register("spacy.Tok2VecTensors.v1") -def tok2vec_tensors_v1(width, upstream="*"): +@registry.architectures.register("spacy.Tok2VecListener.v1") +def tok2vec_listener_v1(width, upstream="*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec -@registry.architectures.register("spacy.VocabVectors.v1") -def get_vocab_vectors(name): - nlp = util.load_model(name) - return nlp.vocab.vectors - +@registry.architectures.register("spacy.HashEmbedCNN.v1") +def build_hash_embed_cnn_tok2vec( + *, + width: int, + depth: int, + embed_size: int, + window_size: int, + maxout_pieces: int, + subword_features: bool, + dropout: Optional[float], + pretrained_vectors: Optional[bool] +) -> Model[List[Doc], List[Floats2d]]: + """Build spaCy's 'standard' tok2vec layer, which uses hash embedding + with subword features and a CNN with layer-normalized maxout.""" + return build_Tok2Vec_model( + embed=MultiHashEmbed( + width=width, + rows=embed_size, + also_embed_subwords=subword_features, + also_use_static_vectors=bool(pretrained_vectors), + ), + encode=MaxoutWindowEncoder( + width=width, + depth=depth, + window_size=window_size, + maxout_pieces=maxout_pieces + ) + ) @registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(extract, embed, encode): - field_size = 0 - if encode.attrs.get("receptive_field", None): - field_size = encode.attrs["receptive_field"] - with Model.define_operators({">>": chain, "|": concatenate}): - tok2vec = extract >> with_array(embed >> encode, pad=field_size) +def build_Tok2Vec_model( + embed: Model[List[Doc], List[Floats2d]], + encode: Model[List[Floats2d], List[Floats2d]], +) -> Model[List[Doc], List[Floats2d]]: + + receptive_field = encode.attrs.get("receptive_field", 0) + tok2vec = chain(embed, with_array(encode, pad=receptive_field)) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) tok2vec.set_ref("encode", encode) return tok2vec -@registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(columns): - return FeatureExtractor(columns) - - -@registry.architectures.register("spacy.HashEmbedCNN.v1") -def hash_embed_cnn( - pretrained_vectors, - width, - depth, - embed_size, - maxout_pieces, - window_size, - subword_features, - dropout, -): - # Does not use character embeddings: set to False by default - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - bilstm_depth=0, - maxout_pieces=maxout_pieces, - window_size=window_size, - subword_features=subword_features, - char_embed=False, - nM=0, - nC=0, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashCharEmbedCNN.v1") -def hash_charembed_cnn( - pretrained_vectors, - width, - depth, - embed_size, - maxout_pieces, - window_size, - nM, - nC, - dropout, -): - # Allows using character embeddings by setting nC, nM and char_embed=True - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - bilstm_depth=0, - maxout_pieces=maxout_pieces, - window_size=window_size, - subword_features=False, - char_embed=True, - nM=nM, - nC=nC, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashEmbedBiLSTM.v1") -def hash_embed_bilstm_v1( - pretrained_vectors, 
- width, - depth, - embed_size, - subword_features, - maxout_pieces, - dropout, -): - # Does not use character embeddings: set to False by default - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - maxout_pieces=maxout_pieces, - window_size=1, - subword_features=subword_features, - char_embed=False, - nM=0, - nC=0, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") -def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout -): - # Allows using character embeddings by setting nC, nM and char_embed=True - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - maxout_pieces=maxout_pieces, - window_size=1, - subword_features=False, - char_embed=True, - nM=nM, - nC=nC, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") -def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True) - - @registry.architectures.register("spacy.MultiHashEmbed.v1") def MultiHashEmbed( - columns, width, rows, use_subwords, pretrained_vectors, mix, dropout + width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool ): - norm = HashEmbed( - nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6 - ) - if use_subwords: - prefix = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("PREFIX"), - dropout=dropout, - seed=7, - ) - suffix = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("SUFFIX"), - dropout=dropout, - seed=8, - ) - shape = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("SHAPE"), - dropout=dropout, - seed=9, + cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] + + seed = 7 + + def make_hash_embed(feature): + nonlocal seed + seed += 1 + return HashEmbed( + width, + rows if feature == NORM else rows // 2, + column=cols.index(feature), + seed=seed, + dropout=0.0, ) - if pretrained_vectors: - glove = StaticVectors( - vectors=pretrained_vectors.data, - nO=width, - column=columns.index(ID), - dropout=dropout, + if also_embed_subwords: + embeddings = [ + make_hash_embed(NORM), + make_hash_embed(PREFIX), + make_hash_embed(SUFFIX), + make_hash_embed(SHAPE), + ] + else: + embeddings = [make_hash_embed(NORM)] + concat_size = width * (len(embeddings) + also_use_static_vectors) + if also_use_static_vectors: + model = chain( + concatenate( + chain( + FeatureExtractor(cols), + list2ragged(), + with_array(concatenate(*embeddings)), + ), + StaticVectors(width, dropout=0.0), + ), + with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), + ragged2list(), ) - - with Model.define_operators({">>": chain, "|": concatenate}): - if not use_subwords and not pretrained_vectors: - embed_layer = norm - else: - if use_subwords and pretrained_vectors: - concat_columns = glove | norm | prefix | suffix | shape - elif use_subwords: - concat_columns = norm | prefix | suffix | shape - else: - concat_columns = glove | norm - - embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) - - return embed_layer + else: + model = chain( + FeatureExtractor(cols), + list2ragged(), + with_array(concatenate(*embeddings)), + with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), + ragged2list(), + ) + return model 
@registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): - norm = HashEmbed( - nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5 +def CharacterEmbed(width: int, rows: int, nM: int, nC: int): + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) + ) + ), + with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + ragged2list() ) - chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) - with Model.define_operators({">>": chain, "|": concatenate}): - embed_layer = chr_embed | features >> with_array(norm) - embed_layer.set_dim("nO", nM * nC + width) - return embed_layer + return model @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): +def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int): cnn = chain( expand_window(window_size=window_size), Maxout( @@ -238,8 +155,12 @@ def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): def MishWindowEncoder(width, window_size, depth): cnn = chain( expand_window(window_size=window_size), - Mish(nO=width, nI=width * ((window_size * 2) + 1)), - LayerNorm(width), + Mish( + nO=width, + nI=width * ((window_size * 2) + 1), + dropout=0.0, + normalize=True + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) @@ -247,133 +168,7 @@ def MishWindowEncoder(width, window_size, depth): @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(width, depth): - import torch.nn - - # TODO FIX - from thinc.api import PyTorchRNNWrapper - +def BiLSTMEncoder(width, depth, dropout): if depth == 0: return noop() - return with_padded( - PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) - ) - - -def build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors, - window_size, - maxout_pieces, - subword_features, - char_embed, - nM, - nC, - conv_depth, - bilstm_depth, - dropout, -) -> Model: - if char_embed: - subword_features = False - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed( - nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, seed=0 - ) - if subword_features: - prefix = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(PREFIX), - dropout=None, - seed=1, - ) - suffix = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(SUFFIX), - dropout=None, - seed=2, - ) - shape = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(SHAPE), - dropout=None, - seed=3, - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors( - vectors=pretrained_vectors.data, - nO=width, - column=cols.index(ID), - dropout=dropout, - ) - - if subword_features: - columns = 5 - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - else: - columns = 2 - embed = uniqued( - (glove | norm) - >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - elif subword_features: - columns = 4 - embed = uniqued( - concatenate(norm, prefix, suffix, shape) 
- >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - elif char_embed: - embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor( - cols - ) >> with_array(norm) - reduce_dimensions = Maxout( - nO=width, nI=nM * nC + width, nP=3, dropout=0.0, normalize=True, - ) - else: - embed = norm - - convolution = residual( - expand_window(window_size=window_size) - >> Maxout( - nO=width, - nI=width * ((window_size * 2) + 1), - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) - ) - if char_embed: - tok2vec = embed >> with_array( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtractor(cols) >> with_array( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchLSTM( - nO=width, nI=width, depth=bilstm_depth, bi=True - ) - if tok2vec.has_dim("nO") is not False: - tok2vec.set_dim("nO", width) - tok2vec.set_ref("embed", embed) - return tok2vec + return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout)) diff --git a/spacy/syntax/_parser_model.pxd b/spacy/ml/parser_model.pxd similarity index 88% rename from spacy/syntax/_parser_model.pxd rename to spacy/ml/parser_model.pxd index 15befb372..6582b3468 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/ml/parser_model.pxd @@ -1,8 +1,6 @@ from libc.string cimport memset, memcpy -from libc.stdlib cimport calloc, free, realloc -from ..typedefs cimport weight_t, class_t, hash_t - -from ._state cimport StateC +from ..typedefs cimport weight_t, hash_t +from ..pipeline._parser_internals._state cimport StateC cdef struct SizesC: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/ml/parser_model.pyx similarity index 97% rename from spacy/syntax/_parser_model.pyx rename to spacy/ml/parser_model.pyx index 7acee5efd..da937ca4f 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -1,29 +1,18 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel cimport numpy as np from libc.math cimport exp -from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy import numpy import numpy.random -from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop +from thinc.api import Model, CupyOps, NumpyOps -from ..typedefs cimport weight_t, class_t, hash_t -from ..tokens.doc cimport Doc -from .stateclass cimport StateClass -from .transition_system cimport Transition - -from ..compat import copy_array -from ..errors import Errors, TempErrors -from ..util import link_vectors_to_models, create_default_optimizer from .. import util -from . 
import nonproj +from ..typedefs cimport weight_t, class_t, hash_t +from ..pipeline._parser_internals.stateclass cimport StateClass cdef WeightsC get_c_weights(model) except *: diff --git a/spacy/ml/spacy_vectors.py b/spacy/ml/spacy_vectors.py deleted file mode 100644 index 2a4988494..000000000 --- a/spacy/ml/spacy_vectors.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy -from thinc.api import Model, Unserializable - - -def SpacyVectors(vectors) -> Model: - attrs = {"vectors": Unserializable(vectors)} - model = Model("spacy_vectors", forward, attrs=attrs) - return model - - -def forward(model, docs, is_train: bool): - batch = [] - vectors = model.attrs["vectors"].obj - for doc in docs: - indices = numpy.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in vectors.key2row: - indices[i] = vectors.key2row[word.orth] - else: - indices[i] = 0 - batch_vectors = vectors.data[indices] - batch.append(batch_vectors) - - def backprop(dY): - return None - - return batch, backprop diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py new file mode 100644 index 000000000..41afdbf80 --- /dev/null +++ b/spacy/ml/staticvectors.py @@ -0,0 +1,100 @@ +from typing import List, Tuple, Callable, Optional, cast + +from thinc.initializers import glorot_uniform_init +from thinc.util import partial +from thinc.types import Ragged, Floats2d, Floats1d +from thinc.api import Model, Ops, registry + +from ..tokens import Doc + + +@registry.layers("spacy.StaticVectors.v1") +def StaticVectors( + nO: Optional[int] = None, + nM: Optional[int] = None, + *, + dropout: Optional[float] = None, + init_W: Callable = glorot_uniform_init, + key_attr: str = "ORTH" +) -> Model[List[Doc], Ragged]: + """Embed Doc objects with their vocab's vectors table, applying a learned + linear projection to control the dimensionality. If a dropout rate is + specified, the dropout is applied per dimension over the whole batch. 
+ """ + return Model( + "static_vectors", + forward, + init=partial(init, init_W), + params={"W": None}, + attrs={"key_attr": key_attr, "dropout_rate": dropout}, + dims={"nO": nO, "nM": nM}, + ) + + +def forward( + model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool +) -> Tuple[Ragged, Callable]: + if not len(docs): + return _handle_empty(model.ops, model.get_dim("nO")) + key_attr = model.attrs["key_attr"] + W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) + V = cast(Floats2d, docs[0].vocab.vectors.data) + mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) + rows = model.ops.flatten( + [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] + ) + output = Ragged( + model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True), + model.ops.asarray([len(doc) for doc in docs], dtype="i"), + ) + if mask is not None: + output.data *= mask + + def backprop(d_output: Ragged) -> List[Doc]: + if mask is not None: + d_output.data *= mask + model.inc_grad( + "W", + model.ops.gemm(d_output.data, model.ops.as_contig(V[rows]), trans1=True), + ) + return [] + + return output, backprop + + +def init( + init_W: Callable, + model: Model[List[Doc], Ragged], + X: Optional[List[Doc]] = None, + Y: Optional[Ragged] = None, +) -> Model[List[Doc], Ragged]: + nM = model.get_dim("nM") if model.has_dim("nM") else None + nO = model.get_dim("nO") if model.has_dim("nO") else None + if X is not None and len(X): + nM = X[0].vocab.vectors.data.shape[1] + if Y is not None: + nO = Y.data.shape[1] + + if nM is None: + raise ValueError( + "Cannot initialize StaticVectors layer: nM dimension unset. " + "This dimension refers to the width of the vectors table." + ) + if nO is None: + raise ValueError( + "Cannot initialize StaticVectors layer: nO dimension unset. " + "This dimension refers to the output width, after the linear " + "projection has been applied." 
+ ) + model.set_dim("nM", nM) + model.set_dim("nO", nO) + model.set_param("W", init_W(model.ops, (nO, nM))) + return model + + +def _handle_empty(ops: Ops, nO: int): + return Ragged(ops.alloc2f(0, nO), ops.alloc1i(0)), lambda d_ragged: [] + + +def _get_drop_mask(ops: Ops, nO: int, rate: Optional[float]) -> Optional[Floats1d]: + return ops.get_dropout_mask((nO,), rate) if rate is not None else None diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 39d4b0a14..44f125a04 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,5 @@ from thinc.api import Model, noop, use_ops, Linear -from ..syntax._parser_model import ParserStepModel +from .parser_model import ParserStepModel def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): diff --git a/spacy/syntax/__init__.py b/spacy/pipeline/_parser_internals/__init__.py similarity index 100% rename from spacy/syntax/__init__.py rename to spacy/pipeline/_parser_internals/__init__.py diff --git a/spacy/syntax/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd similarity index 98% rename from spacy/syntax/_state.pxd rename to spacy/pipeline/_parser_internals/_state.pxd index fef4f0c92..0d0dd8c05 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,15 +1,14 @@ -from libc.string cimport memcpy, memset, memmove -from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memcpy, memset +from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from murmurhash.mrmr cimport hash64 -from ..vocab cimport EMPTY_LEXEME -from ..structs cimport TokenC, SpanC -from ..lexeme cimport Lexeme -from ..symbols cimport punct -from ..attrs cimport IS_SPACE -from ..typedefs cimport attr_t +from ...vocab cimport EMPTY_LEXEME +from ...structs cimport TokenC, SpanC +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...typedefs cimport attr_t cdef inline bint is_space_token(const TokenC* token) nogil: diff --git a/spacy/syntax/_state.pyx b/spacy/pipeline/_parser_internals/_state.pyx similarity index 100% rename from spacy/syntax/_state.pyx rename to spacy/pipeline/_parser_internals/_state.pyx diff --git a/spacy/syntax/arc_eager.pxd b/spacy/pipeline/_parser_internals/arc_eager.pxd similarity index 65% rename from spacy/syntax/arc_eager.pxd rename to spacy/pipeline/_parser_internals/arc_eager.pxd index a59be716a..e05a34f56 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/pipeline/_parser_internals/arc_eager.pxd @@ -1,8 +1,6 @@ -from cymem.cymem cimport Pool - from .stateclass cimport StateClass -from ..typedefs cimport weight_t, attr_t -from .transition_system cimport TransitionSystem, Transition +from ...typedefs cimport weight_t, attr_t +from .transition_system cimport Transition, TransitionSystem cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx similarity index 98% rename from spacy/syntax/arc_eager.pyx rename to spacy/pipeline/_parser_internals/arc_eager.pyx index 6e63859f0..7db8aae0f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -1,24 +1,17 @@ # cython: profile=True, cdivision=True, infer_types=True -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool, Address from libc.stdint cimport int32_t from collections import defaultdict, Counter -import json -from ..typedefs cimport hash_t, attr_t -from ..strings 
cimport hash_string -from ..structs cimport TokenC -from ..tokens.doc cimport Doc, set_children_from_heads +from ...typedefs cimport hash_t, attr_t +from ...strings cimport hash_string +from ...structs cimport TokenC +from ...tokens.doc cimport Doc, set_children_from_heads +from ...gold.example cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold.example cimport Example - -from ..errors import Errors -from .nonproj import is_nonproj_tree -from . import nonproj - # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 diff --git a/spacy/syntax/ner.pxd b/spacy/pipeline/_parser_internals/ner.pxd similarity index 58% rename from spacy/syntax/ner.pxd rename to spacy/pipeline/_parser_internals/ner.pxd index 989593a92..2264a1518 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/pipeline/_parser_internals/ner.pxd @@ -1,6 +1,4 @@ from .transition_system cimport TransitionSystem -from .transition_system cimport Transition -from ..typedefs cimport attr_t cdef class BiluoPushDown(TransitionSystem): diff --git a/spacy/syntax/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx similarity index 98% rename from spacy/syntax/ner.pyx rename to spacy/pipeline/_parser_internals/ner.pyx index c4125bbdf..2570ccdee 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -2,17 +2,14 @@ from collections import Counter from libc.stdint cimport int32_t from cymem.cymem cimport Pool -from ..typedefs cimport weight_t +from ...typedefs cimport weight_t, attr_t +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...gold.example cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport Transition -from .transition_system cimport do_func_t -from ..lexeme cimport Lexeme -from ..attrs cimport IS_SPACE -from ..gold.iob_utils import biluo_tags_from_offsets -from ..gold.example cimport Example - -from ..errors import Errors +from .transition_system cimport Transition, do_func_t cdef enum: diff --git a/spacy/syntax/nonproj.pxd b/spacy/pipeline/_parser_internals/nonproj.pxd similarity index 100% rename from spacy/syntax/nonproj.pxd rename to spacy/pipeline/_parser_internals/nonproj.pxd diff --git a/spacy/syntax/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx similarity index 98% rename from spacy/syntax/nonproj.pyx rename to spacy/pipeline/_parser_internals/nonproj.pyx index 5ccb11f37..8f5fdaa71 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -5,9 +5,9 @@ scheme. 
""" from copy import copy -from ..tokens.doc cimport Doc, set_children_from_heads +from ...tokens.doc cimport Doc, set_children_from_heads -from ..errors import Errors +from ...errors import Errors DELIMITER = '||' diff --git a/spacy/syntax/stateclass.pxd b/spacy/pipeline/_parser_internals/stateclass.pxd similarity index 95% rename from spacy/syntax/stateclass.pxd rename to spacy/pipeline/_parser_internals/stateclass.pxd index 567982a3f..1d9f05538 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/pipeline/_parser_internals/stateclass.pxd @@ -1,12 +1,8 @@ -from libc.string cimport memcpy, memset - from cymem.cymem cimport Pool -cimport cython -from ..structs cimport TokenC, SpanC -from ..typedefs cimport attr_t +from ...structs cimport TokenC, SpanC +from ...typedefs cimport attr_t -from ..vocab cimport EMPTY_LEXEME from ._state cimport StateC diff --git a/spacy/syntax/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx similarity index 97% rename from spacy/syntax/stateclass.pyx rename to spacy/pipeline/_parser_internals/stateclass.pyx index e472e9861..880cf6cc5 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True import numpy -from ..tokens.doc cimport Doc +from ...tokens.doc cimport Doc cdef class StateClass: diff --git a/spacy/syntax/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd similarity index 91% rename from spacy/syntax/transition_system.pxd rename to spacy/pipeline/_parser_internals/transition_system.pxd index 836c08168..ba4c33814 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -1,11 +1,11 @@ from cymem.cymem cimport Pool -from ..typedefs cimport attr_t, weight_t -from ..structs cimport TokenC -from ..strings cimport StringStore +from ...typedefs cimport attr_t, weight_t +from ...structs cimport TokenC +from ...strings cimport StringStore +from ...gold.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC -from ..gold.example cimport Example cdef struct Transition: diff --git a/spacy/syntax/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx similarity index 97% rename from spacy/syntax/transition_system.pyx rename to spacy/pipeline/_parser_internals/transition_system.pyx index 17166dcf5..7694e7f34 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,19 +1,17 @@ # cython: infer_types=True from __future__ import print_function -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from collections import Counter import srsly -from ..typedefs cimport weight_t -from ..tokens.doc cimport Doc -from ..structs cimport TokenC +from ...typedefs cimport weight_t, attr_t +from ...tokens.doc cimport Doc +from ...structs cimport TokenC from .stateclass cimport StateClass -from ..typedefs cimport attr_t -from ..errors import Errors -from .. import util +from ...errors import Errors +from ... 
import util cdef weight_t MIN_SCORE = -90000 diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a952385b4..65ffbbe50 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,13 +1,13 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.arc_eager cimport ArcEager +from .transition_parser cimport Parser +from ._parser_internals.arc_eager cimport ArcEager from .functions import merge_subtokens from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..scorer import Scorer diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 4165dab83..742b349e5 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -222,9 +222,9 @@ class EntityLinker(Pipe): set_dropout_rate(self.model, drop) if not sentence_docs: warnings.warn(Warnings.W093.format(name="Entity Linker")) - return 0.0 + return losses sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss( + loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) bp_context(d_scores) @@ -235,7 +235,7 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return losses - def get_similarity_loss(self, examples: Iterable[Example], sentence_encodings): + def get_loss(self, examples: Iterable[Example], sentence_encodings): entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) @@ -247,7 +247,7 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") if sentence_encodings.shape != entity_encodings.shape: err = Errors.E147.format( - method="get_similarity_loss", msg="gold entities do not match up" + method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) gradients = self.distance.get_grad(sentence_encodings, entity_encodings) @@ -337,13 +337,13 @@ class EntityLinker(Pipe): final_kb_ids.append(candidates[0].entity_) else: random.shuffle(candidates) - # this will set all prior probabilities to 0 if they should be excluded from the model + # set all prior probabilities to 0 if incl_prior=False prior_probs = xp.asarray( [c.prior_prob for c in candidates] ) if not self.cfg.get("incl_prior"): prior_probs = xp.asarray( - [0.0 for c in candidates] + [0.0 for _ in candidates] ) scores = prior_probs # add in similarity from the context @@ -387,7 +387,7 @@ class EntityLinker(Pipe): docs (Iterable[Doc]): The documents to modify. kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. - DOCS: https://spacy.io/api/entitylinker#predict + DOCS: https://spacy.io/api/entitylinker#set_annotations """ count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): @@ -400,7 +400,9 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = kb_id - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Serialize the pipe to disk. path (str / Path): Path to a directory. 
@@ -417,7 +419,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 96a5d3d67..8f280547e 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,6 @@ class EntityRuler: overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. - RETURNS (EntityRuler): The newly constructed object. DOCS: https://spacy.io/api/entityruler#init """ @@ -316,7 +315,7 @@ class EntityRuler: return Scorer.score_spans(examples, "ents", **kwargs) def from_bytes( - self, patterns_bytes: bytes, exclude: Iterable[str] = tuple() + self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a bytestring. @@ -340,7 +339,7 @@ class EntityRuler: self.add_patterns(cfg) return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the entity ruler patterns to a bytestring. RETURNS (bytes): The serialized patterns. @@ -356,7 +355,7 @@ class EntityRuler: return srsly.msgpack_dumps(serial) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. @@ -392,7 +391,9 @@ class EntityRuler: from_disk(path, deserializers_patterns, {}) return self - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). 
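A minimal round-trip sketch (not part of the patch) for the keyword-only serialization signatures changed above for EntityLinker and EntityRuler (`to_disk`, `from_disk`, `to_bytes`, `from_bytes`); the blank pipeline and the pattern are illustrative only:

# Sketch: `exclude` must now be passed by keyword; positional use raises TypeError.
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])

data = ruler.to_bytes(exclude=[])                     # keyword-only exclude
restored = EntityRuler(nlp).from_bytes(data, exclude=[])
assert len(restored) == len(ruler)                    # same number of patterns
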
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 8a6a5188f..501884873 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -58,7 +58,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: """ # TODO: make stateful component with "label" config merger = Matcher(doc.vocab) - merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) + merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]]) matches = merger(doc) spans = filter_spans([doc[start : end + 1] for _, start, end in matches]) with doc.retokenize() as retokenizer: diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a6be129ba..18673f85b 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -22,17 +22,23 @@ default_model_config = """ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.HashCharEmbedCNN.v1" -pretrained_vectors = null +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.CharacterEmbed.v1" width = 128 -depth = 4 -embed_size = 7000 -window_size = 1 -maxout_pieces = 3 +rows = 7000 nM = 64 nC = 8 -dropout = null + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 128 +depth = 4 +window_size = 1 +maxout_pieces = 3 """ + DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] @@ -149,7 +155,6 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.set_output(len(self.labels)) self.model.initialize() - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -160,7 +165,7 @@ class Morphologizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Morphologizer.predict. - DOCS: https://spacy.io/api/morphologizer#predict + DOCS: https://spacy.io/api/morphologizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -230,7 +235,7 @@ class Morphologizer(Tagger): "morph", **kwargs)) return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -244,7 +249,7 @@ class Morphologizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -267,7 +272,7 @@ class Morphologizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -282,7 +287,7 @@ class Morphologizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. 
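A sketch (not part of the patch) of how the `[model.tok2vec.embed]` / `[model.tok2vec.encode]` blocks in the Morphologizer config above compose into a single tok2vec model in Python, using the architectures registered elsewhere in this diff; parameter values mirror that config and are otherwise illustrative:

# Assumes the signatures shown in this diff: CharacterEmbed(width, rows, nM, nC),
# MaxoutWindowEncoder(width, window_size, maxout_pieces, depth), and
# build_Tok2Vec_model(embed, encode).
from spacy.ml.models.tok2vec import build_Tok2Vec_model, CharacterEmbed, MaxoutWindowEncoder
from spacy.tokens import Doc
from spacy.vocab import Vocab

tok2vec = build_Tok2Vec_model(
    CharacterEmbed(width=128, rows=7000, nM=64, nC=8),
    MaxoutWindowEncoder(width=128, depth=4, window_size=1, maxout_pieces=3),
)
tok2vec.initialize()
# The model maps a batch of Docs to one array of token vectors per Doc.
vectors, backprop = tok2vec.begin_update([Doc(Vocab(), words=["hello", "world"])])
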
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 4945afe4f..d85030adb 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional import numpy -from thinc.api import CosineDistance, to_categorical, to_categorical, Model, Config +from thinc.api import CosineDistance, to_categorical, Model, Config from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc @@ -9,9 +9,8 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..attrs import POS, ID -from ..util import link_vectors_to_models from ..errors import Errors @@ -91,7 +90,6 @@ class MultitaskObjective(Tagger): if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() - link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -179,7 +177,6 @@ class ClozeMultitask(Pipe): pass def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None): - link_vectors_to_models(self.vocab) self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) @@ -222,3 +219,6 @@ class ClozeMultitask(Pipe): if losses is not None: losses[self.name] += loss + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 7ee4448fb..7f4fb8363 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,9 +1,9 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.ner cimport BiluoPushDown +from .transition_parser cimport Parser +from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer diff --git a/spacy/syntax/__init__.pxd b/spacy/pipeline/nn_parser.pyx similarity index 100% rename from spacy/syntax/__init__.pxd rename to spacy/pipeline/nn_parser.pyx diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd new file mode 100644 index 000000000..bb97f79d0 --- /dev/null +++ b/spacy/pipeline/pipe.pxd @@ -0,0 +1,2 @@ +cdef class Pipe: + cdef public str name diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index f8ca28724..1a94905a2 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -3,12 +3,12 @@ import srsly from ..tokens.doc cimport Doc -from ..util import link_vectors_to_models, create_default_optimizer +from ..util import create_default_optimizer from ..errors import Errors from .. import util -class Pipe: +cdef class Pipe: """This class is a base class and not instantiated directly. Trainable pipeline components like the EntityRecognizer or TextCategorizer inherit from it and it defines the interface that components should follow to @@ -17,8 +17,6 @@ class Pipe: DOCS: https://spacy.io/api/pipe """ - name = None - def __init__(self, vocab, model, name, **cfg): """Initialize a pipeline component. @@ -32,7 +30,9 @@ class Pipe: raise NotImplementedError def __call__(self, Doc doc): - """Add context-sensitive embeddings to the Doc.tensor attribute. + """Apply the pipe to one document. 
The document is modified in place, + and returned. This usually happens under the hood when the nlp object + is called on a text and all components are applied to the Doc. docs (Doc): The Doc to preocess. RETURNS (Doc): The processed Doc. @@ -74,9 +74,9 @@ class Pipe: """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - tokvecses: The tensors to set, produced by Pipe.predict. + scores: The scores to assign. - DOCS: https://spacy.io/api/pipe#predict + DOCS: https://spacy.io/api/pipe#set_annotations """ raise NotImplementedError @@ -145,8 +145,6 @@ class Pipe: DOCS: https://spacy.io/api/pipe#begin_training """ self.model.initialize() - if hasattr(self, "vocab"): - link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -178,7 +176,7 @@ class Pipe: """ return {} - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -193,7 +191,7 @@ class Pipe: serialize["vocab"] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -216,7 +214,7 @@ class Pipe: util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -230,7 +228,7 @@ class Pipe: serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 8203249d7..be4351212 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -162,7 +162,7 @@ class Sentencizer(Pipe): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. RETURNS (bytes): The serialized object. @@ -171,7 +171,7 @@ class Sentencizer(Pipe): """ return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the sentencizer from a bytestring. bytes_data (bytes): The data to load. @@ -183,7 +183,7 @@ class Sentencizer(Pipe): self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the sentencizer to disk. DOCS: https://spacy.io/api/sentencizer#to_disk @@ -193,7 +193,7 @@ class Sentencizer(Pipe): srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the sentencizer from disk. 
DOCS: https://spacy.io/api/sentencizer#from_disk @@ -203,3 +203,9 @@ class Sentencizer(Pipe): cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self + + def get_loss(self, examples, scores): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 743ceb32b..f826f21de 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -76,7 +76,7 @@ class SentenceRecognizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. - DOCS: https://spacy.io/api/sentencerecognizer#predict + DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -109,7 +109,7 @@ class SentenceRecognizer(Tagger): for eg in examples: eg_truth = [] for x in eg.get_aligned("sent_start"): - if x == None: + if x is None: eg_truth.append(None) elif x == 1: eg_truth.append(labels[1]) @@ -138,7 +138,6 @@ class SentenceRecognizer(Tagger): """ self.set_output(len(self.labels)) self.model.initialize() - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -157,7 +156,7 @@ class SentenceRecognizer(Tagger): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -171,7 +170,7 @@ class SentenceRecognizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -194,7 +193,7 @@ class SentenceRecognizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -209,7 +208,7 @@ class SentenceRecognizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. 
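An illustrative sketch (not part of the patch) of the two-argument `Matcher.add` signature used in the spacy/pipeline/functions.py hunk above and exercised in the matcher tests further down: patterns are passed as a list of pattern lists, the callback (if any) via `on_match=`, and greedy filtering is opted into per rule.

# Assumes a blank English pipeline; the rule name and text are illustrative.
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
matcher.add("AAA", [pattern], greedy="LONGEST")   # old form: matcher.add("AAA", None, pattern)
doc = nlp("A A A B")
matches = matcher(doc)   # [(match_id, start, end)], keeping only the longest non-overlapping span
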
diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index ec7ab6b7a..44e1182c1 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -131,8 +131,6 @@ class SimpleNER(Pipe): return losses def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]: - loss = 0 - d_scores = [] truths = [] for eg in examples: tags = eg.get_aligned("TAG", as_string=True) @@ -159,7 +157,6 @@ class SimpleNER(Pipe): if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples - labels = _get_labels(get_examples()) for label in _get_labels(get_examples()): self.add_label(label) labels = self.labels @@ -168,7 +165,6 @@ class SimpleNER(Pipe): self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - util.link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( names=self.get_tag_names(), normalize=True, missing_value=None ) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c52a7889b..f2e06efed 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -145,7 +145,7 @@ class Tagger(Pipe): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Tagger.predict. - DOCS: https://spacy.io/api/tagger#predict + DOCS: https://spacy.io/api/tagger#set_annotations """ if isinstance(docs, Doc): docs = [docs] @@ -318,7 +318,6 @@ class Tagger(Pipe): self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -370,7 +369,7 @@ class Tagger(Pipe): scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return scores - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -388,7 +387,7 @@ class Tagger(Pipe): serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -424,7 +423,7 @@ class Tagger(Pipe): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -443,7 +442,7 @@ class Tagger(Pipe): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 2aaa4a769..639ce5514 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -163,7 +163,7 @@ class TextCategorizer(Pipe): docs (Iterable[Doc]): The documents to modify. scores: The scores to set, produced by TextCategorizer.predict. 
- DOCS: https://spacy.io/api/textcategorizer#predict + DOCS: https://spacy.io/api/textcategorizer#set_annotations """ for i, doc in enumerate(docs): for j, label in enumerate(self.labels): @@ -238,8 +238,11 @@ class TextCategorizer(Pipe): DOCS: https://spacy.io/api/textcategorizer#rehearse """ + + if losses is not None: + losses.setdefault(self.name, 0.0) if self._rehearsal_model is None: - return + return losses try: docs = [eg.predicted for eg in examples] except AttributeError: @@ -250,7 +253,7 @@ class TextCategorizer(Pipe): raise TypeError(err) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. - return + return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) target = self._rehearsal_model(examples) @@ -259,7 +262,6 @@ class TextCategorizer(Pipe): if sgd is not None: self.model.finish_update(sgd) if losses is not None: - losses.setdefault(self.name, 0.0) losses[self.name] += (gradient ** 2).sum() return losses @@ -356,7 +358,6 @@ class TextCategorizer(Pipe): docs = [Doc(Vocab(), words=["hello"])] truths, _ = self._examples_to_truth(examples) self.set_output(len(self.labels)) - util.link_vectors_to_models(self.vocab) self.model.initialize(X=docs, Y=truths) if sgd is None: sgd = self.create_optimizer() diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5bda12d1b..31643a7d3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -7,7 +7,7 @@ from ..tokens import Doc from ..vocab import Vocab from ..language import Language from ..errors import Errors -from ..util import link_vectors_to_models, minibatch +from ..util import minibatch default_model_config = """ @@ -109,7 +109,7 @@ class Tok2Vec(Pipe): docs (Iterable[Doc]): The documents to modify. tokvecses: The tensors to set, produced by Tok2Vec.predict. 
- DOCS: https://spacy.io/api/tok2vec#predict + DOCS: https://spacy.io/api/tok2vec#set_annotations """ for doc, tokvecs in zip(docs, tokvecses): assert tokvecs.shape[0] == len(doc) @@ -196,9 +196,11 @@ class Tok2Vec(Pipe): DOCS: https://spacy.io/api/tok2vec#begin_training """ - docs = [Doc(Vocab(), words=["hello"])] + docs = [Doc(self.vocab, words=["hello"])] self.model.initialize(X=docs) - link_vectors_to_models(self.vocab) + + def add_label(self, label): + raise NotImplementedError class Tok2VecListener(Model): diff --git a/spacy/syntax/nn_parser.pxd b/spacy/pipeline/transition_parser.pxd similarity index 62% rename from spacy/syntax/nn_parser.pxd rename to spacy/pipeline/transition_parser.pxd index 7840ec27a..e594a3098 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/pipeline/transition_parser.pxd @@ -1,16 +1,15 @@ -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem +from cymem.cymem cimport Pool + from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC -from ._parser_model cimport WeightsC, ActivationsC, SizesC +from .pipe cimport Pipe +from ._parser_internals.transition_system cimport Transition, TransitionSystem +from ._parser_internals._state cimport StateC +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC -cdef class Parser: +cdef class Parser(Pipe): cdef readonly Vocab vocab cdef public object model - cdef public str name cdef public object _rehearsal_model cdef readonly TransitionSystem moves cdef readonly object cfg diff --git a/spacy/syntax/nn_parser.pyx b/spacy/pipeline/transition_parser.pyx similarity index 94% rename from spacy/syntax/nn_parser.pyx rename to spacy/pipeline/transition_parser.pyx index 5313ec9bd..b14a55cb4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,42 +1,32 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel +from __future__ import print_function +from cymem.cymem cimport Pool cimport numpy as np from itertools import islice -from cpython.ref cimport PyObject, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno -from libc.math cimport exp from libcpp.vector cimport vector -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.stdlib cimport calloc, free -from cymem.cymem cimport Pool -from thinc.backends.linalg cimport Vec, VecVec -from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops -from thinc.api import get_array_module, zero_init, set_dropout_rate -from itertools import islice import srsly + +from ._parser_internals.stateclass cimport StateClass +from ..ml.parser_model cimport alloc_activations, free_activations +from ..ml.parser_model cimport predict_states, arg_max_if_valid +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss +from ..ml.parser_model cimport get_c_weights, get_c_sizes + +from ..tokens.doc cimport Doc +from ..errors import Errors, Warnings +from .. 
import util +from ..util import create_default_optimizer + +from thinc.api import set_dropout_rate import numpy.random import numpy import warnings -from ..tokens.doc cimport Doc -from ..typedefs cimport weight_t, class_t, hash_t -from ._parser_model cimport alloc_activations, free_activations -from ._parser_model cimport predict_states, arg_max_if_valid -from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss -from ._parser_model cimport get_c_weights, get_c_sizes -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system cimport Transition -from ..util import link_vectors_to_models, create_default_optimizer, registry -from ..compat import copy_array -from ..errors import Errors, Warnings -from .. import util -from . import nonproj - - -cdef class Parser: +cdef class Parser(Pipe): """ Base class of the DependencyParser and EntityRecognizer. """ @@ -107,7 +97,7 @@ cdef class Parser: @property def tok2vec(self): - '''Return the embedding and convolutional layer of the model.''' + """Return the embedding and convolutional layer of the model.""" return self.model.get_ref("tok2vec") @property @@ -138,13 +128,13 @@ cdef class Parser: raise NotImplementedError def init_multitask_objectives(self, get_examples, pipeline, **cfg): - '''Setup models for secondary objectives, to benefit from multi-task + """Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. For instance, the dependency parser can benefit from sharing an input representation with a label prediction model. These auxiliary models are discarded after training. - ''' + """ pass def use_params(self, params): @@ -456,7 +446,6 @@ cdef class Parser: self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - link_vectors_to_models(self.vocab) return sgd def to_disk(self, path, exclude=tuple()): diff --git a/spacy/schemas.py b/spacy/schemas.py index 3f3c01f22..971d283e2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -171,17 +171,6 @@ class ModelMetaSchema(BaseModel): # fmt: on -# JSON training format - - -class TrainingSchema(BaseModel): - # TODO: write - - class Config: - title = "Schema for training data in spaCy's JSON format" - extra = "forbid" - - # Config schema # We're not setting any defaults here (which is too messy) and are making all # fields required, so we can raise validation errors for missing values. To diff --git a/spacy/scorer.py b/spacy/scorer.py index 24009aec6..40a819e7c 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -84,7 +84,6 @@ class Scorer: **cfg, ) -> None: """Initialize the Scorer. - RETURNS (Scorer): The newly created object. DOCS: https://spacy.io/api/scorer#init """ diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9e584ce8a..136eda9ff 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -97,7 +97,6 @@ cdef class StringStore: """Create the StringStore. strings (iterable): A sequence of unicode strings to add to the store. - RETURNS (StringStore): The newly constructed object. 
""" self.mem = Pool() self._map = PreshMap() diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 98542e80f..bcb224bd3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -63,18 +63,11 @@ def test_matcher_len_contains(matcher): assert "TEST2" not in matcher -def test_matcher_add_new_old_api(en_vocab): +def test_matcher_add_new_api(en_vocab): doc = Doc(en_vocab, words=["a", "b"]) patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]] matcher = Matcher(en_vocab) - matcher.add("OLD_API", None, *patterns) - assert len(matcher(doc)) == 2 - matcher = Matcher(en_vocab) on_match = Mock() - matcher.add("OLD_API_CALLBACK", on_match, *patterns) - assert len(matcher(doc)) == 2 - assert on_match.call_count == 2 - # New API: add(key: str, patterns: List[List[dict]], on_match: Callable) matcher = Matcher(en_vocab) matcher.add("NEW_API", patterns) assert len(matcher(doc)) == 2 @@ -176,7 +169,7 @@ def test_matcher_match_zero_plus(matcher): def test_matcher_match_one_plus(matcher): control = Matcher(matcher.vocab) - control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}]) + control.add("BasicPhilippe", [[{"ORTH": "Philippe"}]]) doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index a2b2cd83f..8f4c13471 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -7,18 +7,10 @@ from spacy.tokens import Doc, Span pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] -pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}] +pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] -pattern4 = [ - {"ORTH": "B"}, - {"ORTH": "A", "OP": "*"}, - {"ORTH": "B"}, -] -pattern5 = [ - {"ORTH": "B", "OP": "*"}, - {"ORTH": "A", "OP": "*"}, - {"ORTH": "B"}, -] +pattern4 = [{"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] +pattern5 = [{"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] re_pattern1 = "AA*" re_pattern2 = "A*A" @@ -26,10 +18,16 @@ re_pattern3 = "AA" re_pattern4 = "BA*B" re_pattern5 = "B*A*B" +longest1 = "A A A A A" +longest2 = "A A A A A" +longest3 = "A A" +longest4 = "B A A A A A B" # "FIRST" would be "B B" +longest5 = "B B A A A A A B" + @pytest.fixture def text(): - return "(ABBAAAAAB)." + return "(BBAAAAAB)." 
@pytest.fixture @@ -41,25 +39,63 @@ def doc(en_tokenizer, text): @pytest.mark.parametrize( "pattern,re_pattern", [ - pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()), - pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()), - pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()), + (pattern1, re_pattern1), + (pattern2, re_pattern2), + (pattern3, re_pattern3), (pattern4, re_pattern4), - pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()), + (pattern5, re_pattern5), ], ) -def test_greedy_matching(doc, text, pattern, re_pattern): - """Test that the greedy matching behavior of the * op is consistant with +def test_greedy_matching_first(doc, text, pattern, re_pattern): + """Test that the greedy matching behavior "FIRST" is consistent with other re implementations.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, [pattern]) + matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] - for match, re_match in zip(matches, re_matches): - assert match[1:] == re_match + for (key, m_s, m_e), (re_s, re_e) in zip(matches, re_matches): + # matching the string, not the exact position + assert doc[m_s:m_e].text == doc[re_s:re_e].text + + +@pytest.mark.parametrize( + "pattern,longest", + [ + (pattern1, longest1), + (pattern2, longest2), + (pattern3, longest3), + (pattern4, longest4), + (pattern5, longest5), + ], +) +def test_greedy_matching_longest(doc, text, pattern, longest): + """Test the "LONGEST" greedy matching behavior""" + matcher = Matcher(doc.vocab) + matcher.add("RULE", [pattern], greedy="LONGEST") + matches = matcher(doc) + for (key, s, e) in matches: + assert doc[s:e].text == longest + + +def test_greedy_matching_longest_first(en_tokenizer): + """Test that "LONGEST" matching prefers the first of two equally long matches""" + doc = en_tokenizer(" ".join("CCC")) + matcher = Matcher(doc.vocab) + pattern = [{"ORTH": "C"}, {"ORTH": "C"}] + matcher.add("RULE", [pattern], greedy="LONGEST") + matches = matcher(doc) + # out of 0-2 and 1-3, the first should be picked + assert len(matches) == 1 + assert matches[0][1] == 0 + assert matches[0][2] == 2 + + +def test_invalid_greediness(doc, text): + matcher = Matcher(doc.vocab) + with pytest.raises(ValueError): + matcher.add("RULE", [pattern1], greedy="GREEDY") -@pytest.mark.xfail @pytest.mark.parametrize( "pattern,re_pattern", [ @@ -74,7 +110,7 @@ def test_match_consuming(doc, text, pattern, re_pattern): """Test that matcher.__call__ consumes tokens on a match similar to re.findall.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, [pattern]) + matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] assert len(matches) == len(re_matches) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 77e142215..fd1880030 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -4,8 +4,8 @@ from spacy import registry from spacy.gold import Example from spacy.pipeline import DependencyParser from spacy.tokens import Doc -from spacy.syntax.nonproj import projectivize -from spacy.syntax.arc_eager import ArcEager +from spacy.pipeline._parser_internals.nonproj import projectivize +from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL diff --git a/spacy/tests/parser/test_ner.py 
b/spacy/tests/parser/test_ner.py index 4a6bf73a5..013ae6b7e 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -5,7 +5,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups -from spacy.syntax.ner import BiluoPushDown +from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.gold import Example from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index feae52f7f..6594c7e78 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -3,8 +3,8 @@ import pytest from spacy import registry from spacy.gold import Example from spacy.vocab import Vocab -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.nn_parser import Parser +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.pipeline.transition_parser import Parser from spacy.tokens.doc import Doc from thinc.api import Model from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 496ec7e03..5bdebd0ca 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,7 +1,7 @@ import pytest -from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc -from spacy.syntax.nonproj import is_nonproj_tree -from spacy.syntax import nonproj +from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc +from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree +from spacy.pipeline._parser_internals import nonproj from ..util import get_doc diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index ac0867189..cf4e402e2 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -9,7 +9,6 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.compat import pickle -from spacy.util import link_vectors_to_models import numpy import random @@ -190,7 +189,6 @@ def test_issue2871(): _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) vocab.vectors.name = "dummy_vectors" - link_vectors_to_models(vocab) assert vocab["dog"].rank == 0 assert vocab["cat"].rank == 1 assert vocab["SUFFIX"].rank == 2 diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 90a79994e..ce35add42 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -5,6 +5,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.util import registry, deep_merge_configs, load_model_from_config from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model +from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from ..util import make_tempdir @@ -40,7 +41,7 @@ factory = "tagger" @architectures = "spacy.Tagger.v1" [components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" +@architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model:width} """ @@ -68,18 +69,18 @@ dropout = null @registry.architectures.register("my_test_parser") def my_parser(): tok2vec = build_Tok2Vec_model( - width=321, - embed_size=5432, - pretrained_vectors=None, - window_size=3, - maxout_pieces=4, - 
subword_features=True, - char_embed=True, - nM=64, - nC=8, - conv_depth=2, - bilstm_depth=0, - dropout=None, + MultiHashEmbed( + width=321, + rows=5432, + also_embed_subwords=True, + also_use_static_vectors=False + ), + MaxoutWindowEncoder( + width=321, + window_size=3, + maxout_pieces=4, + depth=2 + ) ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index fc1988fcd..4c38ea6c6 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -5,12 +5,32 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy -from spacy.ml.models import build_Tok2Vec_model +from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES +def get_textcat_kwargs(): + return { + "width": 64, + "embed_size": 2000, + "pretrained_vectors": None, + "exclusive_classes": False, + "ngram_size": 1, + "window_size": 1, + "conv_depth": 2, + "dropout": None, + "nO": 7, + } + +def get_textcat_cnn_kwargs(): + return { + "tok2vec": test_tok2vec(), + "exclusive_classes": False, + "nO": 13, + } + def get_all_params(model): params = [] for node in model.walk(): @@ -35,50 +55,34 @@ def get_gradient(model, Y): raise ValueError(f"Could not get gradient for type {type(Y)}") +def get_tok2vec_kwargs(): + # This actually creates models, so seems best to put it in a function. + return { + "embed": MultiHashEmbed( + width=32, + rows=500, + also_embed_subwords=True, + also_use_static_vectors=False + ), + "encode": MaxoutWindowEncoder( + width=32, + depth=2, + maxout_pieces=2, + window_size=1, + ) + } + + def test_tok2vec(): - return build_Tok2Vec_model(**TOK2VEC_KWARGS) - - -TOK2VEC_KWARGS = { - "width": 96, - "embed_size": 2000, - "subword_features": True, - "char_embed": False, - "conv_depth": 4, - "bilstm_depth": 0, - "maxout_pieces": 4, - "window_size": 1, - "dropout": 0.1, - "nM": 0, - "nC": 0, - "pretrained_vectors": None, -} - -TEXTCAT_KWARGS = { - "width": 64, - "embed_size": 2000, - "pretrained_vectors": None, - "exclusive_classes": False, - "ngram_size": 1, - "window_size": 1, - "conv_depth": 2, - "dropout": None, - "nO": 7, -} - -TEXTCAT_CNN_KWARGS = { - "tok2vec": test_tok2vec(), - "exclusive_classes": False, - "nO": 13, -} + return build_Tok2Vec_model(**get_tok2vec_kwargs()) @pytest.mark.parametrize( "seed,model_func,kwargs", [ - (0, build_Tok2Vec_model, TOK2VEC_KWARGS), - (0, build_text_classifier, TEXTCAT_KWARGS), - (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS), + (0, build_Tok2Vec_model, get_tok2vec_kwargs()), + (0, build_text_classifier, get_textcat_kwargs()), + (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), ], ) def test_models_initialize_consistently(seed, model_func, kwargs): @@ -96,9 +100,9 @@ def test_models_initialize_consistently(seed, model_func, kwargs): @pytest.mark.parametrize( "seed,model_func,kwargs,get_X", [ - (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), - (0, build_text_classifier, TEXTCAT_KWARGS, get_docs), - (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), + (0, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), 
], ) def test_models_predict_consistently(seed, model_func, kwargs, get_X): @@ -131,9 +135,9 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): @pytest.mark.parametrize( "seed,dropout,model_func,kwargs,get_X", [ - (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), - (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs), - (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), + (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 32f4c5774..76b5e64df 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,6 +1,8 @@ import pytest from spacy.ml.models.tok2vec import build_Tok2Vec_model +from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed +from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder from spacy.vocab import Vocab from spacy.tokens import Doc @@ -13,18 +15,18 @@ def test_empty_doc(): vocab = Vocab() doc = Doc(vocab, words=[]) tok2vec = build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors=None, - conv_depth=4, - bilstm_depth=0, - window_size=1, - maxout_pieces=3, - subword_features=True, - char_embed=False, - nM=64, - nC=8, - dropout=None, + MultiHashEmbed( + width=width, + rows=embed_size, + also_use_static_vectors=False, + also_embed_subwords=True + ), + MaxoutWindowEncoder( + width=width, + depth=4, + window_size=1, + maxout_pieces=3 + ) ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update([doc]) @@ -38,18 +40,18 @@ def test_empty_doc(): def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) tok2vec = build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors=None, - conv_depth=4, - bilstm_depth=0, - window_size=1, - maxout_pieces=3, - subword_features=True, - char_embed=False, - nM=64, - nC=8, - dropout=None, + MultiHashEmbed( + width=width, + rows=embed_size, + also_use_static_vectors=False, + also_embed_subwords=True + ), + MaxoutWindowEncoder( + width=width, + depth=4, + window_size=1, + maxout_pieces=3, + ) ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) @@ -60,24 +62,25 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): # fmt: off @pytest.mark.parametrize( - "tok2vec_config", + "width,embed_arch,embed_config,encode_arch,encode_config", [ - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, 
"nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on -def test_tok2vec_configs(tok2vec_config): +def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): + embed_config["width"] = width + encode_config["width"] = width docs = get_batch(3) - tok2vec = build_Tok2Vec_model(**tok2vec_config) + tok2vec = build_Tok2Vec_model( + embed_arch(**embed_config), + encode_arch(**encode_config) + ) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) - assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"]) + assert vectors[0].shape == (len(docs[0]), width) backprop(vectors) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 793bb5a25..bffbf5829 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -50,7 +50,6 @@ cdef class Tokenizer: recognised as tokens. url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. - RETURNS (Tokenizer): The newly constructed object. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) @@ -729,7 +728,7 @@ cdef class Tokenizer: with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -742,10 +741,10 @@ cdef class Tokenizer: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **kwargs) + self.from_bytes(bytes_data, exclude=exclude) return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -764,7 +763,7 @@ cdef class Tokenizer: } return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. 
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 3943767a0..b89ce3bdd 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): """Retokenize the document, such that the token at `doc[token_index]` is split into tokens with the orth 'orths' token_index(int): token index of the token to split. + orths: IDs of the verbatim text content of the tokens to create **attributes: Attributes to assign to each of the newly created tokens. By default, attributes are inherited from the original token. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 96245a0e1..192067ed4 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,10 +1,12 @@ +from typing import Iterable, Iterator import numpy import zlib import srsly from thinc.api import NumpyOps +from .doc import Doc +from ..vocab import Vocab from ..compat import copy_reg -from ..tokens import Doc from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors @@ -44,13 +46,18 @@ class DocBin: document from the DocBin. """ - def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): + def __init__( + self, + attrs: Iterable[str] = ALL_ATTRS, + store_user_data: bool = False, + docs: Iterable[Doc] = tuple(), + ) -> None: """Create a DocBin object to hold serialized annotations. - attrs (list): List of attributes to serialize. 'orth' and 'spacy' are - always serialized, so they're not required. Defaults to None. + attrs (Iterable[str]): List of attributes to serialize. 'orth' and + 'spacy' are always serialized, so they're not required. store_user_data (bool): Whether to include the `Doc.user_data`. - RETURNS (DocBin): The newly constructed object. + docs (Iterable[Doc]): Docs to add. DOCS: https://spacy.io/api/docbin#init """ @@ -68,11 +75,11 @@ class DocBin: for doc in docs: self.add(doc) - def __len__(self): + def __len__(self) -> int: """RETURNS: The number of Doc objects added to the DocBin.""" return len(self.tokens) - def add(self, doc): + def add(self, doc: Doc) -> None: """Add a Doc's annotations to the DocBin for serialization. doc (Doc): The Doc object to add. @@ -100,7 +107,7 @@ class DocBin: if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) - def get_docs(self, vocab): + def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. vocab (Vocab): The shared vocab. @@ -125,7 +132,7 @@ class DocBin: doc.user_data.update(user_data) yield doc - def merge(self, other): + def merge(self, other: "DocBin") -> None: """Extend the annotations of this DocBin with the annotations from another. Will raise an error if the pre-defined attrs of the two DocBins don't match. @@ -144,7 +151,7 @@ class DocBin: if self.store_user_data: self.user_data.extend(other.user_data) - def to_bytes(self): + def to_bytes(self) -> bytes: """Serialize the DocBin's annotations to a bytestring. RETURNS (bytes): The serialized DocBin. 
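As a rough usage sketch of the `DocBin` API annotated above (the `docs` constructor argument is the new addition; the attrs and texts are illustrative):

```python
# Round-trip annotations through DocBin; ORTH and SPACY are always serialized,
# so they don't need to be listed in attrs.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Apple is looking at buying a startup."), nlp("Another document.")]

doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], store_user_data=False, docs=docs)
doc_bin.add(nlp("One more document."))    # docs can also be added after construction

data = doc_bin.to_bytes()                 # zlib-compressed msgpack bytestring
restored = DocBin().from_bytes(data)
for doc in restored.get_docs(nlp.vocab):  # Doc objects are recreated on demand
    print([token.text for token in doc])
```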
@@ -156,7 +163,6 @@ class DocBin: lengths = [len(tokens) for tokens in self.tokens] tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) - msg = { "version": self.version, "attrs": self.attrs, @@ -171,7 +177,7 @@ class DocBin: msg["user_data"] = self.user_data return zlib.compress(srsly.msgpack_dumps(msg)) - def from_bytes(self, bytes_data): + def from_bytes(self, bytes_data: bytes) -> "DocBin": """Deserialize the DocBin's annotations from a bytestring. bytes_data (bytes): The data to load from. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index adc7059e5..2fcc0983b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -173,7 +173,6 @@ cdef class Doc: words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. DOCS: https://spacy.io/api/doc#init """ @@ -988,20 +987,20 @@ cdef class Doc: other.c = &tokens[PADDING] return other - def to_disk(self, path, **kwargs): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes(**kwargs)) + file_.write(self.to_bytes(exclude=exclude)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -1015,9 +1014,9 @@ cdef class Doc: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - return self.from_bytes(bytes_data, **kwargs) + return self.from_bytes(bytes_data, exclude=exclude) - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, *, exclude=tuple()): """Serialize, i.e. export the document contents to a binary string. exclude (list): String names of serialization fields to exclude. @@ -1026,9 +1025,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + return srsly.msgpack_dumps(self.to_dict(exclude=exclude)) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -1037,13 +1036,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#from_bytes """ - return self.from_dict( - srsly.msgpack_loads(bytes_data), - exclude=exclude, - **kwargs - ) + return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude) - def to_dict(self, exclude=tuple(), **kwargs): + def to_dict(self, *, exclude=tuple()): """Export the document contents to a dictionary for serialization. exclude (list): String names of serialization fields to exclude. @@ -1091,14 +1086,14 @@ cdef class Doc: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_dict(serializers, exclude) - def from_dict(self, msg, exclude=tuple(), **kwargs): + def from_dict(self, msg, *, exclude=tuple()): """Deserialize, i.e. 
import the document contents from a binary string. data (bytes): The string to load from. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. - DOCS: https://spacy.io/api/doc#from_bytes + DOCS: https://spacy.io/api/doc#from_dict """ if self.length != 0: raise ValueError(Errors.E033.format(length=self.length)) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 203308749..5b55d8e88 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -94,7 +94,6 @@ cdef class Span: kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. - RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/span#init """ diff --git a/spacy/util.py b/spacy/util.py index d1951145f..677f5e8e0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 +7,7 @@ import importlib.util import re from pathlib import Path import thinc -from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer +from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model import functools import itertools import numpy.random @@ -24,6 +24,8 @@ import tempfile import shutil import shlex import inspect +from thinc.types import Unserializable + try: import cupy.random @@ -187,6 +189,20 @@ def get_module_path(module: ModuleType) -> Path: return Path(sys.modules[module.__module__].__file__).parent +def load_vectors_into_model( + nlp: "Language", name: Union[str, Path], *, add_strings=True +) -> None: + """Load word vectors from an installed model or path into a model instance.""" + vectors_nlp = load_model(name) + nlp.vocab.vectors = vectors_nlp.vocab.vectors + if add_strings: + # I guess we should add the strings from the vectors_nlp model? + # E.g. if someone does a similarity query, they might expect the strings. + for key in nlp.vocab.vectors.key2row: + if key in vectors_nlp.vocab.strings: + nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) + + def load_model( name: Union[str, Path], disable: Iterable[str] = tuple(), @@ -1184,22 +1200,6 @@ class DummyTokenizer: return self -def link_vectors_to_models(vocab: "Vocab") -> None: - vectors = vocab.vectors - if vectors.name is None: - vectors.name = VECTORS_KEY - if vectors.data.size != 0: - warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) - for word in vocab: - if word.orth in vectors.key2row: - word.rank = vectors.key2row[word.orth] - else: - word.rank = 0 - - -VECTORS_KEY = "spacy_pretrained_vectors" - - def create_default_optimizer() -> Optimizer: # TODO: Do we still want to allow env_opt? learn_rate = env_opt("learn_rate", 0.001) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 0cc7409a7..bcea87e67 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -58,7 +58,6 @@ cdef class Vectors: data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. - RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init """ diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2115789e6..ce95786f2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,7 +16,7 @@ from .errors import Errors from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors -from .util import link_vectors_to_models, registry +from .util import registry from .lookups import Lookups, load_lookups from . 
import util from .lang.norm_exceptions import BASE_NORMS @@ -74,7 +74,6 @@ cdef class Vocab: lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. - RETURNS (Vocab): The newly constructed object. """ lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): @@ -345,7 +344,6 @@ cdef class Vocab: synonym = self.strings[syn_keys[i][0]] score = scores[i][0] remap[word] = (synonym, score) - link_vectors_to_models(self) return remap def get_vector(self, orth, minn=None, maxn=None): @@ -440,7 +438,7 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if @@ -460,7 +458,7 @@ cdef class Vocab: if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -477,8 +475,6 @@ cdef class Vocab: if "vectors" not in exclude: if self.vectors is not None: self.vectors.from_disk(path, exclude=["strings"]) - if self.vectors.name is not None: - link_vectors_to_models(self) if "lookups" not in exclude: self.lookups.from_disk(path) if "lexeme_norm" in self.lookups: @@ -489,7 +485,7 @@ cdef class Vocab: self._by_orth = PreshMap() return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -510,7 +506,7 @@ cdef class Vocab: } return util.to_bytes(getters, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. @@ -538,8 +534,6 @@ cdef class Vocab: ) self.length = 0 self._by_orth = PreshMap() - if self.vectors.name is not None: - link_vectors_to_models(self) return self def _reset_cache(self, keys, strings): diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index abc2b7bfa..95f7d0597 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library source: spacy/ml/models menu: - ['Tok2Vec', 'tok2vec'] + - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] - ['Text Classification', 'textcat'] - ['Entity Linking', 'entitylinker'] @@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to [`registry`](/api/top-level#registry), [custom models](/usage/training#custom-models) usage etc. -## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}} +## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"} ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} @@ -21,12 +22,61 @@ TODO: intro and how architectures work, link to ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} +## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} + +The following architectures are provided by the package +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). 
See the +[usage documentation](/usage/transformers) for how to integrate the +architectures into your training config. + +### spacy-transformers.TransformerModel.v1 {#TransformerModel} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy-transformers.TransformerModel.v1" +> name = "roberta-base" +> tokenizer_config = {"use_fast": true} +> +> [model.get_spans] +> @span_getters = "strided_spans.v1" +> window = 128 +> stride = 96 +> ``` + +| Name | Type | Description | +| ------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). | +| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. | +| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). | + +### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener} + + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy-transformers.Tok2VecListener.v1" +> grad_factor = 1.0 +> +> [model.pooling] +> @layers = "reduce_mean.v1" +> ``` + +| Name | Type | Description | +| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- | +| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. | +| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. | + ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser} - - > #### Example Config > > ```ini diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 3256849c3..38e19129d 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format. Create a `Corpus`. The input data can be a file or a directory of files. -| Name | Type | Description | -| ----------- | ------------ | ---------------------------------------------------------------- | -| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | -| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | -| `limit` | int | Maximum number of examples returned. | -| **RETURNS** | `Corpus` | The newly constructed object. 
| +> #### Example +> +> ```python +> from spacy.gold import Corpus +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> ``` - - -## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} - -## Corpus.make_examples {#make_examples tag="method"} - -## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} - -## Corpus.read_docbin {#read_docbin tag="method"} - -## Corpus.count_train {#count_train tag="method"} +| Name | Type | Description | +| ------- | ------------ | ---------------------------------------------------------------- | +| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | +| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | +| `limit` | int | Maximum number of examples returned. `0` for no limit (default). | ## Corpus.train_dataset {#train_dataset tag="method"} +Yield examples from the training data. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> train_data = corpus.train_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default).  | +| **YIELDS** | `Example` | The examples. | + ## Corpus.dev_dataset {#dev_dataset tag="method"} + +Yield examples from the development data. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> dev_data = corpus.dev_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ---------------------------------------------------------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| **YIELDS** | `Example` | The examples. | + +## Corpus.count_train {#count_train tag="method"} + +Get the word count of all training examples. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> word_count = corpus.count_train(nlp) +> ``` + +| Name | Type | Description | +| ----------- | ---------- | ------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| **RETURNS** | int | The word count. | + + diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 9dea04284..6e54fb112 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -87,13 +87,12 @@ Create a `Token` object from a `TokenC*` pointer. > token = Token.cinit(&doc.c[3], doc, 3) > ``` -| Name | Type | Description | -| ----------- | --------- | ------------------------------------------------------------ | -| `vocab` | `Vocab` | A reference to the shared `Vocab`. 
| -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | -| `offset` | `int` | The offset of the token within the document. | -| `doc` | `Doc` | The parent document. | -| **RETURNS** | `Token` | The newly constructed object. | +| Name | Type | Description | +| -------- | --------- | ------------------------------------------------------------ | +| `vocab` | `Vocab` | A reference to the shared `Vocab`. | +| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | +| `offset` | `int` | The offset of the token within the document. | +| `doc` | `Doc` | The parent document. | ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 425b669ce..a18e9e582 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and ## DependencyParser.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -290,10 +290,11 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -306,11 +307,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > parser.from_disk("/path/to/parser") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | ## DependencyParser.to_bytes {#to_bytes tag="method"} @@ -323,10 +325,11 @@ Load the pipe from disk. Modifies the object in place and returns it. 
Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | ## DependencyParser.from_bytes {#from_bytes tag="method"} @@ -340,11 +343,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > parser.from_bytes(parser_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | ## DependencyParser.labels {#labels tag="property"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index b5871f2ab..a9499f6d4 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,12 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `words` | iterable | A list of strings to add to the container. | -| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | -| **RETURNS** | `Doc` | The newly constructed object. | +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `words` | iterable | A list of strings to add to the container. | +| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} @@ -386,10 +385,11 @@ Save the current state to a directory. 
> doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -403,11 +403,12 @@ Loads state from a directory. Modifies the object in place and returns it. > doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -420,10 +421,11 @@ Serialize, i.e. export the document contents to a binary string. > doc_bytes = doc.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | ## Doc.from_bytes {#from_bytes tag="method"} @@ -439,11 +441,12 @@ Deserialize, i.e. import the document contents from a binary string. > assert doc.text == doc2.text > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `data` | bytes | The string to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `data` | bytes | The string to load from. 
| +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The `Doc` object. | ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 07f95f91d..65d1153d1 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations. > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) > ``` -| Argument | Type | Description | -| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | -| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | -| **RETURNS** | `DocBin` | The newly constructed object. | +| Argument | Type | Description | +| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | +| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | +| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. | ## DocBin.\_\len\_\_ {#len tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b2b1eec32..2a1ba94d2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). @@ -265,10 +265,11 @@ Serialize the pipe to disk. > entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -281,11 +282,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 63404e087..b5b549a04 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and ## EntityRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -289,10 +289,11 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -305,11 +306,12 @@ Load the pipe from disk. Modifies the object in place and returns it. 
> ner.from_disk("/path/to/ner") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | ## EntityRecognizer.to_bytes {#to_bytes tag="method"} @@ -322,10 +324,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | ## EntityRecognizer.from_bytes {#from_bytes tag="method"} @@ -339,11 +342,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ner.from_bytes(ner_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | ## EntityRecognizer.labels {#labels tag="property"} diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 0d06c79a1..e6299fc31 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -37,7 +37,6 @@ both documents. | `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | | _keyword-only_ | | | | `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | -| **RETURNS** | `Example` | The newly constructed object. 
| ## Example.from_dict {#from_dict tag="classmethod"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index f088815fd..7b2c4edf4 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -27,11 +27,10 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ---------------------- | --------------- | ---------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. | -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. | +| Name | Type | Description | +| ---------------------- | ------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} @@ -255,7 +254,6 @@ but instead these objects are returned by the | `entity_freq` | float | The entity frequency as recorded in the KB. | | `alias_hash` | int | The hash of the textual mention or alias. | | `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | -| **RETURNS** | `Candidate` | The newly constructed object. | ## Candidate attributes {#candidate_attributes} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index d685c014b..7e25106d1 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add your own processing pipeline components that take a `Doc` object, modify it and return it. +## Language.\_\_init\_\_ {#init tag="method"} + +Initialize a `Language` object. + +> #### Example +> +> ```python +> # Construction from subclass +> from spacy.lang.en import English +> nlp = English() +> +> # Construction from scratch +> from spacy.vocab import Vocab +> from spacy.language import Language +> nlp = Language(Vocab()) +> ``` + +| Name | Type | Description | +| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | +| _keyword-only_ | | | +| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | +| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | +| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | + +## Language.from_config {#from_config tag="classmethod"} + +Create a `Language` object from a loaded config. Will set up the tokenizer and +language data, add pipeline components based on the pipeline and components +define in the config and validate the results. If no config is provided, the +default config of the given language is used. This is also how spaCy loads a +model under the hood based on its [`config.cfg`](/api/data-formats#config). 
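A hedged sketch of passing the keyword-only options documented below to `Language.from_config`; the config path and the disabled component name are placeholders:

```python
# Hypothetical call showing the keyword-only arguments of Language.from_config;
# "./config.cfg" and "parser" are placeholders.
from thinc.api import Config
from spacy.language import Language

config = Config().from_disk("./config.cfg")
nlp = Language.from_config(
    config,
    disable=["parser"],  # pipeline component names to skip
    auto_fill=True,      # fill missing config values from defaults/annotations
    validate=True,       # validate component configs against their factories
)
```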
+ +> #### Example +> +> ```python +> from thinc.api import Config +> from spacy.language import Language +> +> config = Config().from_disk("./config.cfg") +> nlp = Language.from_config(config) +> ``` + +| Name | Type | Description | +| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | +| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | +| _keyword-only_ | | +| `disable` | `Iterable[str]` | List of pipeline component names to disable. | +| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | +| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | +| **RETURNS** | `Language` | The initialized object. | + ## Language.component {#component tag="classmethod" new="3"} Register a custom pipeline component under a given name. This allows @@ -101,57 +153,6 @@ examples, see the | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `func` | `Optional[Callable]` | Optional function if not used a a decorator. | -## Language.\_\_init\_\_ {#init tag="method"} - -Initialize a `Language` object. - -> #### Example -> -> ```python -> from spacy.vocab import Vocab -> from spacy.language import Language -> nlp = Language(Vocab()) -> -> from spacy.lang.en import English -> nlp = English() -> ``` - -| Name | Type | Description | -| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | -| _keyword-only_ | | | -| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | -| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | -| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | -| **RETURNS** | `Language` | The newly constructed object. | - -## Language.from_config {#from_config tag="classmethod"} - -Create a `Language` object from a loaded config. Will set up the tokenizer and -language data, add pipeline components based on the pipeline and components -define in the config and validate the results. If no config is provided, the -default config of the given language is used. This is also how spaCy loads a -model under the hood based on its [`config.cfg`](/api/data-formats#config). 
- -> #### Example -> -> ```python -> from thinc.api import Config -> from spacy.language import Language -> -> config = Config().from_disk("./config.cfg") -> nlp = Language.from_config(config) -> ``` - -| Name | Type | Description | -| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | -| _keyword-only_ | | -| `disable` | `Iterable[str]` | List of pipeline component names to disable. | -| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | -| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** | `Language` | The initialized object. | - ## Language.\_\_call\_\_ {#call tag="method"} Apply the pipeline to some text. The text can span multiple sentences, and can @@ -164,11 +165,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ----------- | --------------------------------------------------------------------------------- | -| `text` | str | The text to be processed. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. | +| Name | Type | Description | +| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | +| `text` | str | The text to be processed. | +| _keyword-only_ | | | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. | ## Language.pipe {#pipe tag="method"} @@ -183,15 +186,57 @@ more efficient than processing texts one-by-one. > assert doc.is_parsed > ``` -| Name | Type | Description | -| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | `Iterable[str]` | A sequence of strings. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. 
| +| Name | Type | Description | +| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | `Iterable[str]` | A sequence of strings. | +| _keyword-only_ | | | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | +| **YIELDS** | `Doc` | Documents in the order of the original text. | + +## Language.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. Returns an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. + +> #### Example +> +> ```python +> optimizer = nlp.begin_training(get_examples) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Language.resume_training {#resume_training tag="method,experimental" new="3"} + +Continue training a pretrained model. Create and return an optimizer, and +initialize "rehearsal" for any pipeline component that has a `rehearse` method. +Rehearsal is used to prevent models from "forgetting" their initialized +"knowledge". To perform rehearsal, collect samples of text you want the models +to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with +a batch of [Example](/api/example) objects. + +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | ## Language.update {#update tag="method"} @@ -206,15 +251,37 @@ Update the models in the pipeline. 
> nlp.update([example], sgd=optimizer) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | -| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Type | Description | +| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | + +## Language.rehearse {#rehearse tag="method,experimental"} + +Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the +current model to make predictions similar to an initial model, to try to address +the "catastrophic forgetting" problem. This feature is experimental. + +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> losses = nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Language.evaluate {#evaluate tag="method"} @@ -227,33 +294,15 @@ Evaluate a model's pipeline components. > print(scores) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. 
| - -## Language.begin_training {#begin_training tag="method"} - -Allocate models, pre-process training data and acquire an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). - -> #### Example -> -> ```python -> optimizer = nlp.begin_training(get_examples) -> ``` - -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ | -| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | -| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `**cfg` | - | Config parameters (sent to all components). | -| **RETURNS** | `Optimizer` | An optimizer. | +| Name | Type | Description | +| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `verbose` | bool | Print debugging information. | +| `batch_size` | int | The batch size to use. | +| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | ## Language.use_params {#use_params tag="contextmanager, method"} @@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use | ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory_name` | str | Name of the registered component factory. | | `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | +| _keyword-only_ | | | | `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | | `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | **RETURNS** | callable | The pipeline component. | @@ -418,10 +468,13 @@ Replace a component in the pipeline. > nlp.replace_pipe("parser", my_custom_parser) > ``` -| Name | Type | Description | -| ----------- | -------- | --------------------------------- | -| `name` | str | Name of the component to replace. | -| `component` | callable | The pipeline component to insert. | +| Name | Type | Description | +| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | Name of the component to replace. | +| `component` | callable | The pipeline component to insert. | +| _keyword-only_ | | | +| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for the new component. 
Will be merged with the `default_config` specified by the component factory. | +| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -492,11 +545,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------------------ | -| `disable` | str / list | Name(s) of pipeline components to disable. | -| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------------------ | +| _keyword-only_ | | | +| `disable` | str / list | Name(s) of pipeline components to disable. | +| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | +| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -591,10 +645,11 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | ## Language.from_disk {#from_disk tag="method" new="2"} @@ -616,11 +671,12 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ------------ | ----------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | ## Language.to_bytes {#to_bytes tag="method"} @@ -632,10 +688,11 @@ Serialize the current state to a binary string. 
> nlp_bytes = nlp.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------------------------------------------- | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Language` object. | ## Language.from_bytes {#from_bytes tag="method"} @@ -653,11 +710,12 @@ available to the loaded object. > nlp2.from_bytes(nlp_bytes) > ``` -| Name | Type | Description | -| ------------ | ---------- | ----------------------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The `Language` object. | ## Attributes {#attributes} @@ -767,8 +825,8 @@ serialization by passing in the string names via the `exclude` argument. The `FactoryMeta` contains the information about the component and its default provided by the [`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory) decorator. It's created whenever a -component is added to the pipeline and stored on the `Language` class for each -component instance and factory instance. +component is defined and stored on the `Language` class for each component +instance and factory instance. | Name | Type | Description | | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 237bfa468..73f8aa71f 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized. | Name | Type | Description | | -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | -| **RETURNS** | `Lemmatizer` | The newly created object. 
| ## Lemmatizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index b39664a55..625a26412 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -13,11 +13,10 @@ lemmatization depends on the part-of-speech tag). Create a `Lexeme` object. -| Name | Type | Description | -| ----------- | -------- | ----------------------------- | -| `vocab` | `Vocab` | The parent vocabulary. | -| `orth` | int | The orth id of the lexeme. | -| **RETURNS** | `Lexeme` | The newly constructed object. | +| Name | Type | Description | +| ------- | ------- | -------------------------- | +| `vocab` | `Vocab` | The parent vocabulary. | +| `orth` | int | The orth id of the lexeme. | ## Lexeme.set_flag {#set_flag tag="method"} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index b91d92646..099b5306e 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -236,10 +236,9 @@ Initialize a new table. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ----------- | ------- | ---------------------------------- | -| `name` | str | Optional table name for reference. | -| **RETURNS** | `Table` | The newly constructed object. | +| Name | Type | Description | +| ------ | ---- | ---------------------------------- | +| `name` | str | Optional table name for reference. | ### Table.from_dict {#table.from_dict tag="classmethod"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c59a58c81..925c9ad2e 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -19,11 +19,10 @@ string where an integer is expected) or unexpected property names. > matcher = Matcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `validate` 2.1 | bool | Validate all patterns added to this matcher. | -| **RETURNS** | `Matcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `validate` 2.1 | bool | Validate all patterns added to this matcher. | ## Matcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md index 5c2356ad9..4df9a3f7f 100644 --- a/website/docs/api/morphanalysis.md +++ b/website/docs/api/morphanalysis.md @@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx Stores a single morphological analysis. - ## MorphAnalysis.\_\_init\_\_ {#init tag="method"} Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of @@ -16,17 +15,15 @@ morphological features. > > ```python > from spacy.tokens import MorphAnalysis -> +> > feats = "Feat1=Val1|Feat2=Val2" > m = MorphAnalysis(nlp.vocab, feats) > ``` -| Name | Type | Description | -| ----------- | ------------------ | ----------------------------- | -| `vocab` | `Vocab` | The vocab. | -| `features` | `Union[Dict, str]` | The morphological features. | -| **RETURNS** | `MorphAnalysis` | The newly constructed object. 
| - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `vocab` | `Vocab` | The vocab. | +| `features` | `Union[Dict, str]` | The morphological features. | ## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} @@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis. | ----------- | ----- | ------------------------------------- | | **RETURNS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} Iterate over the feature/value pairs in the analysis. @@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis. | ---------- | ----- | ------------------------------------- | | **YIELDS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_len\_\_ {#len tag="method"} Returns the number of features in the analysis. @@ -78,7 +73,6 @@ Returns the number of features in the analysis. | ----------- | ----- | --------------------------------------- | | **RETURNS** | `int` | The number of features in the analysis. | - ## MorphAnalysis.\_\_str\_\_ {#str tag="method"} Returns the morphological analysis in the UD FEATS string format. @@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format. > ``` | Name | Type | Description | -| ----------- | ----- | ---------------------------------| +| ----------- | ----- | -------------------------------- | | **RETURNS** | `str` | The analysis in UD FEATS format. | - ## MorphAnalysis.get {#get tag="method"} Retrieve values for a feature by field. @@ -108,11 +101,10 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Type | Description | -| ----------- | ------ | ----------------------------------- | -| `field` | `str` | The field to retrieve. | -| **RETURNS** | `list` | A list of the individual features. | - +| Name | Type | Description | +| ----------- | ------ | ---------------------------------- | +| `field` | `str` | The field to retrieve. | +| **RETURNS** | `list` | A list of the individual features. | ## MorphAnalysis.to_dict {#to_dict tag="method"} @@ -128,10 +120,9 @@ map. > ``` | Name | Type | Description | -| ----------- | ------ | -----------------------------------------| +| ----------- | ------ | ---------------------------------------- | | **RETURNS** | `dict` | The dict representation of the analysis. | - ## MorphAnalysis.from_id {#from_id tag="classmethod"} Create a morphological analysis from a given hash ID. @@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID. | ------- | ------- | -------------------------------- | | `vocab` | `Vocab` | The vocab. | | `key` | `int` | The hash of the features string. | - - diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 8ac300de3..ac7146543 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and ## Morphologizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -276,10 +276,11 @@ Serialize the pipe to disk. 
> morphologizer.to_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Morphologizer.from_disk {#from_disk tag="method"} @@ -292,11 +293,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > morphologizer.from_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | ## Morphologizer.to_bytes {#to_bytes tag="method"} @@ -309,10 +311,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | ## Morphologizer.from_bytes {#from_bytes tag="method"} @@ -326,11 +329,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > morphologizer.from_bytes(morphologizer_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The `Morphologizer` object. 
| +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Morphologizer` | The `Morphologizer` object. | ## Morphologizer.labels {#labels tag="property"} diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index ad279bff7..8fb89c15f 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -4,12 +4,11 @@ tag: class source: spacy/morphology.pyx --- -Store the possible morphological analyses for a language, and index them -by hash. To save space on each token, tokens only know the hash of their +Store the possible morphological analyses for a language, and index them by +hash. To save space on each token, tokens only know the hash of their morphological analysis, so queries of morphological attributes are delegated to this class. - ## Morphology.\_\_init\_\_ {#init tag="method"} Create a Morphology object using the tag map, lemmatizer and exceptions. @@ -22,21 +21,18 @@ Create a Morphology object using the tag map, lemmatizer and exceptions. > morphology = Morphology(strings, tag_map, lemmatizer) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `strings` | `StringStore` | The string store. | -| `tag_map` | `Dict[str, Dict]` | The tag map. | -| `lemmatizer`| `Lemmatizer` | The lemmatizer. | -| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | -| **RETURNS** | `Morphology` | The newly constructed object. | - +| Name | Type | Description | +| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- | +| `strings` | `StringStore` | The string store. | +| `tag_map` | `Dict[str, Dict]` | The tag map. | +| `lemmatizer` | `Lemmatizer` | The lemmatizer. | +| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | ## Morphology.add {#add tag="method"} -Insert a morphological analysis in the morphology table, if not already -present. The morphological analysis may be provided in the UD FEATS format as a -string or in the tag map dictionary format. Returns the hash of the new -analysis. +Insert a morphological analysis in the morphology table, if not already present. +The morphological analysis may be provided in the UD FEATS format as a string or +in the tag map dictionary format. Returns the hash of the new analysis. > #### Example > @@ -46,10 +42,9 @@ analysis. > assert hash == nlp.vocab.strings[feats] > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------- | -| `features` | `Union[Dict, str]` | The morphological features. | - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `features` | `Union[Dict, str]` | The morphological features. | ## Morphology.get {#get tag="method"} @@ -63,33 +58,30 @@ analysis. Get the FEATS string for the hash of the morphological analysis. 
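
A small sketch of the `add`/`get` round trip described above, assuming the shared table is reachable as `nlp.vocab.morphology` (that attribute path is an assumption, not stated in this diff); the FEATS string is the illustrative one used in the examples:

```python
import spacy
from spacy.morphology import Morphology

# Assumption: a blank English pipeline exposes the shared Morphology table as
# nlp.vocab.morphology. The FEATS string matches the examples above.
nlp = spacy.blank("en")
morphology = nlp.vocab.morphology

feats = "Feat1=Val1|Feat2=Val2"
key = morphology.add(feats)             # returns the hash of the analysis
assert key == nlp.vocab.strings[feats]  # as in the Morphology.add example
print(morphology.get(key))              # the analysis back in FEATS format

# The static helper documented below converts FEATS strings to dicts.
assert Morphology.feats_to_dict(feats) == {"Feat1": "Val1", "Feat2": "Val2"}
```
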
-| Name | Type | Description | -| ----------- | ------ | --------------------------------------- | -| `morph` | int | The hash of the morphological analysis. | - +| Name | Type | Description | +| ------- | ---- | --------------------------------------- | +| `morph` | int | The hash of the morphological analysis. | ## Morphology.load_tag_map {#load_tag_map tag="method"} Replace the current tag map with the provided tag map. -| Name | Type | Description | -| ----------- | ------------------ | ------------ | -| `tag_map` | `Dict[str, Dict]` | The tag map. | - +| Name | Type | Description | +| --------- | ----------------- | ------------ | +| `tag_map` | `Dict[str, Dict]` | The tag map. | ## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} Replace the current morphological exceptions with the provided exceptions. -| Name | Type | Description | -| ------------- | ------------------ | ----------------------------- | -| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | - +| Name | Type | Description | +| ------------- | ----------------- | ----------------------------- | +| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | ## Morphology.add_special_case {#add_special_case tag="method"} -Add a special-case rule to the morphological analyzer. Tokens whose tag and -orth match the rule will receive the specified properties. +Add a special-case rule to the morphological analyzer. Tokens whose tag and orth +match the rule will receive the specified properties. > #### Example > @@ -98,27 +90,24 @@ orth match the rule will receive the specified properties. > morphology.add_special_case("DT", "the", attrs) > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------- | -| `tag_str` | str | The fine-grained tag. | -| `orth_str` | str | The token text. | -| `attrs` | dict | The features to assign for this token and tag. | - +| Name | Type | Description | +| ---------- | ---- | ---------------------------------------------- | +| `tag_str` | str | The fine-grained tag. | +| `orth_str` | str | The token text. | +| `attrs` | dict | The features to assign for this token and tag. | ## Morphology.exc {#exc tag="property"} The current morphological exceptions. -| Name | Type | Description | -| ---------- | ----- | --------------------------------------------------- | -| **YIELDS** | dict | The current dictionary of morphological exceptions. | - +| Name | Type | Description | +| ---------- | ---- | --------------------------------------------------- | +| **YIELDS** | dict | The current dictionary of morphological exceptions. | ## Morphology.lemmatize {#lemmatize tag="method"} TODO - ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} Convert a string FEATS representation to a dictionary of features and values in @@ -132,11 +121,10 @@ the same format as the tag map. > assert d == {"Feat1": "Val1", "Feat2": "Val2"} > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------------------- | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------------------ | | `feats` | str | The morphological features in Universal Dependencies FEATS format. | -| **RETURNS** | dict | The morphological features as a dictionary. | - +| **RETURNS** | dict | The morphological features as a dictionary. 
| ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} @@ -150,12 +138,11 @@ Convert a dictionary of features and values to a string FEATS representation. > assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Type | Description | +| Name | Type | Description | | ------------ | ----------------- | --------------------------------------------------------------------- | | `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | | **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | - ## Attributes {#attributes} | Name | Type | Description | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 991016094..866aca096 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -35,12 +35,11 @@ be shown. > matcher = PhraseMatcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | -| `validate` 2.1 | bool | Validate patterns added to the matcher. | -| **RETURNS** | `PhraseMatcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | +| `validate` 2.1 | bool | Validate patterns added to the matcher. | ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index c03a1b4da..99d06c79f 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and ## Pipe.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = pipe.rehearse(examples, sgd=optimizer) > ``` @@ -306,10 +306,11 @@ Serialize the pipe to disk. > pipe.to_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Pipe.from_disk {#from_disk tag="method"} @@ -322,11 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > pipe.from_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The modified pipe. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The modified pipe. | ## Pipe.to_bytes {#to_bytes tag="method"} @@ -339,10 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the pipe. | ## Pipe.from_bytes {#from_bytes tag="method"} @@ -356,11 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > pipe.from_bytes(pipe_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The pipe. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 1798f293e..52be3f6c7 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -27,10 +27,9 @@ Create a new `Scorer`. 
> scorer = Scorer(nlp) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | -| **RETURNS** | `Scorer` | The newly created object. | +| Name | Type | Description | +| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | ## Scorer.score {#score tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 2c0944b1f..fdc950bb0 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the ## SentenceRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = senter.rehearse(examples, sgd=optimizer) > ``` @@ -291,10 +291,11 @@ Serialize the pipe to disk. > senter.to_disk("/path/to/senter") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## SentenceRecognizer.from_disk {#from_disk tag="method"} @@ -307,11 +308,12 @@ Load the pipe from disk. Modifies the object in place and returns it. 
> senter.from_disk("/path/to/senter") > ``` -| Name | Type | Description | -| ----------- | -------------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | -------------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | ## SentenceRecognizer.to_bytes {#to_bytes tag="method"} @@ -324,10 +326,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | ## SentenceRecognizer.from_bytes {#from_bytes tag="method"} @@ -341,11 +344,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > senter.from_bytes(senter_bytes) > ``` -| Name | Type | Description | -| ------------ | -------------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | -------------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 668013e76..9237b5538 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,15 +18,14 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. 
| -| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | -| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. | +| Name | Type | Description | +| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | +| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | ## Span.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index c00c59832..b66d755ed 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -19,10 +19,9 @@ Create the `StringStore`. > stringstore = StringStore(["apple", "orange"]) > ``` -| Name | Type | Description | -| ----------- | ------------- | ------------------------------------------ | -| `strings` | iterable | A sequence of strings to add to the store. | -| **RETURNS** | `StringStore` | The newly constructed object. | +| Name | Type | Description | +| --------- | -------- | ------------------------------------------ | +| `strings` | iterable | A sequence of strings to add to the store. | ## StringStore.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 351492aa9..37ef13453 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and ## Tagger.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = tagger.rehearse(examples, sgd=optimizer) > ``` @@ -307,10 +307,11 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -323,11 +324,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} @@ -340,10 +342,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tagger` object. | ## Tagger.from_bytes {#from_bytes tag="method"} @@ -357,11 +360,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tagger.from_bytes(tagger_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The `Tagger` object. 
| ## Tagger.labels {#labels tag="property"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index c4327dca7..1efd5831c 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and ## TextCategorizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = textcat.rehearse(examples, sgd=optimizer) > ``` @@ -325,10 +325,11 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -341,11 +342,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > textcat.from_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | ## TextCategorizer.to_bytes {#to_bytes tag="method"} @@ -358,10 +360,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. 
| +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | ## TextCategorizer.from_bytes {#from_bytes tag="method"} @@ -375,11 +378,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > textcat.from_bytes(textcat_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | ## TextCategorizer.labels {#labels tag="property"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 29f91afe6..f810793ce 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. ## Tok2Vec.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -227,10 +227,11 @@ Serialize the pipe to disk. > tok2vec.to_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tok2Vec.from_disk {#from_disk tag="method"} @@ -243,11 +244,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tok2vec.from_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. 
| +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | ## Tok2Vec.to_bytes {#to_bytes tag="method"} @@ -260,10 +262,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | ## Tok2Vec.from_bytes {#from_bytes tag="method"} @@ -277,11 +280,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tok2vec.from_bytes(tok2vec_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 1cb833089..ca6b57a5b 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -17,12 +17,11 @@ Construct a `Token` object. > assert token.text == "Give" > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `doc` | `Doc` | The parent document. | -| `offset` | int | The index of the token within the document. | -| **RETURNS** | `Token` | The newly constructed object. | +| Name | Type | Description | +| -------- | ------- | ------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `doc` | `Doc` | The parent document. | +| `offset` | int | The index of the token within the document. | ## Token.\_\_len\_\_ {#len tag="method"} @@ -393,73 +392,73 @@ The L2 norm of the token's vector representation. 
## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | str | Verbatim text content. | -| `text_with_ws` | str | Text content, with trailing space character if present. | -| `whitespace_` | str | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | str | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | str | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. | -| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | str | Transform of the tokens's string, to show orthographic features. 
Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | str | Fine-grained part-of-speech. | -| `morph` | `MorphAnalysis` | Morphological analysis. | -| `morph_` | str | Morphological analysis in UD FEATS format. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | str | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | str | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| +| Name | Type | Description | +| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | str | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | +| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | str | Transform of the tokens's string, to show orthographic features. 
Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Does the token have a word vector? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. | +| `morph` | `MorphAnalysis` | Morphological analysis. | +| `morph_` | str | Morphological analysis in UD FEATS format. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | str | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). 
| diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 47e5aa9b3..23b6e4f3f 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,16 +34,15 @@ the > tokenizer = nlp.tokenizer > ``` -| Name | Type | Description | -| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} @@ -159,10 +158,11 @@ Serialize the tokenizer to disk. > tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -175,11 +175,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it. 
> tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -192,10 +193,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. Serialize the tokenizer to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | ## Tokenizer.from_bytes {#from_bytes tag="method"} @@ -210,11 +212,12 @@ it. > tokenizer.from_bytes(tokenizer_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | ## Attributes {#attributes} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index a463441c7..ede7f9e21 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -304,6 +304,31 @@ factories. | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). | +### spacy-transformers registry {#registry-transformers} + +The following registries are added by the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. +See the [`Transformer`](/api/transformer) API reference and +[usage docs](/usage/transformers) for details. 
+
+> #### Example
+>
+> ```python
+> import spacy_transformers
+>
+> @spacy_transformers.registry.annotation_setters("my_annotation_setter.v1")
+> def configure_custom_annotation_setter():
+>     def annotation_setter(docs, trf_data) -> None:
+>         # Set annotations on the docs
+>         ...
+>
+>     return annotation_setter
+> ```
+
+| Registry name | Description |
+| ------------- | ----------- |
+| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. |
+| [`annotation_setters`](/api/transformer#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. |
+
 ## Training data and alignment {#gold source="spacy/gold"}
 
 ### gold.docs_to_json {#docs_to_json tag="function"}
diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
new file mode 100644
index 000000000..70128d225
--- /dev/null
+++ b/website/docs/api/transformer.md
@@ -0,0 +1,467 @@
+---
+title: Transformer
+teaser: Pipeline component for multi-task learning with transformer models
+tag: class
+source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
+new: 3
+api_base_class: /api/pipe
+api_string_name: transformer
+---
+
+> #### Installation
+>
+> ```bash
+> $ pip install spacy-transformers
+> ```
+
+This component is available via the extension package
+[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It
+exposes the component via entry points, so if you have the package installed,
+using `factory = "transformer"` in your
+[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will
+work out-of-the-box.
+
+This pipeline component lets you use transformer models in your pipeline. The
+component assigns the output of the transformer to the `Doc`'s extension
+attributes. We also calculate an alignment between the word-piece tokens and
+the spaCy tokenization, so that we can use the last hidden states to set the
+`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy
+token, the spaCy token receives the sum of their values. To access the values,
+you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The
+package also adds the function registries [`@span_getters`](#span_getters) and
+[`@annotation_setters`](#annotation_setters) with several built-in registered
+functions. For more details, see the [usage documentation](/usage/transformers).
+
+## Config and implementation {#config}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures) documentation for details on the
+architectures and their arguments and hyperparameters.
+ +> #### Example +> +> ```python +> from spacy_transformers import Transformer, DEFAULT_CONFIG +> +> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) +> ``` + +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | + +```python +https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py +``` + +## Transformer.\_\_init\_\_ {#init tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> trf = nlp.add_pipe("transformer") +> +> # Construction via add_pipe with custom config +> config = { +> "model": { +> "@architectures": "spacy-transformers.TransformerModel.v1", +> "name": "bert-base-uncased", +> "tokenizer_config": {"use_fast": True} +> } +> } +> trf = nlp.add_pipe("transformer", config=config) +> +> # Construction from class +> from spacy_transformers import Transformer +> trf = Transformer(nlp.vocab, model) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Type | Description | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | + +## Transformer.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/transformer#call) and [`pipe`](/api/transformer#pipe) delegate +to the [`predict`](/api/transformer#predict) and +[`set_annotations`](/api/transformer#set_annotations) methods. 
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> trf = nlp.add_pipe("transformer")
+> # This usually happens under the hood
+> processed = trf(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | ------------------------ |
+| `doc` | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document. |
+
+## Transformer.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
+[`pipe`](/api/transformer#pipe) delegate to the
+[`predict`](/api/transformer#predict) and
+[`set_annotations`](/api/transformer#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> for doc in trf.pipe(docs, batch_size=50):
+>     pass
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | ----------------------------------------------------- |
+| `stream` | `Iterable[Doc]` | A stream of documents. |
+| _keyword-only_ | | |
+| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
+| **YIELDS** | `Doc` | The processed documents in order. |
+
+## Transformer.begin_training {#begin_training tag="method"}
+
+Initialize the pipe for training, using data examples if available. Returns an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> optimizer = trf.begin_training(pipeline=nlp.pipeline)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
+| _keyword-only_ | | |
+| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Transformer.predict {#predict tag="method"}
+
+Apply the pipeline's model to a batch of docs, without modifying them.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> scores = trf.predict([doc1, doc2])
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------- | ----------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to predict. |
+| **RETURNS** | - | The model's prediction for each document. |
+
+## Transformer.set_annotations {#set_annotations tag="method"}
+
+Modify a batch of documents, using pre-computed scores.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> scores = trf.predict(docs)
+> trf.set_annotations(docs, scores)
+> ```
+
+| Name | Type | Description |
+| -------- | --------------- | ----------------------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to modify. |
+| `scores` | - | The scores to set, produced by `Transformer.predict`. |
+
+## Transformer.update {#update tag="method"}
+
+Learn from a batch of documents and gold-standard information, updating the
+pipe's model. Delegates to [`predict`](/api/transformer#predict).
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> optimizer = nlp.begin_training()
+> losses = trf.update(examples, sgd=optimizer)
+> ```
+
+| Name | Type | Description |
+| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+
+## Transformer.create_optimizer {#create_optimizer tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> optimizer = trf.create_optimizer()
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------------------------------------------- | -------------- |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Transformer.use_params {#use_params tag="method, contextmanager"}
+
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> with trf.use_params(optimizer.averages):
+>     trf.to_disk("/best_model")
+> ```
+
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
+
+## Transformer.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf.to_disk("/path/to/transformer")
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+
+## Transformer.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf.from_disk("/path/to/transformer")
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | --------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Transformer` | The modified `Transformer` object. |
+
+## Transformer.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf_bytes = trf.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Type | Description |
+| -------------- | --------------- | -------------------------------------------------------------------------- |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | bytes | The serialized form of the `Transformer` object. |
+
+## Transformer.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf_bytes = trf.to_bytes()
+> trf = nlp.add_pipe("transformer")
+> trf.from_bytes(trf_bytes)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | -------------------------------------------------------------------------- |
+| `bytes_data` | bytes | The data to load from. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Transformer` | The `Transformer` object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = trf.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | --------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
+
+## TransformerData {#transformerdata tag="dataclass"}
+
+Transformer tokens and outputs for one `Doc` object.
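+
+For illustration, assuming `nlp` is a pipeline that contains a `transformer`
+component, the per-document data can be accessed at runtime via the
+[`Doc._.trf_data`](#custom-attributes) extension attribute, e.g. to reuse the
+transformer's output tensors:
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> trf_data = doc._.trf_data
+> # Use the final output tensor as contextual token representations
+> tokvecs = trf_data.tensors[-1]
+> ```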
+ +| Name | Type | Description | +| --------- | -------------------------------------------------- | ----------------------------------------- | +| `tokens` | `Dict` | | +| `tensors` | `List[FloatsXd]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `width` | int | | + +### TransformerData.empty {#transformerdata-emoty tag="classmethod"} + + + +| Name | Type | Description | +| ----------- | ----------------- | -------------- | +| **RETURNS** | `TransformerData` | | + +## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} + + + +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | + +### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} + + + +| Name | Type | Description | +| ----------- | ---------------------- | -------------- | +| `arrays` | `List[List[Floats3d]]` | | +| **RETURNS** | `FullTransformerBatch` | | + +### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} + +Split a `TransformerData` object that represents a batch into a list with one +`TransformerData` per `Doc`. + +| Name | Type | Description | +| ----------- | ----------------------- | -------------- | +| **RETURNS** | `List[TransformerData]` | | + +## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} + + + +Span getters are functions that take a batch of [`Doc`](/api/doc) objects and +return a lists of [`Span`](/api/span) objects for each doc, to be processed by +the transformer. The returned spans can overlap. Span getters can be referenced +in the config's `[components.transformer.model.get_spans]` block to customize +the sequences processed by the transformer. You can also register custom span +getters using the `@registry.span_getters` decorator. + +> #### Example +> +> ```python +> @registry.span_getters("sent_spans.v1") +> def configure_get_sent_spans() -> Callable: +> def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]: +> return [list(doc.sents) for doc in docs] +> +> return get_sent_spans +> ``` + +| Name | Type | Description | +| ----------- | ------------------ | ---------------------------------------- | +| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | +| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. | + +The following built-in functions are available: + +| Name | Description | +| ------------------ | ------------------------------------------------------------------ | +| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | +| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. 
| +| `strided_spans.v1` | | + +## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} + +Annotation setters are functions that that take a batch of `Doc` objects and a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set +additional annotations on the `Doc`, e.g. to set custom or built-in attributes. +You can register custom annotation setters using the +`@registry.annotation_setters` decorator. + +> #### Example +> +> ```python +> @registry.annotation_setters("spacy-transformer.null_annotation_setter.v1") +> def configure_null_annotation_setter() -> Callable: +> def setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None: +> pass +> +> return setter +> ``` + +| Name | Type | Description | +| ---------- | ---------------------- | ------------------------------------ | +| `docs` | `List[Doc]` | A batch of `Doc` objects. | +| `trf_data` | `FullTransformerBatch` | The transformers data for the batch. | + +The following built-in functions are available: + +| Name | Description | +| --------------------------------------------- | ------------------------------------- | +| `spacy-transformer.null_annotation_setter.v1` | Don't set any additional annotations. | + +## Custom attributes {#custom-attributes} + +The component sets the following +[custom extension attributes](/usage/processing-pipeline#custom-components-attributes): + +| Name | Type | Description | +| -------------- | ----------------------------------------------------- | ---------------------------------------------------- | +| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index a0f7ef88b..bfb49e9a2 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -37,7 +37,6 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. | | `name` | str | A name to identify the vectors table. | -| **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index af9feb82c..d5c9b0ff0 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -31,7 +31,6 @@ Create the vocabulary. | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` 2.2 | str | A name to identify the vectors table. | -| **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} @@ -231,10 +230,11 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -247,11 +247,12 @@ Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} @@ -263,10 +264,11 @@ Serialize the current state to a binary string. > vocab_bytes = nlp.vocab.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Vocab` object. | ## Vocab.from_bytes {#from_bytes tag="method"} @@ -281,11 +283,12 @@ Load state from a binary string. > vocab.from_bytes(vocab_bytes) > ``` -| Name | Type | Description | -| ------------ | ------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The `Vocab` object. 
| ## Attributes {#attributes} diff --git a/website/docs/images/pipeline_transformer.svg b/website/docs/images/pipeline_transformer.svg new file mode 100644 index 000000000..cfbf470cc --- /dev/null +++ b/website/docs/images/pipeline_transformer.svg @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 08e8e964f..56ade692a 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -220,15 +220,19 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | -| --------------- | ------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | -| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | -| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | -| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. | -| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| String name | Component | Description | +| --------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | +| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 948a13086..12785b6de 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -243,7 +243,14 @@ compound = 1.001 ### Using transformer models like BERT {#transformers} - +spaCy v3.0 lets you use almost any statistical model to power your pipeline. You +can use models implemented in a variety of frameworks. 
A transformer model is +just a statistical model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: it just has to provide a few functions that +do the required plumbing. It also provides a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. @@ -253,6 +260,10 @@ visualize your model. +For more details on how to integrate transformer models into your training +config and customize the implementations, see the usage guide on +[training transformers](/usage/transformers#training). + ### Pretraining with spaCy {#pretraining} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index c54165e72..bab1b82d3 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -1,6 +1,282 @@ --- title: Transformers teaser: Using transformer models like BERT in spaCy +menu: + - ['Installation', 'install'] + - ['Runtime Usage', 'runtime'] + - ['Training Usage', 'training'] +next: /usage/training --- -TODO: ... +## Installation {#install hidden="true"} + +spaCy v3.0 lets you use almost **any statistical model** to power your pipeline. +You can use models implemented in a variety of +[frameworks](https://thinc.ai/docs/usage-frameworks), including TensorFlow, +PyTorch and MXNet. To keep things sane, spaCy expects models from these +frameworks to be wrapped with a common interface, using our machine learning +library [Thinc](https://thinc.ai). A transformer model is just a statistical +model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: it just has to provide a few functions that +do the required plumbing. It also provides a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. + +To use transformers with spaCy, you need the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +installed. It takes care of all the setup behind the scenes, and makes sure the +transformer pipeline component is available to spaCy. + +```bash +$ pip install spacy-transformers +``` + + + +## Runtime usage {#runtime} + +Transformer models can be used as **drop-in replacements** for other types of +neural networks, so your spaCy pipeline can include them in a way that's +completely invisible to the user. Users will download, load and use the model in +the standard way, like any other spaCy pipeline. Instead of using the +transformers as subnetworks directly, you can also use them via the +[`Transformer`](/api/transformer) pipeline component. + +![The processing pipeline with the transformer component](../images/pipeline_transformer.svg) + +The `Transformer` component sets the +[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, +which lets you access the transformers outputs at runtime. + +```bash +$ python -m spacy download en_core_trf_lg +``` + +```python +### Example +import spacy + +nlp = spacy.load("en_core_trf_lg") +for doc in nlp.pipe(["some text", "some other text"]): + tokvecs = doc._.trf_data.tensors[-1] +``` + +You can also customize how the [`Transformer`](/api/transformer) component sets +annotations onto the [`Doc`](/api/doc), by customizing the `annotation_setter`. 
+This callback is called with the batch of [`Doc`](/api/doc) objects and a
+[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) containing the
+raw input and output data for the whole batch, so you can set whatever
+annotations you need. For instance, the setter below (a minimal sketch) writes
+each `Doc`'s share of the transformer data to a custom extension attribute:
+
+```python
+import spacy
+from spacy.tokens import Doc
+
+def custom_annotation_setter(docs, trf_data):
+    # Split the batch output by Doc and store it on a custom attribute.
+    doc_data = list(trf_data.doc_data)
+    for doc, data in zip(docs, doc_data):
+        doc._.custom_attr = data
+
+Doc.set_extension("custom_attr", default=None)
+nlp = spacy.load("en_core_trf_lg")
+nlp.get_pipe("transformer").annotation_setter = custom_annotation_setter
+doc = nlp("This is a text")
+print(doc._.custom_attr)  # the transformer data for this Doc
+```
+
+## Training usage {#training}
+
+The recommended workflow for training is to use spaCy's
+[config system](/usage/training#config), usually via the
+[`spacy train`](/api/cli#train) command. The training config defines all
+component settings and hyperparameters in one place and lets you describe a tree
+of objects by referring to creation functions, including functions you register
+yourself.
+
+The easiest way to get started is to clone a transformers-based project
+template. Swap in your data, edit the settings and hyperparameters and train,
+evaluate, package and visualize your model.
+
+The `[components]` section in the [`config.cfg`](#TODO:) describes the pipeline
+components and the settings used to construct them, including their model
+implementation. Here's a config snippet for the
+[`Transformer`](/api/transformer) component, along with matching Python code. In
+this case, the `[components.transformer]` block describes the `transformer`
+component:
+
+> #### Python equivalent
+>
+> ```python
+> from spacy_transformers import Transformer, TransformerModel
+> from spacy_transformers.annotation_setters import null_annotation_setter
+> from spacy_transformers.span_getters import get_doc_spans
+>
+> trf = Transformer(
+>     nlp.vocab,
+>     TransformerModel(
+>         "bert-base-cased",
+>         get_spans=get_doc_spans,
+>         tokenizer_config={"use_fast": True},
+>     ),
+>     annotation_setter=null_annotation_setter,
+>     max_batch_items=4096,
+> )
+> ```
+
+```ini
+### config.cfg (excerpt)
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v1"
+name = "bert-base-cased"
+tokenizer_config = {"use_fast": true}
+
+[components.transformer.model.get_spans]
+@span_getters = "doc_spans.v1"
+
+[components.transformer.annotation_setter]
+@annotation_setters = "spacy-transformers.null_annotation_setter.v1"
+```
+
+The `[components.transformer.model]` block describes the `model` argument passed
+to the transformer component: a Thinc [`Model`](https://thinc.ai/docs/api-model)
+object. Here, it references the function
+[spacy-transformers.TransformerModel.v1](/api/architectures#TransformerModel)
+registered in the [`architectures` registry](/api/top-level#registry). If a key
+in a block starts with `@`, it's **resolved to a function** and all other
+settings are passed to the function as arguments – in this case, `name`,
+`tokenizer_config` and `get_spans`.
+
+`get_spans` is a function that takes a batch of `Doc` objects and returns lists
+of potentially overlapping `Span` objects to be processed by the transformer.
+Several [built-in functions](/api/transformer#span-getters) are available – for
+example, to process the whole document or individual sentences. When the config
+is resolved, the function is created and passed into the model as an argument.
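To make the expected signature concrete, a span getter is just a callable that maps a list of `Doc` objects to one list of `Span` objects per `Doc`. The snippet below is a minimal sketch of that contract (an illustration only, not the library's built-in `doc_spans.v1` implementation), with a made-up function name:

```python
from typing import List
from spacy.tokens import Doc, Span

def get_whole_doc_spans(docs: List[Doc]) -> List[List[Span]]:
    # One list of spans per Doc; here a single span covering the full text.
    return [[doc[:]] for doc in docs]
```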
+ + + +Remember that the `config.cfg` used for training should contain **no missing +values** and requires all settings to be defined. You don't want any hidden +defaults creeping in and changing your results! spaCy will tell you if settings +are missing, and you can run [`spacy debug config`](/api/cli#debug-config) with +`--auto-fill` to automatically fill in all defaults. + + + + + +### Customizing the settings {#training-custom-settings} + +To change any of the settings, you can edit the `config.cfg` and re-run the +training. To change any of the functions, like the span getter, you can replace +the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to +process sentences. You can also register your own functions using the +`span_getters` registry: + +> #### config.cfg +> +> ```ini +> [components.transformer.model.get_spans] +> @span_getters = "custom_sent_spans" +> ``` + +```python +### code.py +import spacy_transformers + +@spacy_transformers.registry.span_getters("custom_sent_spans") +def configure_custom_sent_spans(): + # TODO: write custom example + def get_sent_spans(docs): + return [list(doc.sents) for doc in docs] + + return get_sent_spans +``` + +To resolve the config during training, spaCy needs to know about your custom +function. You can make it available via the `--code` argument that can point to +a Python file. For more details on training with custom code, see the +[training documentation](/usage/training#custom-code). + +```bash +$ python -m spacy train ./train.spacy ./dev.spacy ./config.cfg --code ./code.py +``` + +### Customizing the model implementations {#training-custom-model} + +The [`Transformer`](/api/transformer) component expects a Thinc +[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` +argument. You're not limited to the implementation provided by +`spacy-transformers` – the only requirement is that your registered function +must return an object of type `Model[List[Doc], FullTransformerBatch]`: that is, +a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the +transformer data. + +> #### Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like +> `Model[List[Doc], List[Floats2d]]`. This so-called generic type describes the +> layer and its input and output type – in this case, it takes a list of `Doc` +> objects as the input and list of 2-dimensional arrays of floats as the output. +> You can read more about defining Thinc +> models [here](https://thinc.ai/docs/usage-models). Also see the +> [type checking](https://thinc.ai/docs/usage-type-checking) for how to enable +> linting in your editor to see live feedback if your inputs and outputs don't +> match. + +The same idea applies to task models that power the **downstream components**. +Most of spaCy's built-in model creation functions support a `tok2vec` argument, +which should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This +is where we'll plug in our transformer model, using the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily +delegates to the `Transformer` pipeline component. 
+
+```ini
+### config.cfg (excerpt) {highlight="12"}
+[components.ner]
+factory = "ner"
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.Tok2VecListener.v1"
+grad_factor = 1.0
+
+[components.ner.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+```
+
+The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
+[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument
+`pooling`, which needs to be of type `Model[Ragged, Floats2d]`. This layer
+determines how the vector for each spaCy token will be computed from the zero or
+more source rows the token is aligned against. Here we use the
+[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
+averages the wordpiece rows. We could instead use `reduce_last`,
+[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
+function you write yourself.
+
+You can have multiple components all listening to the same transformer model,
+and all passing gradients back to it. By default, all of the gradients will be
+**equally weighted**. You can control this with the `grad_factor` setting, which
+lets you reweight the gradients from the different listeners. For instance,
+setting `grad_factor = 0` would disable gradients from one of the listeners,
+while `grad_factor = 2.0` would multiply them by 2. This is similar to having a
+custom learning rate for each component. Instead of a constant, you can also
+provide a schedule, allowing you to freeze the shared parameters at the start of
+training.
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 24803e953..1f13b6328 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -31,18 +31,35 @@ raise errors. Many of them were also mostly internals. If you've been working
with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
on them.
-| Removed | Replacement |
-| --- | --- |
-| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
-| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
-| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
-| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
-| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
-| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
-| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer), |
+| Removed | Replacement |
+| --- | --- |
+| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) |
+| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) |
+| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) |
+| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
+| keyword arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
+| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
+| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) |

## Migrating from v2.x {#migrating}

+### Downloading and loading models {#migrating-downloading-models}
+
+Model symlinks and shortcuts like `en` are now officially deprecated. There are
+[many different models](/models) with different capabilities and not just one
+"English model". In order to download and load a model, you should always use
+its full name – for instance, `en_core_web_sm`.
+
+```diff
+- python -m spacy download en
++ python -m spacy download en_core_web_sm
+```
+
+```diff
+- nlp = spacy.load("en")
++ nlp = spacy.load("en_core_web_sm")
+```
+
### Custom pipeline components and factories {#migrating-pipeline-components}

Custom pipeline components now have to be registered explicitly using the
@@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model.
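For context on the registration mechanism mentioned above: in v3, components are registered with the `@Language.component` (or `@Language.factory`) decorator and then added to the pipeline by their string name. A minimal sketch, using a hypothetical component name:

```python
import spacy
from spacy.language import Language

@Language.component("my_custom_component")
def my_custom_component(doc):
    # Modify the Doc in place and return it, as in v2.
    return doc

nlp = spacy.blank("en")
# Components are now added by their registered string name.
nlp.add_pipe("my_custom_component", last=True)
```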
+#### Training via the Python API {#migrating-training-python}
+
+
+
#### Packaging models {#migrating-training-packaging}

The [`spacy package`](/api/cli#package) command now automatically builds the
diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md
index 7725068ec..823b30c20 100644
--- a/website/docs/usage/vectors-embeddings.md
+++ b/website/docs/usage/vectors-embeddings.md
@@ -1,63 +1,96 @@
---
-title: Word Vectors and Embeddings
+title: Vectors and Embeddings
menu:
+  - ["What's a Word Vector?", 'whats-a-vector']
  - ['Word Vectors', 'vectors']
  - ['Other Embeddings', 'embeddings']
+next: /usage/transformers
---

-
+An old idea in linguistics is that you can "know a word by the company it
+keeps": that is, word meanings can be understood relationally, based on their
+patterns of usage. This idea inspired a branch of NLP research known as
+"distributional semantics" that has aimed to compute databases of lexical
+knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec)
+family of algorithms is a key milestone in this line of research. For
+simplicity, we will refer to a distributional word representation as a "word
+vector", and algorithms that compute word vectors (such as
+[GloVe](https://nlp.stanford.edu/projects/glove/),
+[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms".

-## Word vectors and similarity
+Word vector tables are included in some of the spaCy [model packages](/models)
+we distribute, and you can easily create your own model packages with word
+vectors you train or download yourself. In some cases you can also add word
+vectors to an existing pipeline, although each pipeline can only have a single
+word vectors table, and a model package that already has word vectors is
+unlikely to work correctly if you replace the vectors with new ones.

-> #### Training word vectors
->
-> Dense, real valued vectors representing distributional similarity information
-> are now a cornerstone of practical NLP. The most common way to train these
-> vectors is the [Word2vec](https://en.wikipedia.org/wiki/Word2vec) family of
-> algorithms. If you need to train a word2vec model, we recommend the
-> implementation in the Python library
-> [Gensim](https://radimrehurek.com/gensim/).
+## What's a word vector? {#whats-a-vector}

-import Vectors101 from 'usage/101/\_vectors-similarity.md'
+For spaCy's purposes, a "word vector" is a 1-dimensional slice from a
+2-dimensional **vectors table**, with a deterministic mapping from word types to
+rows in the table.

-
+```python
+from typing import Dict
+from thinc.types import Floats1d, Floats2d
+
+def what_is_a_word_vector(
+    word_id: int,
+    key2row: Dict[int, int],
+    vectors_table: Floats2d,
+    *,
+    default_row: int = 0
+) -> Floats1d:
+    return vectors_table[key2row.get(word_id, default_row)]
+```

-### Customizing word vectors {#custom}
+Word2vec algorithms try to produce vectors tables that let you estimate useful
+relationships between words using simple linear algebra operations. For
+instance, you can often find close synonyms of a word by finding the vectors
+closest to it by cosine distance, and then finding the words that are mapped to
+those neighboring vectors. Word vectors can also be useful as features in
+statistical models.

-Word vectors let you import knowledge from raw text into your model. The
-knowledge is represented as a table of numbers, with one row per term in your
-vocabulary.
If two terms are used in similar contexts, the algorithm that learns -the vectors should assign them **rows that are quite similar**, while words that -are used in different contexts will have quite different values. This lets you -use the row-values assigned to the words as a kind of dictionary, to tell you -some things about what the words in your text mean. +### Word vectors vs. contextual language models {#vectors-vs-language-models} -Word vectors are particularly useful for terms which **aren't well represented -in your labelled training data**. For instance, if you're doing named entity -recognition, there will always be lots of names that you don't have examples of. -For instance, imagine your training data happens to contain some examples of the -term "Microsoft", but it doesn't contain any examples of the term "Symantec". In -your raw text sample, there are plenty of examples of both terms, and they're -used in similar contexts. The word vectors make that fact available to the -entity recognition model. It still won't see examples of "Symantec" labelled as -a company. However, it'll see that "Symantec" has a word vector that usually -corresponds to company terms, so it can **make the inference**. +The key difference between word vectors and contextual language models such as +ElMo, BERT and GPT-2 is that word vectors model **lexical types**, rather than +_tokens_. If you have a list of terms with no context around them, a model like +BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough +approximation of what the text is about. -In order to make best use of the word vectors, you want the word vectors table -to cover a **very large vocabulary**. However, most words are rare, so most of -the rows in a large word vectors table will be accessed very rarely, or never at -all. You can usually cover more than **95% of the tokens** in your corpus with -just **a few thousand rows** in the vector table. However, it's those **5% of -rare terms** where the word vectors are **most useful**. The problem is that -increasing the size of the vector table produces rapidly diminishing returns in -coverage over these rare terms. +Word vectors are also very computationally efficient, as they map a word to a +vector with a single indexing operation. Word vectors are therefore useful as a +way to **improve the accuracy** of neural network models, especially models that +are small or have received little or no pretraining. In spaCy, word vector +tables are only used as **static features**. spaCy does not backpropagate +gradients to the pretrained word vectors table. The static vectors table is +usually used in combination with a smaller table of learned task-specific +embeddings. -### Converting word vectors for use in spaCy {#converting new="2.0.10"} +## Using word vectors directly {#vectors} + +spaCy stores word vector information in the +[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole +vectors table from most spaCy objects. You can also access the vector for a +[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or +[`Lexeme`](/api/lexeme) instance via the `vector` attribute. 
If your `Doc` or +`Span` has multiple tokens, the average of the word vectors will be returned, +excluding any "out of vocabulary" entries that have no vector available. If none +of the words have a vector, a zeroed vector will be returned. + +The `vector` attribute is a **read-only** numpy or cupy array (depending on +whether you've configured spaCy to use GPU memory), with dtype `float32`. The +array is read-only so that spaCy can avoid unnecessary copy operations where +possible. You can modify the vectors via the `Vocab` or `Vectors` table. + +### Converting word vectors for use in spaCy Custom word vectors can be trained using a number of open-source libraries, such as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), or Tomas Mikolov's original -[word2vec implementation](https://code.google.com/archive/p/word2vec/). Most +[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most word vector libraries output an easy-to-read text-based format, where each line consists of the word followed by its vector. For everyday use, we want to convert the vectors model into a binary format that loads faster and takes up @@ -137,11 +170,10 @@ the two words. In the example above, the vector for "Shore" was removed and remapped to the vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to -the vector of "leaving", which is identical. - -If you're using the [`init-model`](/api/cli#init-model) command, you can set the -`--prune-vectors` option to easily reduce the size of the vectors as you add -them to a spaCy model: +the vector of "leaving", which is identical. If you're using the +[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors` +option to easily reduce the size of the vectors as you add them to a spaCy +model: ```bash $ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 @@ -151,20 +183,7 @@ This will create a spaCy model with vectors for the first 10,000 words in the vectors model. All other words in the vectors model are mapped to the closest vector among those retained. -### Adding vectors {#custom-vectors-add new="2"} - -spaCy's new [`Vectors`](/api/vectors) class greatly improves the way word -vectors are stored, accessed and used. The data is stored in two structures: - -- An array, which can be either on CPU or [GPU](#gpu). -- A dictionary mapping string-hashes to rows in the table. - -Keep in mind that the `Vectors` class itself has no -[`StringStore`](/api/stringstore), so you have to store the hash-to-string -mapping separately. If you need to manage the strings, you should use the -`Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`. To add -vectors to the vocabulary, you can use the -[`Vocab.set_vector`](/api/vocab#set_vector) method. 
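Before adding or pruning vectors, it can help to see the runtime access patterns described earlier in this section as code. A minimal sketch, assuming a vectors-enabled package such as `en_core_web_md` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("The quick brown fox jumps")

print(doc.vector.shape)         # average of the token vectors in the Doc
print(doc[3].vector[:5])        # first few values of the vector for "fox"
print(doc[3].has_vector)        # whether "fox" has a row in the vectors table
print(nlp.vocab.vectors.shape)  # (number of rows, vector width)
```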
+### Adding vectors {#adding-vectors} ```python ### Adding vectors @@ -194,40 +213,12 @@ For more details on **adding hooks** and **overwriting** the built-in `Doc`, + ## Other embeddings {#embeddings} - - - + diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 96e1ea8d6..0795eecc9 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -81,6 +81,7 @@ "items": [ { "text": "Tokenizer", "url": "/api/tokenizer" }, { "text": "Tok2Vec", "url": "/api/tok2vec" }, + { "text": "Transformer", "url": "/api/transformer" }, { "text": "Lemmatizer", "url": "/api/lemmatizer" }, { "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "Tagger", "url": "/api/tagger" }, diff --git a/website/src/components/link.js b/website/src/components/link.js index a2ab46476..de4edba27 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -33,11 +33,12 @@ const Link = ({ const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest) const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest) const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest) - const sourceWithText = (isSource || isApi) && isString(children) + const withIcon = isApi || isArch || isSource + const sourceWithText = withIcon && isString(children) const linkClassNames = classNames(classes.root, className, { [classes.hidden]: hidden, - [classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText, - [classes.withIcon]: isApi || isSource || isArch, + [classes.nowrap]: (withIcon && !sourceWithText) || isArch, + [classes.withIcon]: withIcon, }) const Wrapper = ws ? Whitespace : Fragment const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null diff --git a/website/src/components/util.js b/website/src/components/util.js index 1935a8085..844f2c133 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text' * @returns {string} - URL to the file on GitHub. */ export function github(filepath, branch = 'master') { + if (filepath && filepath.startsWith('github.com')) return `https://${filepath}` const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : '' return `https://github.com/${repo}${path}` } diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 237567eb8..b2e72752a 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -36,13 +36,18 @@ const DATA = [ ], }, { - id: 'data', - title: 'Additional data', + id: 'addition', + title: 'Additions', multiple: true, options: [ + { + id: 'transformers', + title: 'Transformers', + help: 'Use transformers like BERT to train your spaCy models', + }, { id: 'lookups', - title: 'Lemmatization', + title: 'Lemmatizer data', help: 'Install additional lookup tables and rules for lemmatization', }, ], @@ -86,13 +91,22 @@ const QuickstartInstall = ({ id, title }) => ( set PYTHONPATH=C:\path\to\spaCy pip install -r requirements.txt - + + pip install -U spacy-lookups-transformers + + + pip install -U spacy-transformers + + + conda install -c conda-forge spacy-transformers + + pip install -U spacy-lookups-data - + pip install -U spacy-lookups-data - + conda install -c conda-forge spacy-lookups-data python setup.py build_ext --inplace