From e5d9eaf79c5c935b4553c5ede31de383571fe0dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 Jul 2020 23:12:42 +0200 Subject: [PATCH 01/55] Tidy up docstrings and arguments --- spacy/language.py | 105 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 82 insertions(+), 23 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 79fceec95..a75295ca5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -36,6 +36,7 @@ from . import util from . import about +# TODO: integrate pipeline analyis ENABLE_PIPELINE_ANALYSIS = False # This is the base config will all settings (training etc.) DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg" @@ -43,6 +44,10 @@ DEFAULT_CONFIG = Config().from_disk(DEFAULT_CONFIG_PATH) class BaseDefaults: + """Language data defaults, available via Language.Defaults. Can be + overwritten by language subclasses by defining their own subclasses of + Language.Defaults. + """ config: Config = Config() tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES @@ -58,6 +63,9 @@ class BaseDefaults: @registry.tokenizers("spacy.Tokenizer.v1") def create_tokenizer() -> Callable[["Language"], Tokenizer]: + """Registered function to create a tokenizer. Returns a factory that takes + the nlp object and returns a Tokenizer instance using the language detaults. + """ def tokenizer_factory(nlp: "Language") -> Tokenizer: prefixes = nlp.Defaults.prefixes suffixes = nlp.Defaults.suffixes @@ -80,6 +88,11 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: @registry.lemmatizers("spacy.Lemmatizer.v1") def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]: + """Registered function to create a lemmatizer. Returns a factory that takes + the nlp object and returns a Lemmatizer instance with data loaded in from + spacy-lookups-data, if the package is installed. + """ + # TODO: Will be replaced when the lemmatizer becomes a pipeline component tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] def lemmatizer_factory(nlp: "Language") -> "Lemmatizer": @@ -116,7 +129,7 @@ class Language: create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, create_lemmatizer: Optional[Callable[["Language"], Callable]] = None, **kwargs, - ): + ) -> None: """Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created. @@ -134,7 +147,8 @@ class Language: returns a tokenizer. create_lemmatizer (Callable): Function that takes the nlp object and returns a lemmatizer. - RETURNS (Language): The newly constructed object. + + DOCS: https://spacy.io/api/language#init """ # We're only calling this to import all factories provided via entry # points. The factory decorator applied to these functions takes care @@ -189,6 +203,13 @@ class Language: @property def meta(self) -> Dict[str, Any]: + """Custom meta data of the language class. If a model is loaded, this + includes details from the model's meta.json. + + RETURNS (Dict[str, Any]): The meta. + + DOCS: https://spacy.io/api/language#meta + """ spacy_version = util.get_model_version_range(about.__version__) if self.vocab.lang: self._meta.setdefault("lang", self.vocab.lang) @@ -221,6 +242,13 @@ class Language: @property def config(self) -> Config: + """Trainable config for the current language instance. Includes the + current pipeline components, as well as default training config. + + RETURNS (thinc.api.Config): The config. 
+ + DOCS: https://spacy.io/api/language#config + """ self._config.setdefault("nlp", {}) self._config.setdefault("training", {}) self._config["nlp"]["lang"] = self.lang @@ -382,6 +410,8 @@ class Language: select the best model. Weights should sum to 1.0 per component and will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. + + DOCS: https://spacy.io/api/language#factory """ if not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="factory")) @@ -460,6 +490,8 @@ class Language: select the best model. Weights should sum to 1.0 per component and will be combined and normalized for the whole pipeline. func (Optional[Callable]): Factory function if not used as a decorator. + + DOCS: https://spacy.io/api/language#component """ if name is not None and not isinstance(name, str): raise ValueError(Errors.E963.format(decorator="component")) @@ -504,6 +536,7 @@ class Language: self, factory_name: str, name: Optional[str] = None, + *, config: Optional[Dict[str, Any]] = SimpleFrozenDict(), overrides: Optional[Dict[str, Any]] = SimpleFrozenDict(), validate: bool = True, @@ -521,6 +554,8 @@ class Language: validate (bool): Whether to validate the component config against the arguments and types expected by the factory. RETURNS (Callable[[Doc], Doc]): The pipeline component. + + DOCS: https://spacy.io/api/language#create_pipe """ name = name if name is not None else factory_name if not isinstance(config, dict): @@ -692,6 +727,7 @@ class Language: self, name: str, factory_name: str, + *, config: Dict[str, Any] = SimpleFrozenDict(), validate: bool = True, ) -> None: @@ -761,6 +797,7 @@ class Language: def __call__( self, text: str, + *, disable: Iterable[str] = tuple(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Doc: @@ -770,8 +807,8 @@ class Language: text (str): The text to be processed. disable (list): Names of the pipeline components to disable. - component_cfg (dict): An optional dictionary with extra keyword arguments - for specific components. + component_cfg (Dict[str, dict]): An optional dictionary with extra + keyword arguments for specific components. RETURNS (Doc): A container for accessing the annotations. DOCS: https://spacy.io/api/language#call @@ -811,6 +848,7 @@ class Language: def select_pipes( self, + *, disable: Optional[Union[str, Iterable[str]]] = None, enable: Optional[Union[str, Iterable[str]]] = None, ) -> "DisabledPipes": @@ -853,7 +891,7 @@ class Language: def update( self, examples: Iterable[Example], - dummy: Optional[Any] = None, + _: Optional[Any] = None, *, drop: float = 0.0, sgd: Optional[Optimizer] = None, @@ -863,7 +901,7 @@ class Language: """Update the models in the pipeline. examples (Iterable[Example]): A batch of examples - dummy: Should not be set - serves to catch backwards-incompatible scripts. + _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. 
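        A minimal usage sketch for `update` (illustrative only; assumes an existing
        `nlp` object and a batch of `Example` objects called `examples`):

            >>> losses = {}
            >>> optimizer = nlp.begin_training()
            >>> nlp.update(examples, sgd=optimizer, losses=losses)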
@@ -873,7 +911,7 @@ class Language: DOCS: https://spacy.io/api/language#update """ - if dummy is not None: + if _ is not None: raise ValueError(Errors.E989) if losses is None: losses = {} @@ -890,12 +928,10 @@ class Language: raise TypeError( Errors.E978.format(name="language", method="update", types=wrong_types) ) - if sgd is None: if self._optimizer is None: self._optimizer = create_default_optimizer() sgd = self._optimizer - if component_cfg is None: component_cfg = {} for i, (name, proc) in enumerate(self.pipeline): @@ -915,6 +951,7 @@ class Language: def rehearse( self, examples: Iterable[Example], + *, sgd: Optional[Optimizer] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, @@ -937,8 +974,9 @@ class Language: >>> nlp.update(labelled_batch) >>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)] >>> nlp.rehearse(raw_batch) + + DOCS: https://spacy.io/api/language#rehearse """ - # TODO: document if len(examples) == 0: return if not isinstance(examples, IterableInstance): @@ -983,17 +1021,18 @@ class Language: def begin_training( self, - get_examples: Optional[Callable] = None, + get_examples: Optional[Callable[[], Iterable[Example]]] = None, + *, sgd: Optional[Optimizer] = None, device: int = -1, ) -> Optimizer: - """Allocate models, pre-process training data and acquire a trainer and - optimizer. Used as a contextmanager. + """Initialize the pipe for training, using data examples if available. - get_examples (function): Function returning example training data. - TODO: document format change since 3.0. - sgd (Optional[Optimizer]): An optimizer. - RETURNS: An optimizer. + get_examples (Callable[[], Iterable[Example]]): Optional function that + returns gold-standard Example objects. + sgd (thinc.api.Optimizer): Optional optimizer. Will be created with + create_optimizer if it doesn't exist. + RETURNS (thinc.api.Optimizer): The optimizer. DOCS: https://spacy.io/api/language#begin_training """ @@ -1022,18 +1061,20 @@ class Language: return self._optimizer def resume_training( - self, sgd: Optional[Optimizer] = None, device: int = -1 + self, *, sgd: Optional[Optimizer] = None, device: int = -1 ) -> Optimizer: """Continue training a pretrained model. Create and return an optimizer, and initialize "rehearsal" for any pipeline component that has a .rehearse() method. Rehearsal is used to prevent - models from "forgetting" their initialised "knowledge". To perform + models from "forgetting" their initialized "knowledge". To perform rehearsal, collect samples of text you want the models to retain performance on, and call nlp.rehearse() with a batch of Example objects. sgd (Optional[Optimizer]): An optimizer. RETURNS (Optimizer): The optimizer. + + DOCS: https://spacy.io/api/language#resume_training """ if device >= 0: # TODO: do we need this here? require_gpu(device) @@ -1052,11 +1093,12 @@ class Language: def evaluate( self, examples: Iterable[Example], + *, verbose: bool = False, batch_size: int = 256, scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, - ) -> Scorer: + ) -> Dict[str, Union[float, dict]]: """Evaluate a model's pipeline components. examples (Iterable[Example]): `Example` objects. 
@@ -1112,7 +1154,9 @@ class Language: EXAMPLE: >>> with nlp.use_params(optimizer.averages): - >>> nlp.to_disk('/tmp/checkpoint') + >>> nlp.to_disk("/tmp/checkpoint") + + DOCS: https://spacy.io/api/language#use_params """ contexts = [ pipe.use_params(params) @@ -1136,6 +1180,7 @@ class Language: def pipe( self, texts: Iterable[str], + *, as_tuples: bool = False, batch_size: int = 1000, disable: Iterable[str] = tuple(), @@ -1305,6 +1350,16 @@ class Language: """Create the nlp object from a loaded config. Will set up the tokenizer and language data, add pipeline components etc. If no config is provided, the default config of the given language is used. + + config (Dict[str, Any] / Config): The loaded config. + disable (Iterable[str]): List of pipeline component names to disable. + auto_fill (bool): Automatically fill in missing values in config based + on defaults and function argument annotations. + validate (bool): Validate the component config and arguments against + the types expected by the factory. + RETURNS (Language): The initialized Language class. + + DOCS: https://spacy.io/api/language#from_config """ if auto_fill: config = util.deep_merge_configs(config, cls.default_config) @@ -1418,7 +1473,6 @@ class Language: _fix_pretrained_vectors_name(self) path = util.ensure_path(path) - deserializers = {} if Path(path / "config.cfg").exists(): deserializers["config.cfg"] = lambda p: self.config.from_disk(p) @@ -1509,6 +1563,11 @@ class Language: @dataclass class FactoryMeta: + """Dataclass containing information about a component and its defaults + provided by the @Language.component or @Language.factory decorator. It's + created whenever a component is defined and stored on the Language class for + each component instance and factory instance. + """ factory: str default_config: Optional[Dict[str, Any]] = None # noqa: E704 assigns: Iterable[str] = tuple() @@ -1551,7 +1610,7 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None: class DisabledPipes(list): """Manager for temporary pipeline disabling.""" - def __init__(self, nlp: Language, names: List[str]): + def __init__(self, nlp: Language, names: List[str]) -> None: self.nlp = nlp self.names = names # Important! Not deep copy -- we just want the container (but we also From 7adffc5361b921e36a8c60a8072c9126650311c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 Jul 2020 23:12:47 +0200 Subject: [PATCH 02/55] Remove unused schema --- spacy/schemas.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/spacy/schemas.py b/spacy/schemas.py index 3f3c01f22..971d283e2 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -171,17 +171,6 @@ class ModelMetaSchema(BaseModel): # fmt: on -# JSON training format - - -class TrainingSchema(BaseModel): - # TODO: write - - class Config: - title = "Schema for training data in spaCy's JSON format" - extra = "forbid" - - # Config schema # We're not setting any defaults here (which is too messy) and are making all # fields required, so we can raise validation errors for missing values. To From 0cddb0dbe9b9af85bb017393ec017a16f2acdaec Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 29 Jul 2020 11:02:31 +0200 Subject: [PATCH 03/55] Move timing into Language.evaluate (#5836) Move timing into `Language.evaluate` so that only the processing is timing, not processing + scoring. `Language.evaluate` returns `scores["speed"]` as words per second, which should be identical to how the speed was added to the scores previously. Also add the speed to the evaluate CLI output. 
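To make the new behaviour concrete, a rough usage sketch (illustrative only: the
pipeline name and texts are placeholders, the reference annotations are left empty,
and `Example.from_dict` is used as shown elsewhere in this series):

```python
import spacy
from spacy.gold import Example

nlp = spacy.load("en_core_web_sm")  # placeholder pipeline name
texts = ["Apple is looking at buying a U.K. startup.", "The fog rolled in early."]
# Empty reference annotations; real evaluation data would carry gold labels.
examples = [Example.from_dict(nlp.make_doc(t), {}) for t in texts]
scores = nlp.evaluate(examples)
print(scores["speed"])  # words per second, timed on processing only (not scoring)
```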
--- spacy/cli/evaluate.py | 17 +++++++++-------- spacy/cli/train.py | 5 ----- spacy/language.py | 19 +++++++++++++++++-- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 83281543a..ee1be57a3 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -67,10 +67,7 @@ def evaluate( corpus = Corpus(data_path, data_path) nlp = util.load_model(model) dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc)) - begin = timer() scores = nlp.evaluate(dev_dataset, verbose=False) - end = timer() - nwords = sum(len(ex.predicted) for ex in dev_dataset) metrics = { "TOK": "token_acc", "TAG": "tag_acc", @@ -82,17 +79,21 @@ def evaluate( "NER P": "ents_p", "NER R": "ents_r", "NER F": "ents_f", - "Textcat": "cats_score", - "Sent P": "sents_p", - "Sent R": "sents_r", - "Sent F": "sents_f", + "TEXTCAT": "cats_score", + "SENT P": "sents_p", + "SENT R": "sents_r", + "SENT F": "sents_f", + "SPEED": "speed", } results = {} for metric, key in metrics.items(): if key in scores: if key == "cats_score": metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" - results[metric] = f"{scores[key]*100:.2f}" + if key == "speed": + results[metric] = f"{scores[key]:.0f}" + else: + results[metric] = f"{scores[key]*100:.2f}" data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} msg.table(results, title="Results") diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 44597c73e..25eb4a3c0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,5 +1,4 @@ from typing import Optional, Dict, Any, Tuple, Union, Callable, List -from timeit import default_timer as timer import srsly import tqdm from pathlib import Path @@ -248,14 +247,11 @@ def create_evaluation_callback( dev_examples = list(dev_examples) n_words = sum(len(ex.predicted) for ex in dev_examples) batch_size = cfg["eval_batch_size"] - start_time = timer() if optimizer.averages: with nlp.use_params(optimizer.averages): scores = nlp.evaluate(dev_examples, batch_size=batch_size) else: scores = nlp.evaluate(dev_examples, batch_size=batch_size) - end_time = timer() - wps = n_words / (end_time - start_time) # Calculate a weighted sum based on score_weights for the main score weights = cfg["score_weights"] try: @@ -264,7 +260,6 @@ def create_evaluation_callback( keys = list(scores.keys()) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) raise KeyError(err) - scores["speed"] = wps return weighted_score, scores return evaluate diff --git a/spacy/language.py b/spacy/language.py index 79fceec95..fe0a86ed1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -14,6 +14,7 @@ from thinc.api import get_current_ops, Config, require_gpu, Optimizer import srsly import multiprocessing as mp from itertools import chain, cycle +from timeit import default_timer as timer from .tokens.underscore import Underscore from .vocab import Vocab, create_vocab @@ -1088,7 +1089,14 @@ class Language: kwargs.setdefault("verbose", verbose) kwargs.setdefault("nlp", self) scorer = Scorer(**kwargs) - docs = list(eg.predicted for eg in examples) + texts = [eg.reference.text for eg in examples] + docs = [eg.predicted for eg in examples] + start_time = timer() + # tokenize the texts only for timing purposes + if not hasattr(self.tokenizer, "pipe"): + _ = [self.tokenizer(text) for text in texts] + else: + _ = list(self.tokenizer.pipe(texts)) for name, pipe in self.pipeline: kwargs = component_cfg.get(name, {}) kwargs.setdefault("batch_size", batch_size) @@ 
-1096,11 +1104,18 @@ class Language: docs = _pipe(docs, pipe, kwargs) else: docs = pipe.pipe(docs, **kwargs) + # iterate over the final generator + if len(self.pipeline): + docs = list(docs) + end_time = timer() for i, (doc, eg) in enumerate(zip(docs, examples)): if verbose: print(doc) eg.predicted = doc - return scorer.score(examples) + results = scorer.score(examples) + n_words = sum(len(eg.predicted) for eg in examples) + results["speed"] = n_words / (end_time - start_time) + return results @contextmanager def use_params(self, params: dict): From 191a12d75fb44b5d5a606dbaf6e009c961ff5528 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 29 Jul 2020 11:04:12 +0200 Subject: [PATCH 04/55] Fix score_weights typo in train CLI (#5835) --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 25eb4a3c0..fbe3a5013 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -441,7 +441,7 @@ def update_meta( training: Union[Dict[str, Any], Config], nlp: Language, info: Dict[str, Any] ) -> None: nlp.meta["performance"] = {} - for metric in training["scores_weights"]: + for metric in training["score_weights"]: nlp.meta["performance"][metric] = info["other_scores"][metric] for pipe_name in nlp.pipe_names: nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name] From 40c995b1be3450978d3cad8c2fcf939cbb843067 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 29 Jul 2020 11:04:43 +0200 Subject: [PATCH 05/55] Option for returning only greedy matches (#5771) * add "greedy" option for match pattern * distinction between greedy FIRST or LONGEST * check for proper values, throw custom warning otherwise * unxfail one more test * add comment in docstring * add test that LONGEST also prefers first match if equal length * use c arrays for more efficient processing * rename 'greediness' to 'greedy' --- spacy/errors.py | 8 ++- spacy/matcher/matcher.pxd | 1 + spacy/matcher/matcher.pyx | 69 ++++++++++++++----- spacy/pipeline/functions.py | 2 +- spacy/tests/matcher/test_matcher_api.py | 11 +-- spacy/tests/matcher/test_matcher_logic.py | 82 ++++++++++++++++------- 6 files changed, 121 insertions(+), 52 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index a10e5d9bd..3fe53d6db 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -432,12 +432,12 @@ class Errors: "Current DocBin: {current}\nOther DocBin: {other}") E169 = ("Can't find module: {module}") E170 = ("Cannot apply transition {name}: invalid for the current state.") - E171 = ("Matcher.add received invalid on_match callback argument: expected " + E171 = ("Matcher.add received invalid 'on_match' callback argument: expected " "callable or None, but got: {arg_type}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") E177 = ("Ill-formed IOB input detected: {tag}") - E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you " + E178 = ("Each pattern should be a list of dicts, but got: {pat}. Maybe you " "accidentally passed a single pattern to Matcher.add instead of a " "list of patterns? If you only want to add one pattern, make sure " "to wrap it in a list. 
For example: matcher.add('{key}', [pattern])") @@ -483,6 +483,10 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E947 = ("Matcher.add received invalid 'greedy' argument: expected " + "a string value from {expected} but got: '{arg}'") + E948 = ("Matcher.add received invalid 'patterns' argument: expected " + "a List, but got: {arg_type}") E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd index 689734079..e1f6bc773 100644 --- a/spacy/matcher/matcher.pxd +++ b/spacy/matcher/matcher.pxd @@ -66,6 +66,7 @@ cdef class Matcher: cdef public object validate cdef public object _patterns cdef public object _callbacks + cdef public object _filter cdef public object _extensions cdef public object _extra_predicates cdef public object _seen_attrs diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6c8ee4204..64be4f51b 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,6 +1,9 @@ # cython: infer_types=True, cython: profile=True +from typing import List + from libcpp.vector cimport vector from libc.stdint cimport int32_t +from libc.string cimport memset, memcmp from cymem.cymem cimport Pool from murmurhash.mrmr cimport hash64 @@ -42,6 +45,7 @@ cdef class Matcher: self._extra_predicates = [] self._patterns = {} self._callbacks = {} + self._filter = {} self._extensions = {} self._seen_attrs = set() self.vocab = vocab @@ -69,7 +73,7 @@ cdef class Matcher: """ return self._normalize_key(key) in self._patterns - def add(self, key, patterns, *_patterns, on_match=None): + def add(self, key, patterns, *, on_match=None, greedy: str=None): """Add a match-rule to the matcher. A match-rule consists of: an ID key, an on_match callback, and one or more patterns. @@ -87,11 +91,10 @@ cdef class Matcher: '+': Require the pattern to match 1 or more times. '*': Allow the pattern to zero or more times. - The + and * operators are usually interpretted "greedily", i.e. longer - matches are returned where possible. However, if you specify two '+' - and '*' patterns in a row and their matches overlap, the first - operator will behave non-greedily. This quirk in the semantics makes - the matcher more efficient, by avoiding the need for back-tracking. + The + and * operators return all possible matches (not just the greedy + ones). However, the "greedy" argument can filter the final matches + by returning a non-overlapping set per key, either taking preference to + the first greedy match ("FIRST"), or the longest ("LONGEST"). As of spaCy v2.2.2, Matcher.add supports the future API, which makes the patterns the second argument and a list (instead of a variable @@ -101,16 +104,15 @@ cdef class Matcher: key (str): The match ID. patterns (list): The patterns to add for the given key. on_match (callable): Optional callback executed on match. - *_patterns (list): For backwards compatibility: list of patterns to add - as variable arguments. Will be ignored if a list of patterns is - provided as the second argument. + greedy (str): Optional filter: "FIRST" or "LONGEST". 
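        EXAMPLE (an illustrative sketch; `nlp` and `doc` are assumed to exist and
        the key and pattern are placeholders):
            >>> matcher = Matcher(nlp.vocab)
            >>> pattern = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}]
            >>> matcher.add("AS", [pattern], greedy="LONGEST")
            >>> matches = matcher(doc)  # non-overlapping (key, start, end) tuples per key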
""" errors = {} if on_match is not None and not hasattr(on_match, "__call__"): raise ValueError(Errors.E171.format(arg_type=type(on_match))) - if patterns is None or hasattr(patterns, "__call__"): # old API - on_match = patterns - patterns = _patterns + if patterns is None or not isinstance(patterns, List): # old API + raise ValueError(Errors.E948.format(arg_type=type(patterns))) + if greedy is not None and greedy not in ["FIRST", "LONGEST"]: + raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=greedy)) for i, pattern in enumerate(patterns): if len(pattern) == 0: raise ValueError(Errors.E012.format(key=key)) @@ -133,6 +135,7 @@ cdef class Matcher: raise ValueError(Errors.E154.format()) self._patterns.setdefault(key, []) self._callbacks[key] = on_match + self._filter[key] = greedy self._patterns[key].extend(patterns) def remove(self, key): @@ -218,6 +221,7 @@ cdef class Matcher: length = doclike.end - doclike.start else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) + cdef Pool tmp_pool = Pool() if len(set([LEMMA, POS, TAG]) & self._seen_attrs) > 0 \ and not doc.is_tagged: raise ValueError(Errors.E155.format()) @@ -225,11 +229,42 @@ cdef class Matcher: raise ValueError(Errors.E156.format()) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) - for i, (key, start, end) in enumerate(matches): + final_matches = [] + pairs_by_id = {} + # For each key, either add all matches, or only the filtered, non-overlapping ones + for (key, start, end) in matches: + span_filter = self._filter.get(key) + if span_filter is not None: + pairs = pairs_by_id.get(key, []) + pairs.append((start,end)) + pairs_by_id[key] = pairs + else: + final_matches.append((key, start, end)) + matched = tmp_pool.alloc(length, sizeof(char)) + empty = tmp_pool.alloc(length, sizeof(char)) + for key, pairs in pairs_by_id.items(): + memset(matched, 0, length * sizeof(matched[0])) + span_filter = self._filter.get(key) + if span_filter == "FIRST": + sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False) # sort by start + elif span_filter == "LONGEST": + sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length + else: + raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter)) + for (start, end) in sorted_pairs: + assert 0 <= start < end # Defend against segfaults + span_len = end-start + # If no tokens in the span have matched + if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0: + final_matches.append((key, start, end)) + # Mark tokens that have matched + memset(&matched[start], 1, span_len * sizeof(matched[0])) + # perform the callbacks on the filtered set of results + for i, (key, start, end) in enumerate(final_matches): on_match = self._callbacks.get(key, None) if on_match is not None: - on_match(self, doc, i, matches) - return matches + on_match(self, doc, i, final_matches) + return final_matches def _normalize_key(self, key): if isinstance(key, basestring): @@ -240,9 +275,9 @@ cdef class Matcher: def unpickle_matcher(vocab, patterns, callbacks): matcher = Matcher(vocab) - for key, specs in patterns.items(): + for key, pattern in patterns.items(): callback = callbacks.get(key, None) - matcher.add(key, callback, *specs) + matcher.add(key, pattern, on_match=callback) return matcher diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 
8a6a5188f..501884873 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -58,7 +58,7 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc: """ # TODO: make stateful component with "label" config merger = Matcher(doc.vocab) - merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) + merger.add("SUBTOK", [[{"DEP": label, "op": "+"}]]) matches = merger(doc) spans = filter_spans([doc[start : end + 1] for _, start, end in matches]) with doc.retokenize() as retokenizer: diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 98542e80f..bcb224bd3 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -63,18 +63,11 @@ def test_matcher_len_contains(matcher): assert "TEST2" not in matcher -def test_matcher_add_new_old_api(en_vocab): +def test_matcher_add_new_api(en_vocab): doc = Doc(en_vocab, words=["a", "b"]) patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]] matcher = Matcher(en_vocab) - matcher.add("OLD_API", None, *patterns) - assert len(matcher(doc)) == 2 - matcher = Matcher(en_vocab) on_match = Mock() - matcher.add("OLD_API_CALLBACK", on_match, *patterns) - assert len(matcher(doc)) == 2 - assert on_match.call_count == 2 - # New API: add(key: str, patterns: List[List[dict]], on_match: Callable) matcher = Matcher(en_vocab) matcher.add("NEW_API", patterns) assert len(matcher(doc)) == 2 @@ -176,7 +169,7 @@ def test_matcher_match_zero_plus(matcher): def test_matcher_match_one_plus(matcher): control = Matcher(matcher.vocab) - control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}]) + control.add("BasicPhilippe", [[{"ORTH": "Philippe"}]]) doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index a2b2cd83f..8f4c13471 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -7,18 +7,10 @@ from spacy.tokens import Doc, Span pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] -pattern2 = [{"ORTH": "A"}, {"ORTH": "A"}] +pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] pattern3 = [{"ORTH": "A"}, {"ORTH": "A"}] -pattern4 = [ - {"ORTH": "B"}, - {"ORTH": "A", "OP": "*"}, - {"ORTH": "B"}, -] -pattern5 = [ - {"ORTH": "B", "OP": "*"}, - {"ORTH": "A", "OP": "*"}, - {"ORTH": "B"}, -] +pattern4 = [{"ORTH": "B"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] +pattern5 = [{"ORTH": "B", "OP": "*"}, {"ORTH": "A", "OP": "*"}, {"ORTH": "B"}] re_pattern1 = "AA*" re_pattern2 = "A*A" @@ -26,10 +18,16 @@ re_pattern3 = "AA" re_pattern4 = "BA*B" re_pattern5 = "B*A*B" +longest1 = "A A A A A" +longest2 = "A A A A A" +longest3 = "A A" +longest4 = "B A A A A A B" # "FIRST" would be "B B" +longest5 = "B B A A A A A B" + @pytest.fixture def text(): - return "(ABBAAAAAB)." + return "(BBAAAAAB)." 
@pytest.fixture @@ -41,25 +39,63 @@ def doc(en_tokenizer, text): @pytest.mark.parametrize( "pattern,re_pattern", [ - pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()), - pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()), - pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()), + (pattern1, re_pattern1), + (pattern2, re_pattern2), + (pattern3, re_pattern3), (pattern4, re_pattern4), - pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()), + (pattern5, re_pattern5), ], ) -def test_greedy_matching(doc, text, pattern, re_pattern): - """Test that the greedy matching behavior of the * op is consistant with +def test_greedy_matching_first(doc, text, pattern, re_pattern): + """Test that the greedy matching behavior "FIRST" is consistent with other re implementations.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, [pattern]) + matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] - for match, re_match in zip(matches, re_matches): - assert match[1:] == re_match + for (key, m_s, m_e), (re_s, re_e) in zip(matches, re_matches): + # matching the string, not the exact position + assert doc[m_s:m_e].text == doc[re_s:re_e].text + + +@pytest.mark.parametrize( + "pattern,longest", + [ + (pattern1, longest1), + (pattern2, longest2), + (pattern3, longest3), + (pattern4, longest4), + (pattern5, longest5), + ], +) +def test_greedy_matching_longest(doc, text, pattern, longest): + """Test the "LONGEST" greedy matching behavior""" + matcher = Matcher(doc.vocab) + matcher.add("RULE", [pattern], greedy="LONGEST") + matches = matcher(doc) + for (key, s, e) in matches: + assert doc[s:e].text == longest + + +def test_greedy_matching_longest_first(en_tokenizer): + """Test that "LONGEST" matching prefers the first of two equally long matches""" + doc = en_tokenizer(" ".join("CCC")) + matcher = Matcher(doc.vocab) + pattern = [{"ORTH": "C"}, {"ORTH": "C"}] + matcher.add("RULE", [pattern], greedy="LONGEST") + matches = matcher(doc) + # out of 0-2 and 1-3, the first should be picked + assert len(matches) == 1 + assert matches[0][1] == 0 + assert matches[0][2] == 2 + + +def test_invalid_greediness(doc, text): + matcher = Matcher(doc.vocab) + with pytest.raises(ValueError): + matcher.add("RULE", [pattern1], greedy="GREEDY") -@pytest.mark.xfail @pytest.mark.parametrize( "pattern,re_pattern", [ @@ -74,7 +110,7 @@ def test_match_consuming(doc, text, pattern, re_pattern): """Test that matcher.__call__ consumes tokens on a match similar to re.findall.""" matcher = Matcher(doc.vocab) - matcher.add(re_pattern, [pattern]) + matcher.add(re_pattern, [pattern], greedy="FIRST") matches = matcher(doc) re_matches = [m.span() for m in re.finditer(re_pattern, text)] assert len(matches) == len(re_matches) From e0ffe36e79fc26384900f11dd331460f765d586c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 11:36:42 +0200 Subject: [PATCH 06/55] Update docstrings, docs and types --- spacy/gold/corpus.py | 78 +++++-- spacy/lemmatizer.py | 1 - spacy/lookups.py | 4 - spacy/matcher/dependencymatcher.pyx | 1 - spacy/matcher/matcher.pyx | 1 - spacy/matcher/phrasematcher.pyx | 1 - spacy/pipeline/entityruler.py | 1 - spacy/scorer.py | 1 - spacy/strings.pyx | 1 - spacy/tokenizer.pyx | 1 - spacy/tokens/_retokenize.pyx | 1 + spacy/tokens/_serialize.py | 30 +-- spacy/tokens/doc.pyx | 1 - spacy/tokens/span.pyx | 1 - spacy/vectors.pyx | 1 - spacy/vocab.pyx | 1 - website/docs/api/architectures.md | 9 +- 
website/docs/api/corpus.md | 93 +++++++-- website/docs/api/cython-classes.md | 13 +- website/docs/api/dependencyparser.md | 2 +- website/docs/api/doc.md | 11 +- website/docs/api/docbin.md | 10 +- website/docs/api/entitylinker.md | 2 +- website/docs/api/entityrecognizer.md | 2 +- website/docs/api/example.md | 1 - website/docs/api/kb.md | 10 +- website/docs/api/language.md | 278 +++++++++++++++---------- website/docs/api/lemmatizer.md | 1 - website/docs/api/lexeme.md | 9 +- website/docs/api/lookups.md | 7 +- website/docs/api/matcher.md | 9 +- website/docs/api/morphanalysis.md | 33 +-- website/docs/api/morphologizer.md | 2 +- website/docs/api/morphology.md | 87 ++++---- website/docs/api/phrasematcher.md | 11 +- website/docs/api/pipe.md | 4 +- website/docs/api/scorer.md | 7 +- website/docs/api/sentencerecognizer.md | 4 +- website/docs/api/span.md | 17 +- website/docs/api/stringstore.md | 7 +- website/docs/api/tagger.md | 4 +- website/docs/api/textcategorizer.md | 4 +- website/docs/api/tok2vec.md | 2 +- website/docs/api/token.md | 151 +++++++------- website/docs/api/tokenizer.md | 19 +- website/docs/api/transformer.md | 107 ++++++++++ website/docs/api/vectors.md | 1 - website/docs/api/vocab.md | 1 - website/docs/usage/transformers.md | 152 +++++++++++++- website/docs/usage/v3.md | 39 +++- website/meta/sidebars.json | 1 + website/src/components/link.js | 7 +- website/src/components/util.js | 1 + 53 files changed, 821 insertions(+), 422 deletions(-) create mode 100644 website/docs/api/transformer.md diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 427c00caa..d23f70bee 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -1,7 +1,15 @@ +from typing import Union, List, Iterable, Iterator, TYPE_CHECKING +from pathlib import Path import random + from .. import util from .example import Example from ..tokens import DocBin, Doc +from ..vocab import Vocab + +if TYPE_CHECKING: + # This lets us add type hints for mypy etc. without causing circular imports + from ..language import Language # noqa: F401 class Corpus: @@ -11,20 +19,23 @@ class Corpus: DOCS: https://spacy.io/api/corpus """ - def __init__(self, train_loc, dev_loc, limit=0): + def __init__( + self, train_loc: Union[str, Path], dev_loc: Union[str, Path], limit: int = 0 + ) -> None: """Create a Corpus. train (str / Path): File or directory of training data. dev (str / Path): File or directory of development data. - limit (int): Max. number of examples returned - RETURNS (Corpus): The newly created object. + limit (int): Max. number of examples returned. 
+ + DOCS: https://spacy.io/api/corpus#init """ self.train_loc = train_loc self.dev_loc = dev_loc self.limit = limit @staticmethod - def walk_corpus(path): + def walk_corpus(path: Union[str, Path]) -> List[Path]: path = util.ensure_path(path) if not path.is_dir(): return [path] @@ -43,7 +54,9 @@ class Corpus: locs.append(path) return locs - def _make_example(self, nlp, reference, gold_preproc): + def _make_example( + self, nlp: "Language", reference: Doc, gold_preproc: bool + ) -> Example: if gold_preproc or reference.has_unknown_spaces: return Example( Doc( @@ -56,7 +69,9 @@ class Corpus: else: return Example(nlp.make_doc(reference.text), reference) - def make_examples(self, nlp, reference_docs, max_length=0): + def make_examples( + self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0 + ) -> Iterator[Example]: for reference in reference_docs: if len(reference) == 0: continue @@ -69,7 +84,9 @@ class Corpus: elif max_length == 0 or len(ref_sent) < max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) - def make_examples_gold_preproc(self, nlp, reference_docs): + def make_examples_gold_preproc( + self, nlp: "Language", reference_docs: Iterable[Doc] + ) -> Iterator[Example]: for reference in reference_docs: if reference.is_sentenced: ref_sents = [sent.as_doc() for sent in reference.sents] @@ -80,7 +97,9 @@ class Corpus: if len(eg.x): yield eg - def read_docbin(self, vocab, locs): + def read_docbin( + self, vocab: Vocab, locs: Iterable[Union[str, Path]] + ) -> Iterator[Doc]: """ Yield training examples as example dicts """ i = 0 for loc in locs: @@ -96,8 +115,14 @@ class Corpus: if self.limit >= 1 and i >= self.limit: break - def count_train(self, nlp): - """Returns count of words in train examples""" + def count_train(self, nlp: "Language") -> int: + """Returns count of words in train examples. + + nlp (Language): The current nlp. object. + RETURNS (int): The word count. + + DOCS: https://spacy.io/api/corpus#count_train + """ n = 0 i = 0 for example in self.train_dataset(nlp): @@ -108,8 +133,25 @@ class Corpus: return n def train_dataset( - self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs - ): + self, + nlp: "Language", + *, + shuffle: bool = True, + gold_preproc: bool = False, + max_length: int = 0 + ) -> Iterator[Example]: + """Yield examples from the training data. + + nlp (Language): The current nlp object. + shuffle (bool): Whether to shuffle the examples. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + max_length (int): Maximum document length. Longer documents will be + split into sentences, if sentence boundaries are available. 0 for + no limit. + YIELDS (Example): The examples. + + DOCS: https://spacy.io/api/corpus#train_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) @@ -120,7 +162,17 @@ class Corpus: random.shuffle(examples) yield from examples - def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs): + def dev_dataset( + self, nlp: "Language", *, gold_preproc: bool = False + ) -> Iterator[Example]: + """Yield examples from the development data. + + nlp (Language): The current nlp object. + gold_preproc (bool): Whether to train on gold-standard sentences and tokens. + YIELDS (Example): The examples. 
+ + DOCS: https://spacy.io/api/corpus#dev_dataset + """ ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1cfb681f4..adba79686 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -21,7 +21,6 @@ class Lemmatizer: lookups (Lookups): The lookups object containing the (optional) tables "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". - RETURNS (Lemmatizer): The newly constructed object. """ self.lookups = lookups if lookups is not None else Lookups() self.is_base_form = is_base_form diff --git a/spacy/lookups.py b/spacy/lookups.py index bf71ba877..7862b9805 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -52,8 +52,6 @@ class Lookups: def __init__(self) -> None: """Initialize the Lookups object. - RETURNS (Lookups): The newly created object. - DOCS: https://spacy.io/api/lookups#init """ self._tables = {} @@ -202,7 +200,6 @@ class Table(OrderedDict): data (dict): The dictionary. name (str): Optional table name for reference. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.from_dict """ @@ -215,7 +212,6 @@ class Table(OrderedDict): name (str): Optional table name for reference. data (dict): Initial data, used to hint Bloom Filter. - RETURNS (Table): The newly created object. DOCS: https://spacy.io/api/lookups#table.init """ diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index ddeeedd06..716af9909 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -36,7 +36,6 @@ cdef class DependencyMatcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (DependencyMatcher): The newly constructed object. """ size = 20 # TODO: make matcher work with validation diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 6c8ee4204..706cfdd68 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -37,7 +37,6 @@ cdef class Matcher: vocab (Vocab): The vocabulary object, which must be shared with the documents the matcher will operate on. - RETURNS (Matcher): The newly constructed object. """ self._extra_predicates = [] self._patterns = {} diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index a2141dc02..060c4d37f 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -32,7 +32,6 @@ cdef class PhraseMatcher: vocab (Vocab): The shared vocabulary. attr (int / str): Token attribute to match on. validate (bool): Perform additional validation when patterns are added. - RETURNS (PhraseMatcher): The newly constructed object. DOCS: https://spacy.io/api/phrasematcher#init """ diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 96a5d3d67..d6ce86e78 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -86,7 +86,6 @@ class EntityRuler: overwrite_ents (bool): If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. ent_id_sep (str): Separator used internally for entity IDs. - RETURNS (EntityRuler): The newly constructed object. 
DOCS: https://spacy.io/api/entityruler#init """ diff --git a/spacy/scorer.py b/spacy/scorer.py index 2bbf453e7..702c74521 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -72,7 +72,6 @@ class Scorer: def __init__(self, nlp=None, **cfg): """Initialize the Scorer. - RETURNS (Scorer): The newly created object. DOCS: https://spacy.io/api/scorer#init """ diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 9e584ce8a..136eda9ff 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -97,7 +97,6 @@ cdef class StringStore: """Create the StringStore. strings (iterable): A sequence of unicode strings to add to the store. - RETURNS (StringStore): The newly constructed object. """ self.mem = Pool() self._map = PreshMap() diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 793bb5a25..858a93ce5 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -50,7 +50,6 @@ cdef class Tokenizer: recognised as tokens. url_match (callable): A boolean function matching strings to be recognised as tokens after considering prefixes and suffixes. - RETURNS (Tokenizer): The newly constructed object. EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 3943767a0..b89ce3bdd 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -312,6 +312,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs): """Retokenize the document, such that the token at `doc[token_index]` is split into tokens with the orth 'orths' token_index(int): token index of the token to split. + orths: IDs of the verbatim text content of the tokens to create **attributes: Attributes to assign to each of the newly created tokens. By default, attributes are inherited from the original token. diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 96245a0e1..0a5fd0c59 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,10 +1,12 @@ +from typing import Iterable, Iterator import numpy import zlib import srsly from thinc.api import NumpyOps +from .doc import Doc +from ..vocab import Vocab from ..compat import copy_reg -from ..tokens import Doc from ..attrs import SPACY, ORTH, intify_attr from ..errors import Errors @@ -44,13 +46,18 @@ class DocBin: document from the DocBin. """ - def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]): + def __init__( + self, + attrs: Iterable[str] = ALL_ATTRS, + store_user_data: bool = False, + docs=Iterable[Doc], + ) -> None: """Create a DocBin object to hold serialized annotations. - attrs (list): List of attributes to serialize. 'orth' and 'spacy' are - always serialized, so they're not required. Defaults to None. + attrs (Iterable[str]): List of attributes to serialize. 'orth' and + 'spacy' are always serialized, so they're not required. store_user_data (bool): Whether to include the `Doc.user_data`. - RETURNS (DocBin): The newly constructed object. + docs (Iterable[Doc]): Docs to add. DOCS: https://spacy.io/api/docbin#init """ @@ -68,11 +75,11 @@ class DocBin: for doc in docs: self.add(doc) - def __len__(self): + def __len__(self) -> int: """RETURNS: The number of Doc objects added to the DocBin.""" return len(self.tokens) - def add(self, doc): + def add(self, doc: Doc) -> None: """Add a Doc's annotations to the DocBin for serialization. doc (Doc): The Doc object to add. 
@@ -100,7 +107,7 @@ class DocBin: if self.store_user_data: self.user_data.append(srsly.msgpack_dumps(doc.user_data)) - def get_docs(self, vocab): + def get_docs(self, vocab: Vocab) -> Iterator[Doc]: """Recover Doc objects from the annotations, using the given vocab. vocab (Vocab): The shared vocab. @@ -125,7 +132,7 @@ class DocBin: doc.user_data.update(user_data) yield doc - def merge(self, other): + def merge(self, other: "DocBin") -> None: """Extend the annotations of this DocBin with the annotations from another. Will raise an error if the pre-defined attrs of the two DocBins don't match. @@ -144,7 +151,7 @@ class DocBin: if self.store_user_data: self.user_data.extend(other.user_data) - def to_bytes(self): + def to_bytes(self) -> bytes: """Serialize the DocBin's annotations to a bytestring. RETURNS (bytes): The serialized DocBin. @@ -156,7 +163,6 @@ class DocBin: lengths = [len(tokens) for tokens in self.tokens] tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) - msg = { "version": self.version, "attrs": self.attrs, @@ -171,7 +177,7 @@ class DocBin: msg["user_data"] = self.user_data return zlib.compress(srsly.msgpack_dumps(msg)) - def from_bytes(self, bytes_data): + def from_bytes(self, bytes_data: bytes) -> "DocBin": """Deserialize the DocBin's annotations from a bytestring. bytes_data (bytes): The data to load from. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index adc7059e5..0ba5abb52 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -173,7 +173,6 @@ cdef class Doc: words. True means that the word is followed by a space, False means it is not. If `None`, defaults to `[True]*len(words)` user_data (dict or None): Optional extra data to attach to the Doc. - RETURNS (Doc): The newly constructed object. DOCS: https://spacy.io/api/doc#init """ diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 203308749..5b55d8e88 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -94,7 +94,6 @@ cdef class Span: kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. - RETURNS (Span): The newly constructed object. DOCS: https://spacy.io/api/span#init """ diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 0cc7409a7..bcea87e67 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -58,7 +58,6 @@ cdef class Vectors: data (numpy.ndarray): The vector data. keys (iterable): A sequence of keys, aligned with the data. name (str): A name to identify the vectors table. - RETURNS (Vectors): The newly created object. DOCS: https://spacy.io/api/vectors#init """ diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 2115789e6..f41ad2356 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -74,7 +74,6 @@ cdef class Vocab: lookups (Lookups): Container for large lookup tables and dictionaries. oov_prob (float): Default OOV probability. vectors_name (unicode): Optional name to identify the vectors table. - RETURNS (Vocab): The newly constructed object. 
""" lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index abc2b7bfa..a87c2a1e8 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -4,6 +4,7 @@ teaser: Pre-defined model architectures included with the core library source: spacy/ml/models menu: - ['Tok2Vec', 'tok2vec'] + - ['Transformers', 'transformers'] - ['Parser & NER', 'parser'] - ['Text Classification', 'textcat'] - ['Entity Linking', 'entitylinker'] @@ -13,7 +14,7 @@ TODO: intro and how architectures work, link to [`registry`](/api/top-level#registry), [custom models](/usage/training#custom-models) usage etc. -## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"}} +## Tok2Vec architectures {#tok2vec source="spacy/ml/models/tok2vec.py"} ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} @@ -21,12 +22,14 @@ TODO: intro and how architectures work, link to ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} +## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} + +### spacy-transformers.TransformerModel.v1 {#TransformerModel} + ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser} - - > #### Example Config > > ```ini diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md index 3256849c3..38e19129d 100644 --- a/website/docs/api/corpus.md +++ b/website/docs/api/corpus.md @@ -13,25 +13,84 @@ datasets in the [DocBin](/api/docbin) (`.spacy`) format. Create a `Corpus`. The input data can be a file or a directory of files. -| Name | Type | Description | -| ----------- | ------------ | ---------------------------------------------------------------- | -| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | -| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | -| `limit` | int | Maximum number of examples returned. | -| **RETURNS** | `Corpus` | The newly constructed object. | +> #### Example +> +> ```python +> from spacy.gold import Corpus +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> ``` - - -## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} - -## Corpus.make_examples {#make_examples tag="method"} - -## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} - -## Corpus.read_docbin {#read_docbin tag="method"} - -## Corpus.count_train {#count_train tag="method"} +| Name | Type | Description | +| ------- | ------------ | ---------------------------------------------------------------- | +| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | +| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | +| `limit` | int | Maximum number of examples returned. `0` for no limit (default). | ## Corpus.train_dataset {#train_dataset tag="method"} +Yield examples from the training data. 
+ +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> train_data = corpus.train_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `shuffle` | bool | Whether to shuffle the examples. Defaults to `True`. | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. `0` for no limit (default).  | +| **YIELDS** | `Example` | The examples. | + ## Corpus.dev_dataset {#dev_dataset tag="method"} + +Yield examples from the development data. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> dev_data = corpus.dev_dataset(nlp) +> ``` + +| Name | Type | Description | +| -------------- | ---------- | ---------------------------------------------------------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| _keyword-only_ | | | +| `gold_preproc` | bool | Whether to train on gold-standard sentences and tokens. Defaults to `False`. | +| **YIELDS** | `Example` | The examples. | + +## Corpus.count_train {#count_train tag="method"} + +Get the word count of all training examples. + +> #### Example +> +> ```python +> from spacy.gold import Corpus +> import spacy +> +> corpus = Corpus("./train.spacy", "./dev.spacy") +> nlp = spacy.blank("en") +> word_count = corpus.count_train(nlp) +> ``` + +| Name | Type | Description | +| ----------- | ---------- | ------------------------- | +| `nlp` | `Language` | The current `nlp` object. | +| **RETURNS** | int | The word count. | + + diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md index 9dea04284..6e54fb112 100644 --- a/website/docs/api/cython-classes.md +++ b/website/docs/api/cython-classes.md @@ -87,13 +87,12 @@ Create a `Token` object from a `TokenC*` pointer. > token = Token.cinit(&doc.c[3], doc, 3) > ``` -| Name | Type | Description | -| ----------- | --------- | ------------------------------------------------------------ | -| `vocab` | `Vocab` | A reference to the shared `Vocab`. | -| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | -| `offset` | `int` | The offset of the token within the document. | -| `doc` | `Doc` | The parent document. | -| **RETURNS** | `Token` | The newly constructed object. | +| Name | Type | Description | +| -------- | --------- | ------------------------------------------------------------ | +| `vocab` | `Vocab` | A reference to the shared `Vocab`. | +| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | +| `offset` | `int` | The offset of the token within the document. | +| `doc` | `Doc` | The parent document. | ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"} diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index 425b669ce..f6ed7492d 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/dependencyparser#call) and ## DependencyParser.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index b5871f2ab..69608c958 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,12 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `words` | iterable | A list of strings to add to the container. | -| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | -| **RETURNS** | `Doc` | The newly constructed object. | +| Name | Type | Description | +| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `words` | iterable | A list of strings to add to the container. | +| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | ## Doc.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md index 07f95f91d..65d1153d1 100644 --- a/website/docs/api/docbin.md +++ b/website/docs/api/docbin.md @@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations. > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) > ``` -| Argument | Type | Description | -| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | -| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | -| **RETURNS** | `DocBin` | The newly constructed object. | +| Argument | Type | Description | +| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. 
Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | +| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | +| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. | ## DocBin.\_\len\_\_ {#len tag="method"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index b2b1eec32..c29f0326c 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -125,7 +125,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this method, a knowledge base should have been defined with [`set_kb`](/api/entitylinker#set_kb). diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 63404e087..b1d40a9c3 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and ## EntityRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 0d06c79a1..e6299fc31 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -37,7 +37,6 @@ both documents. | `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | | _keyword-only_ | | | | `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | -| **RETURNS** | `Example` | The newly constructed object. | ## Example.from_dict {#from_dict tag="classmethod"} diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md index f088815fd..7b2c4edf4 100644 --- a/website/docs/api/kb.md +++ b/website/docs/api/kb.md @@ -27,11 +27,10 @@ Create the knowledge base. > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > ``` -| Name | Type | Description | -| ---------------------- | --------------- | ---------------------------------------- | -| `vocab` | `Vocab` | A `Vocab` object. | -| `entity_vector_length` | int | Length of the fixed-size entity vectors. | -| **RETURNS** | `KnowledgeBase` | The newly constructed object. | +| Name | Type | Description | +| ---------------------- | ------- | ---------------------------------------- | +| `vocab` | `Vocab` | A `Vocab` object. | +| `entity_vector_length` | int | Length of the fixed-size entity vectors. | ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} @@ -255,7 +254,6 @@ but instead these objects are returned by the | `entity_freq` | float | The entity frequency as recorded in the KB. | | `alias_hash` | int | The hash of the textual mention or alias. | | `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | -| **RETURNS** | `Candidate` | The newly constructed object. 
|
 
 ## Candidate attributes {#candidate_attributes}
 
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index d685c014b..0f7797d7f 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -15,6 +15,58 @@ the tagger or parser that are called on a document in order. You can also add
 your own processing pipeline components that take a `Doc` object, modify it and
 return it.
 
+## Language.\_\_init\_\_ {#init tag="method"}
+
+Initialize a `Language` object.
+
+> #### Example
+>
+> ```python
+> # Construction from subclass
+> from spacy.lang.en import English
+> nlp = English()
+>
+> # Construction from scratch
+> from spacy.vocab import Vocab
+> from spacy.language import Language
+> nlp = Language(Vocab())
+> ```
+
+| Name               | Type        | Description                                                                                  |
+| ------------------ | ----------- | -------------------------------------------------------------------------------------------- |
+| `vocab`            | `Vocab`     | A `Vocab` object. If `True`, a vocab is created using the default language data settings.    |
+| _keyword-only_     |             |                                                                                              |
+| `max_length`       | int         | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`.                |
+| `meta`             | dict        | Custom meta data for the `Language` class. Is written to by models to add model meta data.   |
+| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer.                    |
+
+## Language.from_config {#from_config tag="classmethod"}
+
+Create a `Language` object from a loaded config. Will set up the tokenizer and
+language data, add pipeline components based on the pipeline and components
+defined in the config and validate the results. If no config is provided, the
+default config of the given language is used. This is also how spaCy loads a
+model under the hood based on its [`config.cfg`](/api/data-formats#config).
+
+> #### Example
+>
+> ```python
+> from thinc.api import Config
+> from spacy.language import Language
+>
+> config = Config().from_disk("./config.cfg")
+> nlp = Language.from_config(config)
+> ```
+
+| Name           | Type                                                                    | Description                                                                                                                               |
+| -------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| `config`       | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config.                                                                                                                        |
+| _keyword-only_ |                                                                         |                                                                                                                                           |
+| `disable`      | `Iterable[str]`                                                         | List of pipeline component names to disable.                                                                                              |
+| `auto_fill`    | bool                                                                    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`.  |
+| `validate`     | bool                                                                    | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`.                    |
+| **RETURNS**    | `Language`                                                              | The initialized object.                                                                                                                   |
+
 ## Language.component {#component tag="classmethod" new="3"}
 
 Register a custom pipeline component under a given name. This allows
@@ -101,57 +153,6 @@ examples, see the
 | `default_score_weights` | `Dict[str, float]`   | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func`                  | `Optional[Callable]` | Optional function if not used as a decorator.                                                                                                                                                                                |
 
-## Language.\_\_init\_\_ {#init tag="method"}
-
-Initialize a `Language` object.
- -> #### Example -> -> ```python -> from spacy.vocab import Vocab -> from spacy.language import Language -> nlp = Language(Vocab()) -> -> from spacy.lang.en import English -> nlp = English() -> ``` - -| Name | Type | Description | -| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | -| _keyword-only_ | | | -| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | -| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | -| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | -| **RETURNS** | `Language` | The newly constructed object. | - -## Language.from_config {#from_config tag="classmethod"} - -Create a `Language` object from a loaded config. Will set up the tokenizer and -language data, add pipeline components based on the pipeline and components -define in the config and validate the results. If no config is provided, the -default config of the given language is used. This is also how spaCy loads a -model under the hood based on its [`config.cfg`](/api/data-formats#config). - -> #### Example -> -> ```python -> from thinc.api import Config -> from spacy.language import Language -> -> config = Config().from_disk("./config.cfg") -> nlp = Language.from_config(config) -> ``` - -| Name | Type | Description | -| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | -| _keyword-only_ | | -| `disable` | `Iterable[str]` | List of pipeline component names to disable. | -| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | -| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | -| **RETURNS** | `Language` | The initialized object. | - ## Language.\_\_call\_\_ {#call tag="method"} Apply the pipeline to some text. The text can span multiple sentences, and can @@ -164,11 +165,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved. > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > ``` -| Name | Type | Description | -| ----------- | ----------- | --------------------------------------------------------------------------------- | -| `text` | str | The text to be processed. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **RETURNS** | `Doc` | A container for accessing the annotations. | +| Name | Type | Description | +| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | +| `text` | str | The text to be processed. | +| _keyword-only_ | | | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. 
| +| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. | ## Language.pipe {#pipe tag="method"} @@ -183,15 +186,57 @@ more efficient than processing texts one-by-one. > assert doc.is_parsed > ``` -| Name | Type | Description | -| -------------------------------------------- | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | `Iterable[str]` | A sequence of strings. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | -| **YIELDS** | `Doc` | Documents in the order of the original text. | +| Name | Type | Description | +| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | `Iterable[str]` | A sequence of strings. | +| _keyword-only_ | | | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| `n_process` 2.2.2 | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | +| **YIELDS** | `Doc` | Documents in the order of the original text. | + +## Language.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. Returns an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. + +> #### Example +> +> ```python +> optimizer = nlp.begin_training(get_examples) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Language.resume_training {#resume_training tag="method,experimental" new="3"} + +Continue training a pretrained model. Create and return an optimizer, and +initialize "rehearsal" for any pipeline component that has a `rehearse` method. 
+Rehearsal is used to prevent models from "forgetting" their initialized +"knowledge". To perform rehearsal, collect samples of text you want the models +to retain performance on, and call [`nlp.rehearse`](/api/language#rehearse) with +a batch of [Example](/api/example) objects. + +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | ## Language.update {#update tag="method"} @@ -206,15 +251,37 @@ Update the models in the pipeline. > nlp.update([example], sgd=optimizer) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ---------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | -| _keyword-only_ | | | -| `drop` | float | The dropout rate. | -| `sgd` | `Optimizer` | An [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | -| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | +| Name | Type | Description | +| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of `Example` objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | + +## Language.rehearse {#rehearse tag="method,experimental"} + +Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the +current model to make predictions similar to an initial model, to try to address +the "catastrophic forgetting" problem. This feature is experimental. + +> #### Example +> +> ```python +> optimizer = nlp.resume_training() +> losses = nlp.rehearse(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. 
| +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | ## Language.evaluate {#evaluate tag="method"} @@ -227,33 +294,15 @@ Evaluate a model's pipeline components. > print(scores) > ``` -| Name | Type | Description | -| -------------------------------------------- | ------------------------------- | ------------------------------------------------------------------------------------- | -| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | -| `verbose` | bool | Print debugging information. | -| `batch_size` | int | The batch size to use. | -| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| **RETURNS** | `Dict[str, Union[float, Dict]]` | A dictionary of evaluation scores. | - -## Language.begin_training {#begin_training tag="method"} - -Allocate models, pre-process training data and acquire an -[`Optimizer`](https://thinc.ai/docs/api-optimizers). - -> #### Example -> -> ```python -> optimizer = nlp.begin_training(get_examples) -> ``` - -| Name | Type | Description | -| -------------------------------------------- | ------------------- | ------------------------------------------------------------------------------------------------------------------ | -| `get_examples` | `Iterable[Example]` | Optional gold-standard annotations in the form of [`Example`](/api/example) objects. | -| `sgd` | `Optimizer` | An optional [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. If not set, a default one will be created. | -| `component_cfg` 2.1 | `Dict[str, Dict]` | Config parameters for specific pipeline components, keyed by component name. | -| `**cfg` | - | Config parameters (sent to all components). | -| **RETURNS** | `Optimizer` | An optimizer. | +| Name | Type | Description | +| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `verbose` | bool | Print debugging information. | +| `batch_size` | int | The batch size to use. | +| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | +| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | +| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. | ## Language.use_params {#use_params tag="contextmanager, method"} @@ -296,6 +345,7 @@ To create a component and add it to the pipeline, you should always use | ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | `factory_name` | str | Name of the registered component factory. | | `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | +| _keyword-only_ | | | | `config` 3 | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. 
| | `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | **RETURNS** | callable | The pipeline component. | @@ -418,10 +468,13 @@ Replace a component in the pipeline. > nlp.replace_pipe("parser", my_custom_parser) > ``` -| Name | Type | Description | -| ----------- | -------- | --------------------------------- | -| `name` | str | Name of the component to replace. | -| `component` | callable | The pipeline component to insert. | +| Name | Type | Description | +| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | Name of the component to replace. | +| `component` | callable | The pipeline component to insert. | +| _keyword-only_ | | | +| `config` 3 | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. | +| `validate` 3 | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | ## Language.rename_pipe {#rename_pipe tag="method" new="2"} @@ -492,11 +545,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------------------ | -| `disable` | str / list | Name(s) of pipeline components to disable. | -| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | -| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------------------ | +| _keyword-only_ | | | +| `disable` | str / list | Name(s) of pipeline components to disable. | +| `enable` | str / list | Names(s) of pipeline components that will not be disabled. | +| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} @@ -767,8 +821,8 @@ serialization by passing in the string names via the `exclude` argument. The `FactoryMeta` contains the information about the component and its default provided by the [`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory) decorator. It's created whenever a -component is added to the pipeline and stored on the `Language` class for each -component instance and factory instance. +component is defined and stored on the `Language` class for each component +instance and factory instance. | Name | Type | Description | | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 237bfa468..73f8aa71f 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -31,7 +31,6 @@ when a `Language` subclass and its `Vocab` is initialized. 
| Name | Type | Description | | -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. | -| **RETURNS** | `Lemmatizer` | The newly created object. | ## Lemmatizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md index b39664a55..625a26412 100644 --- a/website/docs/api/lexeme.md +++ b/website/docs/api/lexeme.md @@ -13,11 +13,10 @@ lemmatization depends on the part-of-speech tag). Create a `Lexeme` object. -| Name | Type | Description | -| ----------- | -------- | ----------------------------- | -| `vocab` | `Vocab` | The parent vocabulary. | -| `orth` | int | The orth id of the lexeme. | -| **RETURNS** | `Lexeme` | The newly constructed object. | +| Name | Type | Description | +| ------- | ------- | -------------------------- | +| `vocab` | `Vocab` | The parent vocabulary. | +| `orth` | int | The orth id of the lexeme. | ## Lexeme.set_flag {#set_flag tag="method"} diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md index b91d92646..099b5306e 100644 --- a/website/docs/api/lookups.md +++ b/website/docs/api/lookups.md @@ -236,10 +236,9 @@ Initialize a new table. > assert table["foo"] == "bar" > ``` -| Name | Type | Description | -| ----------- | ------- | ---------------------------------- | -| `name` | str | Optional table name for reference. | -| **RETURNS** | `Table` | The newly constructed object. | +| Name | Type | Description | +| ------ | ---- | ---------------------------------- | +| `name` | str | Optional table name for reference. | ### Table.from_dict {#table.from_dict tag="classmethod"} diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index c59a58c81..925c9ad2e 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -19,11 +19,10 @@ string where an integer is expected) or unexpected property names. > matcher = Matcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `validate` 2.1 | bool | Validate all patterns added to this matcher. | -| **RETURNS** | `Matcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `validate` 2.1 | bool | Validate all patterns added to this matcher. | ## Matcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/morphanalysis.md b/website/docs/api/morphanalysis.md index 5c2356ad9..4df9a3f7f 100644 --- a/website/docs/api/morphanalysis.md +++ b/website/docs/api/morphanalysis.md @@ -6,7 +6,6 @@ source: spacy/tokens/morphanalysis.pyx Stores a single morphological analysis. - ## MorphAnalysis.\_\_init\_\_ {#init tag="method"} Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of @@ -16,17 +15,15 @@ morphological features. 
> > ```python > from spacy.tokens import MorphAnalysis -> +> > feats = "Feat1=Val1|Feat2=Val2" > m = MorphAnalysis(nlp.vocab, feats) > ``` -| Name | Type | Description | -| ----------- | ------------------ | ----------------------------- | -| `vocab` | `Vocab` | The vocab. | -| `features` | `Union[Dict, str]` | The morphological features. | -| **RETURNS** | `MorphAnalysis` | The newly constructed object. | - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `vocab` | `Vocab` | The vocab. | +| `features` | `Union[Dict, str]` | The morphological features. | ## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"} @@ -44,7 +41,6 @@ Whether a feature/value pair is in the analysis. | ----------- | ----- | ------------------------------------- | | **RETURNS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"} Iterate over the feature/value pairs in the analysis. @@ -61,7 +57,6 @@ Iterate over the feature/value pairs in the analysis. | ---------- | ----- | ------------------------------------- | | **YIELDS** | `str` | A feature/value pair in the analysis. | - ## MorphAnalysis.\_\_len\_\_ {#len tag="method"} Returns the number of features in the analysis. @@ -78,7 +73,6 @@ Returns the number of features in the analysis. | ----------- | ----- | --------------------------------------- | | **RETURNS** | `int` | The number of features in the analysis. | - ## MorphAnalysis.\_\_str\_\_ {#str tag="method"} Returns the morphological analysis in the UD FEATS string format. @@ -92,10 +86,9 @@ Returns the morphological analysis in the UD FEATS string format. > ``` | Name | Type | Description | -| ----------- | ----- | ---------------------------------| +| ----------- | ----- | -------------------------------- | | **RETURNS** | `str` | The analysis in UD FEATS format. | - ## MorphAnalysis.get {#get tag="method"} Retrieve values for a feature by field. @@ -108,11 +101,10 @@ Retrieve values for a feature by field. > assert morph.get("Feat1") == ["Val1", "Val2"] > ``` -| Name | Type | Description | -| ----------- | ------ | ----------------------------------- | -| `field` | `str` | The field to retrieve. | -| **RETURNS** | `list` | A list of the individual features. | - +| Name | Type | Description | +| ----------- | ------ | ---------------------------------- | +| `field` | `str` | The field to retrieve. | +| **RETURNS** | `list` | A list of the individual features. | ## MorphAnalysis.to_dict {#to_dict tag="method"} @@ -128,10 +120,9 @@ map. > ``` | Name | Type | Description | -| ----------- | ------ | -----------------------------------------| +| ----------- | ------ | ---------------------------------------- | | **RETURNS** | `dict` | The dict representation of the analysis. | - ## MorphAnalysis.from_id {#from_id tag="classmethod"} Create a morphological analysis from a given hash ID. @@ -149,5 +140,3 @@ Create a morphological analysis from a given hash ID. | ------- | ------- | -------------------------------- | | `vocab` | `Vocab` | The vocab. | | `key` | `int` | The hash of the features string. | - - diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index 8ac300de3..a153bd51c 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -121,7 +121,7 @@ applied to the `Doc` in order. 
Both [`__call__`](/api/morphologizer#call) and ## Morphologizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md index ad279bff7..8fb89c15f 100644 --- a/website/docs/api/morphology.md +++ b/website/docs/api/morphology.md @@ -4,12 +4,11 @@ tag: class source: spacy/morphology.pyx --- -Store the possible morphological analyses for a language, and index them -by hash. To save space on each token, tokens only know the hash of their +Store the possible morphological analyses for a language, and index them by +hash. To save space on each token, tokens only know the hash of their morphological analysis, so queries of morphological attributes are delegated to this class. - ## Morphology.\_\_init\_\_ {#init tag="method"} Create a Morphology object using the tag map, lemmatizer and exceptions. @@ -22,21 +21,18 @@ Create a Morphology object using the tag map, lemmatizer and exceptions. > morphology = Morphology(strings, tag_map, lemmatizer) > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `strings` | `StringStore` | The string store. | -| `tag_map` | `Dict[str, Dict]` | The tag map. | -| `lemmatizer`| `Lemmatizer` | The lemmatizer. | -| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | -| **RETURNS** | `Morphology` | The newly constructed object. | - +| Name | Type | Description | +| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- | +| `strings` | `StringStore` | The string store. | +| `tag_map` | `Dict[str, Dict]` | The tag map. | +| `lemmatizer` | `Lemmatizer` | The lemmatizer. | +| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` | ## Morphology.add {#add tag="method"} -Insert a morphological analysis in the morphology table, if not already -present. The morphological analysis may be provided in the UD FEATS format as a -string or in the tag map dictionary format. Returns the hash of the new -analysis. +Insert a morphological analysis in the morphology table, if not already present. +The morphological analysis may be provided in the UD FEATS format as a string or +in the tag map dictionary format. Returns the hash of the new analysis. > #### Example > @@ -46,10 +42,9 @@ analysis. > assert hash == nlp.vocab.strings[feats] > ``` -| Name | Type | Description | -| ----------- | ------------------- | --------------------------- | -| `features` | `Union[Dict, str]` | The morphological features. | - +| Name | Type | Description | +| ---------- | ------------------ | --------------------------- | +| `features` | `Union[Dict, str]` | The morphological features. | ## Morphology.get {#get tag="method"} @@ -63,33 +58,30 @@ analysis. Get the FEATS string for the hash of the morphological analysis. -| Name | Type | Description | -| ----------- | ------ | --------------------------------------- | -| `morph` | int | The hash of the morphological analysis. 
| - +| Name | Type | Description | +| ------- | ---- | --------------------------------------- | +| `morph` | int | The hash of the morphological analysis. | ## Morphology.load_tag_map {#load_tag_map tag="method"} Replace the current tag map with the provided tag map. -| Name | Type | Description | -| ----------- | ------------------ | ------------ | -| `tag_map` | `Dict[str, Dict]` | The tag map. | - +| Name | Type | Description | +| --------- | ----------------- | ------------ | +| `tag_map` | `Dict[str, Dict]` | The tag map. | ## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"} Replace the current morphological exceptions with the provided exceptions. -| Name | Type | Description | -| ------------- | ------------------ | ----------------------------- | -| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | - +| Name | Type | Description | +| ------------- | ----------------- | ----------------------------- | +| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions. | ## Morphology.add_special_case {#add_special_case tag="method"} -Add a special-case rule to the morphological analyzer. Tokens whose tag and -orth match the rule will receive the specified properties. +Add a special-case rule to the morphological analyzer. Tokens whose tag and orth +match the rule will receive the specified properties. > #### Example > @@ -98,27 +90,24 @@ orth match the rule will receive the specified properties. > morphology.add_special_case("DT", "the", attrs) > ``` -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------- | -| `tag_str` | str | The fine-grained tag. | -| `orth_str` | str | The token text. | -| `attrs` | dict | The features to assign for this token and tag. | - +| Name | Type | Description | +| ---------- | ---- | ---------------------------------------------- | +| `tag_str` | str | The fine-grained tag. | +| `orth_str` | str | The token text. | +| `attrs` | dict | The features to assign for this token and tag. | ## Morphology.exc {#exc tag="property"} The current morphological exceptions. -| Name | Type | Description | -| ---------- | ----- | --------------------------------------------------- | -| **YIELDS** | dict | The current dictionary of morphological exceptions. | - +| Name | Type | Description | +| ---------- | ---- | --------------------------------------------------- | +| **YIELDS** | dict | The current dictionary of morphological exceptions. | ## Morphology.lemmatize {#lemmatize tag="method"} TODO - ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} Convert a string FEATS representation to a dictionary of features and values in @@ -132,11 +121,10 @@ the same format as the tag map. > assert d == {"Feat1": "Val1", "Feat2": "Val2"} > ``` -| Name | Type | Description | -| ----------- | ---- | ------------------------------------------------------------- | +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------------------------ | | `feats` | str | The morphological features in Universal Dependencies FEATS format. | -| **RETURNS** | dict | The morphological features as a dictionary. | - +| **RETURNS** | dict | The morphological features as a dictionary. | ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} @@ -150,12 +138,11 @@ Convert a dictionary of features and values to a string FEATS representation. 
> assert f == "Feat1=Val1|Feat2=Val2" > ``` -| Name | Type | Description | +| Name | Type | Description | | ------------ | ----------------- | --------------------------------------------------------------------- | | `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | | **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | - ## Attributes {#attributes} | Name | Type | Description | diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md index 991016094..866aca096 100644 --- a/website/docs/api/phrasematcher.md +++ b/website/docs/api/phrasematcher.md @@ -35,12 +35,11 @@ be shown. > matcher = PhraseMatcher(nlp.vocab) > ``` -| Name | Type | Description | -| --------------------------------------- | --------------- | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | -| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | -| `validate` 2.1 | bool | Validate patterns added to the matcher. | -| **RETURNS** | `PhraseMatcher` | The newly constructed object. | +| Name | Type | Description | +| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | +| `attr` 2.1 | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | +| `validate` 2.1 | bool | Validate patterns added to the matcher. | ## PhraseMatcher.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index c03a1b4da..a2d055d88 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -95,7 +95,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and ## Pipe.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -198,7 +198,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > pipe = nlp.add_pipe("your_custom_pipe") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = pipe.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 8daefd241..f50a13099 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -28,10 +28,9 @@ Create a new `Scorer`. > scorer = Scorer(nlp) > ``` -| Name | Type | Description | -| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | -| **RETURNS** | `Scorer` | The newly created object. 
| +| Name | Type | Description | +| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | ## Scorer.score {#score tag="method"} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index 2c0944b1f..f7d2ac00f 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -116,7 +116,7 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the ## SentenceRecognizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -201,7 +201,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > senter = nlp.add_pipe("senter") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = senter.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 668013e76..9237b5538 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,15 +18,14 @@ Create a Span object from the slice `doc[start : end]`. > assert [t.text for t in span] == ["it", "back", "!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | -| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. | +| Name | Type | Description | +| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | +| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. 
| ## Span.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md index c00c59832..b66d755ed 100644 --- a/website/docs/api/stringstore.md +++ b/website/docs/api/stringstore.md @@ -19,10 +19,9 @@ Create the `StringStore`. > stringstore = StringStore(["apple", "orange"]) > ``` -| Name | Type | Description | -| ----------- | ------------- | ------------------------------------------ | -| `strings` | iterable | A sequence of strings to add to the store. | -| **RETURNS** | `StringStore` | The newly constructed object. | +| Name | Type | Description | +| --------- | -------- | ------------------------------------------ | +| `strings` | iterable | A sequence of strings to add to the store. | ## StringStore.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 351492aa9..cc7401016 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -114,7 +114,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and ## Tagger.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -199,7 +199,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > tagger = nlp.add_pipe("tagger") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = tagger.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index c4327dca7..c0dd07c1e 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -133,7 +133,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and ## TextCategorizer.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example @@ -218,7 +218,7 @@ the "catastrophic forgetting" problem. This feature is experimental. > > ```python > textcat = nlp.add_pipe("textcat") -> optimizer = nlp.begin_training() +> optimizer = nlp.resume_training() > losses = textcat.rehearse(examples, sgd=optimizer) > ``` diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 29f91afe6..11167c428 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -110,7 +110,7 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods. ## Tok2Vec.begin_training {#begin_training tag="method"} -Initialize the pipe for training, using data examples if available. Return an +Initialize the pipe for training, using data examples if available. Returns an [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 1cb833089..ca6b57a5b 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -17,12 +17,11 @@ Construct a `Token` object. > assert token.text == "Give" > ``` -| Name | Type | Description | -| ----------- | ------- | ------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `doc` | `Doc` | The parent document. 
| -| `offset` | int | The index of the token within the document. | -| **RETURNS** | `Token` | The newly constructed object. | +| Name | Type | Description | +| -------- | ------- | ------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `doc` | `Doc` | The parent document. | +| `offset` | int | The index of the token within the document. | ## Token.\_\_len\_\_ {#len tag="method"} @@ -393,73 +392,73 @@ The L2 norm of the token's vector representation. ## Attributes {#attributes} -| Name | Type | Description | -| -------------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | -| `text` | str | Verbatim text content. | -| `text_with_ws` | str | Text content, with trailing space character if present. | -| `whitespace_` | str | Trailing space character if present. | -| `orth` | int | ID of the verbatim text content. | -| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | -| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | -| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | -| `head` | `Token` | The syntactic parent, or "governor", of this token. | -| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | -| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | -| `i` | int | The index of the token within the parent document. | -| `ent_type` | int | Named entity type. | -| `ent_type_` | str | Named entity type. | -| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | -| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | -| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | -| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | -| `lemma` | int | Base form of the token, with no inflectional suffixes. | -| `lemma_` | str | Base form of the token, with no inflectional suffixes. | -| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | -| `lower` | int | Lowercase form of the token. 
| -| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | -| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | -| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | -| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | -| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | -| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | -| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | -| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | -| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | -| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | -| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | -| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | -| `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | -| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | -| `is_bracket` | bool | Is the token a bracket? | -| `is_quote` | bool | Is the token a quotation mark? | -| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | -| `like_url` | bool | Does the token resemble a URL? | -| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | -| `like_email` | bool | Does the token resemble an email address? | -| `is_oov` | bool | Does the token have a word vector? | -| `is_stop` | bool | Is the token part of a "stop list"? | -| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | -| `tag` | int | Fine-grained part-of-speech. | -| `tag_` | str | Fine-grained part-of-speech. | -| `morph` | `MorphAnalysis` | Morphological analysis. | -| `morph_` | str | Morphological analysis in UD FEATS format. | -| `dep` | int | Syntactic dependency relation. | -| `dep_` | str | Syntactic dependency relation. | -| `lang` | int | Language of the parent document's vocabulary. | -| `lang_` | str | Language of the parent document's vocabulary. | -| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | -| `idx` | int | The character offset of the token within the parent document. | -| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. 
| -| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | -| `cluster` | int | Brown cluster ID. | -| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | +| Name | Type | Description | +| -------------------------------------------- | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `sent` 2.0.12 | `Span` | The sentence span that this token is a part of. | +| `text` | str | Verbatim text content. | +| `text_with_ws` | str | Text content, with trailing space character if present. | +| `whitespace_` | str | Trailing space character if present. | +| `orth` | int | ID of the verbatim text content. | +| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | +| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | +| `tensor` 2.1.7 | `ndarray` | The tokens's slice of the parent `Doc`'s tensor. | +| `head` | `Token` | The syntactic parent, or "governor", of this token. | +| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | +| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | +| `i` | int | The index of the token within the parent document. | +| `ent_type` | int | Named entity type. | +| `ent_type_` | str | Named entity type. | +| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | +| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | +| `ent_kb_id` 2.2 | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_kb_id_` 2.2 | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | +| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | +| `lemma` | int | Base form of the token, with no inflectional suffixes. | +| `lemma_` | str | Base form of the token, with no inflectional suffixes. | +| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | +| `lower` | int | Lowercase form of the token. | +| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. 
| +| `shape` | int | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `shape_` | str | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. | +| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | +| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | +| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | +| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | +| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | +| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | +| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | +| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | +| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | +| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | +| `is_punct` | bool | Is the token punctuation? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | +| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | +| `is_bracket` | bool | Is the token a bracket? | +| `is_quote` | bool | Is the token a quotation mark? | +| `is_currency` 2.0.8 | bool | Is the token a currency symbol? | +| `like_url` | bool | Does the token resemble a URL? | +| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | +| `like_email` | bool | Does the token resemble an email address? | +| `is_oov` | bool | Does the token have a word vector? | +| `is_stop` | bool | Is the token part of a "stop list"? | +| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | +| `tag` | int | Fine-grained part-of-speech. | +| `tag_` | str | Fine-grained part-of-speech. | +| `morph` | `MorphAnalysis` | Morphological analysis. | +| `morph_` | str | Morphological analysis in UD FEATS format. | +| `dep` | int | Syntactic dependency relation. | +| `dep_` | str | Syntactic dependency relation. | +| `lang` | int | Language of the parent document's vocabulary. | +| `lang_` | str | Language of the parent document's vocabulary. | +| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | +| `idx` | int | The character offset of the token within the parent document. | +| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | +| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. 
| +| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | +| `cluster` | int | Brown cluster ID. | +| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 47e5aa9b3..02023cf9f 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -34,16 +34,15 @@ the > tokenizer = nlp.tokenizer > ``` -| Name | Type | Description | -| ---------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `rules` | dict | Exceptions and special-cases for the tokenizer. | -| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | -| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | -| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | -| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | -| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | -| **RETURNS** | `Tokenizer` | The newly constructed object. | +| Name | Type | Description | +| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ | +| `vocab` | `Vocab` | A storage container for lexical types. | +| `rules` | dict | Exceptions and special-cases for the tokenizer. | +| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | +| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | +| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | +| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | +| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | ## Tokenizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md new file mode 100644 index 000000000..aab02fe68 --- /dev/null +++ b/website/docs/api/transformer.md @@ -0,0 +1,107 @@ +--- +title: Transformer +teaser: Pipeline component for multi-task learning with transformer models +tag: class +source: github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py +new: 3 +api_base_class: /api/pipe +api_string_name: transformer +--- + +> #### Installation +> +> ```bash +> $ pip install spacy-transformers +> ``` + + + +This component is available via the extension package +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). It +exposes the component via entry points, so if you have the package installed, +using `factory = "transformer"` in your +[training config](/usage/training#config) or `nlp.add_pipe("transformer")` will +work out-of-the-box. 
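The snippet below is a minimal sketch of the out-of-the-box usage described above; it assumes `spacy-transformers` is installed in the same environment as a spaCy v3 nightly, so that the package's entry points register the `"transformer"` factory.

```python
# Minimal sketch: requires the spacy-transformers package, whose entry points
# register the "transformer" factory with spaCy.
import spacy

nlp = spacy.blank("en")
trf = nlp.add_pipe("transformer")  # uses the component's default config
print(nlp.pipe_names)              # ['transformer']
```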
+ + + +This pipeline component lets you use transformer models in your pipeline. The +component assigns the output of the transformer to the Doc's extension +attributes. We also calculate an alignment between the word-piece tokens and the +spaCy tokenization, so that we can use the last hidden states to set the +`Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy +token, the spaCy token receives the sum of their values. To access the values, +you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For +more details, see the [usage documentation](/usage/transformers). + +## Config and implementation {#config} + +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). See the +[model architectures](/api/architectures) documentation for details on the +architectures and their arguments and hyperparameters. + +> #### Example +> +> ```python +> from spacy_transformers import Transformer, DEFAULT_CONFIG +> +> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) +> ``` + +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | | [`null_annotation_setter`](/api/transformer#null_annotation_setter) | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | + +```python +https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py +``` + +## Transformer.\_\_init\_\_ {#init tag="method"} + +> #### Example +> +> ```python +> # Construction via add_pipe with default model +> trf = nlp.add_pipe("transformer") +> +> # Construction via add_pipe with custom model +> config = {"model": {"@architectures": "my_transformer"}} +> trf = nlp.add_pipe("transformer", config=config) +> +> # Construction from class +> from spacy_transformers import Transformer +> trf = Transformer(nlp.vocab, model) +> ``` + +Create a new pipeline instance. In your application, you would normally use a +shortcut for this and instantiate the component using its string name and +[`nlp.add_pipe`](/api/language#create_pipe). + +| Name | Type | Description | +| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `annotation_setter` | `Callable` | | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. 
| + + + +## TransformerData {#transformerdata tag="dataclass"} + +## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} + +## Custom attributes {#custom-attributes} + +The component sets the following +[custom extension attributes](/usage/processing-pipeline#custom-components-attributes): + +| Name | Type | Description | +| -------------- | ----------------- | -------------- | +| `Doc.trf_data` | `TransformerData` | | diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index a0f7ef88b..bfb49e9a2 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -37,7 +37,6 @@ you can add vectors to later. | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `keys` | iterable | A sequence of keys aligned with the data. | | `name` | str | A name to identify the vectors table. | -| **RETURNS** | `Vectors` | The newly created object. | ## Vectors.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index af9feb82c..c68af2047 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -31,7 +31,6 @@ Create the vocabulary. | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` 2.2 | str | A name to identify the vectors table. | -| **RETURNS** | `Vocab` | The newly constructed object. | ## Vocab.\_\_len\_\_ {#len tag="method"} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index c54165e72..d5ce4e891 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -3,4 +3,154 @@ title: Transformers teaser: Using transformer models like BERT in spaCy --- -TODO: ... +spaCy v3.0 lets you use almost **any statistical model** to power your pipeline. +You can use models implemented in a variety of frameworks, including TensorFlow, +PyTorch and MXNet. To keep things sane, spaCy expects models from these +frameworks to be wrapped with a common interface, using our machine learning +library [Thinc](https://thinc.ai). A transformer model is just a statistical +model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: we just have to provide a few functions +that do the required plumbing. We also provide a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. + + + +Try out a BERT-based model pipeline using this project template: swap in your +data, edit the settings and hyperparameters and train, evaluate, package and +visualize your model. + + + + diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index 24803e953..1f13b6328 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -31,18 +31,35 @@ raise errors. Many of them were also mostly internals. If you've been working with more recent versions of spaCy v2.x, it's **unlikely** that your code relied on them. 
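As a concrete illustration of one row in the replacement table that follows, here is a hedged sketch of migrating from the removed `Doc.merge`/`Span.merge` to the `Doc.retokenize` context manager; it assumes an `nlp` pipeline is already loaded.

```python
# Sketch of one migration from the table below: Doc.merge/Span.merge were
# removed, and the retokenizer context manager takes their place.
doc = nlp("I live in New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[3:5])   # merge "New York" into a single token
print([t.text for t in doc])      # ['I', 'live', 'in', 'New York']
```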
-| Removed | Replacement | -| ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) | -| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) | -| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) | -| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | -| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | -| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | -| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer), | +| Removed | Replacement | +| ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Doc.tokens_from_list` | [`Doc.__init__`](/api/doc#init) | +| `Doc.merge`, `Span.merge` | [`Doc.retokenize`](/api/doc#retokenize) | +| `Token.string`, `Span.string`, `Span.upper`, `Span.lower` | [`Span.text`](/api/span#attributes), [`Token.text`](/api/token#attributes) | +| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | +| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | +| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | +| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentenceregognizer) | ## Migrating from v2.x {#migrating} +### Downloading and loading models {#migrating-downloading-models} + +Model symlinks and shortcuts like `en` are now officially deprecated. There are +[many different models](/models) with different capabilities and not just one +"English model". In order to download and load a model, you should always use +its full name – for instance, `en_core_web_sm`. + +```diff +- python -m spacy download en ++ python -m spacy download en_core_web_sm +``` + +```diff +- nlp = spacy.load("en") ++ nlp = spacy.load("en_core_web_sm") +``` + ### Custom pipeline components and factories {#migrating-pipeline-components} Custom pipeline components now have to be registered explicitly using the @@ -179,6 +196,10 @@ workflows, from data preprocessing to training and packaging your model. 
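The sketch below illustrates the explicit registration described in the migration section above; the component name `clean_whitespace` and the `nlp` object are assumptions for the example only.

```python
# Hedged sketch of registering a stateless function component in v3 and
# adding it to the pipeline by its string name.
from spacy.language import Language

@Language.component("clean_whitespace")
def clean_whitespace(doc):
    # receives the Doc, may modify it in place, and must return it
    return doc

nlp.add_pipe("clean_whitespace", last=True)
```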
+#### Training via the Python API {#migrating-training-python} + + + #### Packaging models {#migrating-training-packaging} The [`spacy package`](/api/cli#package) command now automatically builds the diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 96e1ea8d6..0795eecc9 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -81,6 +81,7 @@ "items": [ { "text": "Tokenizer", "url": "/api/tokenizer" }, { "text": "Tok2Vec", "url": "/api/tok2vec" }, + { "text": "Transformer", "url": "/api/transformer" }, { "text": "Lemmatizer", "url": "/api/lemmatizer" }, { "text": "Morphologizer", "url": "/api/morphologizer" }, { "text": "Tagger", "url": "/api/tagger" }, diff --git a/website/src/components/link.js b/website/src/components/link.js index a2ab46476..de4edba27 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -33,11 +33,12 @@ const Link = ({ const isApi = !external && !hidden && !hideIcon && /^\/?api/.test(dest) const isArch = !external && !hidden && !hideIcon && /^\/?api\/architectures#/.test(dest) const isSource = external && !hidden && !hideIcon && /(github.com)/.test(dest) - const sourceWithText = (isSource || isApi) && isString(children) + const withIcon = isApi || isArch || isSource + const sourceWithText = withIcon && isString(children) const linkClassNames = classNames(classes.root, className, { [classes.hidden]: hidden, - [classes.nowrap]: (isApi || isSource || isArch) && !sourceWithText, - [classes.withIcon]: isApi || isSource || isArch, + [classes.nowrap]: (withIcon && !sourceWithText) || isArch, + [classes.withIcon]: withIcon, }) const Wrapper = ws ? Whitespace : Fragment const icon = isArch ? 'network' : isApi ? 'docs' : isSource ? 'code' : null diff --git a/website/src/components/util.js b/website/src/components/util.js index 1935a8085..844f2c133 100644 --- a/website/src/components/util.js +++ b/website/src/components/util.js @@ -22,6 +22,7 @@ export const headingTextClassName = 'heading-text' * @returns {string} - URL to the file on GitHub. */ export function github(filepath, branch = 'master') { + if (filepath && filepath.startsWith('github.com')) return `https://${filepath}` const path = filepath ? '/tree/' + (branch || 'master') + '/' + filepath : '' return `https://github.com/${repo}${path}` } From cb9654e98c6d2fe34cedd7d8dc43e233d133ba84 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 00:52:50 +0200 Subject: [PATCH 07/55] WIP on new StaticVectors --- spacy/ml/models/tok2vec.py | 46 ++++++++++++++++++++------------------ spacy/util.py | 27 ++++++++++++++-------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1766fa80e..caa9c467c 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,7 +1,9 @@ +from typing import Optional, List from thinc.api import chain, clone, concatenate, with_array, uniqued from thinc.api import Model, noop, with_padded, Maxout, expand_window from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM from thinc.api import residual, LayerNorm, FeatureExtractor, Mish +from thinc.types import Floats2d from ... 
import util from ...util import registry @@ -42,15 +44,15 @@ def Doc2Feats(columns): @registry.architectures.register("spacy.HashEmbedCNN.v1") def hash_embed_cnn( - pretrained_vectors, - width, - depth, - embed_size, - maxout_pieces, - window_size, - subword_features, - dropout, -): + pretrained_vectors: str, + width: int, + depth: int, + embed_size: int, + maxout_pieces: int, + window_size: int, + subword_features: bool, + dropout: float, +) -> Model[List[Doc], List[Floats2d]: # Does not use character embeddings: set to False by default return build_Tok2Vec_model( width=width, @@ -182,7 +184,7 @@ def MultiHashEmbed( if pretrained_vectors: glove = StaticVectors( - vectors=pretrained_vectors.data, + vectors_name=pretrained_vectors, nO=width, column=columns.index(ID), dropout=dropout, @@ -261,18 +263,18 @@ def TorchBiLSTMEncoder(width, depth): def build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors, - window_size, - maxout_pieces, - subword_features, - char_embed, - nM, - nC, - conv_depth, - bilstm_depth, - dropout, + width: int, + embed_size: int, + pretrained_vectors: Optional[str], + window_size: int, + maxout_pieces: int, + subword_features: bool, + char_embed: bool, + nM: int, + nC: int, + conv_depth: int, + bilstm_depth: int, + dropout: float, ) -> Model: if char_embed: subword_features = False diff --git a/spacy/util.py b/spacy/util.py index d1951145f..de6d9831b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -24,6 +24,8 @@ import tempfile import shutil import shlex import inspect +from thinc.types import Unserializable + try: import cupy.random @@ -1184,20 +1186,27 @@ class DummyTokenizer: return self -def link_vectors_to_models(vocab: "Vocab") -> None: +def link_vectors_to_models( + vocab: "Vocab", + models: List[Model]=[], + *, + vectors_name_attr="vectors_name", + vectors_attr="vectors", + key2row_attr="key2row", + default_vectors_name="spacy_pretrained_vectors" +) -> None: + """Supply vectors data to models.""" vectors = vocab.vectors if vectors.name is None: - vectors.name = VECTORS_KEY + vectors.name = default_vectors_name if vectors.data.size != 0: warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) - for word in vocab: - if word.orth in vectors.key2row: - word.rank = vectors.key2row[word.orth] - else: - word.rank = 0 - -VECTORS_KEY = "spacy_pretrained_vectors" + for model in models: + for node in model.walk(): + if node.attrs.get(vectors_name_attr) == vectors.name: + node.attrs[vectors_attr] = Unserializable(vectors.data) + node.attrs[key2row_attr] = Unserializable(vectors.key2row) def create_default_optimizer() -> Optimizer: From 9cc72622248808a7cd6807ed0d2f3afbfef4770b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 12:17:09 +0200 Subject: [PATCH 08/55] Draft StaticVectors layer --- spacy/ml/staticvectors.py | 98 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 spacy/ml/staticvectors.py diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py new file mode 100644 index 000000000..4c9e53563 --- /dev/null +++ b/spacy/ml/staticvectors.py @@ -0,0 +1,98 @@ +from typing import List, Tuple, Callable, Optional, cast + +from thinc.initializers import glorot_uniform_init +from thinc.util import partial +from thinc.types import Ragged, Floats2d, Floats1d +from thinc.api import Model, Ops, registry + +from ..tokens import Doc + + +@registry.layers("spacy.StaticVectors.v1") +def StaticVectors( + nO: Optional[int] = None, + nM: Optional[int] = None, + *, + dropout: Optional[float] = None, 
+ init_W: Callable = glorot_uniform_init, + key_attr: str="ORTH" +) -> Model[List[Doc], Ragged]: + """Embed Doc objects with their vocab's vectors table, applying a learned + linear projection to control the dimensionality. If a dropout rate is + specified, the dropout is applied per dimension over the whole batch. + """ + return Model( + "static_vectors", + forward, + init=partial(init, init_W), + params={"W": None}, + attrs={"key_attr": key_attr, "dropout_rate": dropout}, + dims={"nO": nO, "nM": nM}, + ) + + +def forward( + model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool +) -> Tuple[Ragged, Callable]: + if not len(docs): + return _handle_empty(model.ops, model.get_dim("nO")) + key_attr = model.attrs["key_attr"] + W = cast(Floats2d, model.get_param("W")) + V = cast(Floats2d, docs[0].vocab.vectors.data) + mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) + + rows = model.ops.flatten( + [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] + ) + output = Ragged( + model.ops.gemm(V[rows], W, trans2=True), + model.ops.asarray([len(doc) for doc in docs], dtype="i") + ) + if mask is not None: + output.data *= mask + + def backprop(d_output: Ragged) -> List[Doc]: + if mask is not None: + d_output.data *= mask + model.inc_grad("W", model.ops.gemm(d_output.data, V[rows], trans1=True)) + return [] + + return output, backprop + + +def init( + init_W: Callable, + model: Model[List[Doc], Ragged], + X: Optional[List[Doc]] = None, + Y: Optional[Ragged] = None, +) -> Model[List[Doc], Ragged]: + nM = model.get_dim("nM") if model.has_dim("nM") else None + nO = model.get_dim("nO") if model.has_dim("nO") else None + if X is not None and len(X): + nM = X[0].vocab.vectors.data.shape[1] + if Y is not None: + nO = Y.data.shape[1] + + if nM is None: + raise ValueError( + "Cannot initialize StaticVectors layer: nM dimension unset. " + "This dimension refers to the width of the vectors table." + ) + if nO is None: + raise ValueError( + "Cannot initialize StaticVectors layer: nO dimension unset. " + "This dimension refers to the output width, after the linear " + "projection has been applied." 
+ ) + model.set_dim("nM", nM) + model.set_dim("nO", nO) + model.set_param("W", init_W(model.ops, (nO, nM))) + return model + + +def _handle_empty(ops: Ops, nO: int): + return Ragged(ops.alloc2f(0, nO), ops.alloc1i(0)), lambda d_ragged: [] + + +def _get_drop_mask(ops: Ops, nO: int, rate: Optional[float]) -> Optional[Floats1d]: + return ops.get_dropout_mask((nO,), rate) if rate is not None else None From c6b4f63c7c96a8c1dd52bb3afc1aade8fbfdfc3a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 12:18:28 +0200 Subject: [PATCH 09/55] Remove obsolete function --- spacy/ml/spacy_vectors.py | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 spacy/ml/spacy_vectors.py diff --git a/spacy/ml/spacy_vectors.py b/spacy/ml/spacy_vectors.py deleted file mode 100644 index 2a4988494..000000000 --- a/spacy/ml/spacy_vectors.py +++ /dev/null @@ -1,27 +0,0 @@ -import numpy -from thinc.api import Model, Unserializable - - -def SpacyVectors(vectors) -> Model: - attrs = {"vectors": Unserializable(vectors)} - model = Model("spacy_vectors", forward, attrs=attrs) - return model - - -def forward(model, docs, is_train: bool): - batch = [] - vectors = model.attrs["vectors"].obj - for doc in docs: - indices = numpy.zeros((len(doc),), dtype="i") - for i, word in enumerate(doc): - if word.orth in vectors.key2row: - indices[i] = vectors.key2row[word.orth] - else: - indices[i] = 0 - batch_vectors = vectors.data[indices] - batch.append(batch_vectors) - - def backprop(dY): - return None - - return batch, backprop From 123f8b832d7c00e5479e3d814bbaadb54ba54966 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 13:51:43 +0200 Subject: [PATCH 10/55] Refactor Tok2Vec model --- spacy/ml/models/tok2vec.py | 365 ++++++------------------------------- 1 file changed, 57 insertions(+), 308 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index caa9c467c..4bcd61625 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,8 +1,8 @@ from typing import Optional, List -from thinc.api import chain, clone, concatenate, with_array, uniqued -from thinc.api import Model, noop, with_padded, Maxout, expand_window -from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM -from thinc.api import residual, LayerNorm, FeatureExtractor, Mish +from thinc.api import chain, clone, concatenate, with_array, with_padded +from thinc.api import Model, noop +from thinc.api import FeatureExtractor, HashEmbed, StaticVectors +from thincapi import expand_window, residual, Maxout, Mish from thinc.types import Floats2d from ... 
import util @@ -12,199 +12,72 @@ from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE -@registry.architectures.register("spacy.Tok2VecTensors.v1") -def tok2vec_tensors_v1(width, upstream="*"): +@registry.architectures.register("spacy.Tok2VecListener.v1") +def tok2vec_listener_v1(width, upstream="*"): tok2vec = Tok2VecListener(upstream_name=upstream, width=width) return tok2vec -@registry.architectures.register("spacy.VocabVectors.v1") -def get_vocab_vectors(name): - nlp = util.load_model(name) - return nlp.vocab.vectors - - @registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec(extract, embed, encode): - field_size = 0 - if encode.attrs.get("receptive_field", None): - field_size = encode.attrs["receptive_field"] - with Model.define_operators({">>": chain, "|": concatenate}): - tok2vec = extract >> with_array(embed >> encode, pad=field_size) +def Tok2Vec( + embed: Model[List[Doc], List[Floats2d]], + encode: Model[List[Floats2d], List[Floats2d] +) -> Model[List[Doc], List[Floats2d]]: + tok2vec = with_array( + chain(embed, encode), + pad=encode.attrs.get("receptive_field", 0) + ) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) tok2vec.set_ref("encode", encode) return tok2vec -@registry.architectures.register("spacy.Doc2Feats.v1") -def Doc2Feats(columns): - return FeatureExtractor(columns) - - -@registry.architectures.register("spacy.HashEmbedCNN.v1") -def hash_embed_cnn( - pretrained_vectors: str, +@registry.architectures.register("spacy.HashEmbed.v1") +def HashEmbed( width: int, - depth: int, - embed_size: int, - maxout_pieces: int, - window_size: int, - subword_features: bool, - dropout: float, -) -> Model[List[Doc], List[Floats2d]: - # Does not use character embeddings: set to False by default - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - bilstm_depth=0, - maxout_pieces=maxout_pieces, - window_size=window_size, - subword_features=subword_features, - char_embed=False, - nM=0, - nC=0, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashCharEmbedCNN.v1") -def hash_charembed_cnn( - pretrained_vectors, - width, - depth, - embed_size, - maxout_pieces, - window_size, - nM, - nC, - dropout, + rows: int, + also_embed_subwords: bool, + also_use_static_vectors: bool ): - # Allows using character embeddings by setting nC, nM and char_embed=True - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - conv_depth=depth, - bilstm_depth=0, - maxout_pieces=maxout_pieces, - window_size=window_size, - subword_features=False, - char_embed=True, - nM=nM, - nC=nC, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashEmbedBiLSTM.v1") -def hash_embed_bilstm_v1( - pretrained_vectors, - width, - depth, - embed_size, - subword_features, - maxout_pieces, - dropout, -): - # Does not use character embeddings: set to False by default - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - maxout_pieces=maxout_pieces, - window_size=1, - subword_features=subword_features, - char_embed=False, - nM=0, - nC=0, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") -def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout -): - # Allows using character embeddings by setting 
nC, nM and char_embed=True - return build_Tok2Vec_model( - width=width, - embed_size=embed_size, - pretrained_vectors=pretrained_vectors, - bilstm_depth=depth, - conv_depth=0, - maxout_pieces=maxout_pieces, - window_size=1, - subword_features=False, - char_embed=True, - nM=nM, - nC=nC, - dropout=dropout, - ) - - -@registry.architectures.register("spacy.LayerNormalizedMaxout.v1") -def LayerNormalizedMaxout(width, maxout_pieces): - return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True) - - -@registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed( - columns, width, rows, use_subwords, pretrained_vectors, mix, dropout -): - norm = HashEmbed( - nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=6 - ) - if use_subwords: - prefix = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("PREFIX"), - dropout=dropout, - seed=7, + cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] + + seed = 7 + def make_hash_embed(feature): + nonlocal seed + seed += 1 + return HashEmbed( + width, + rows if feature == NORM else rows // 2, + column=cols.index(feature), + seed=seed ) - suffix = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("SUFFIX"), - dropout=dropout, - seed=8, + + if also_embed_subwords: + embeddings = [ + make_hash_embed(NORM) + make_hash_embed(PREFIX) + make_hash_embed(SUFFIX) + make_hash_embed(SHAPE) + ] + else: + embeddings = [make_hash_embed(NORM)] + + if also_use_static_vectors: + model = chain( + concatenate( + chain(FeatureExtractor(cols), concatenate(*embeddings)), + StaticVectors(width, dropout=dropout) + ), + Maxout(width, dropout=dropout, normalize=True) ) - shape = HashEmbed( - nO=width, - nV=rows // 2, - column=columns.index("SHAPE"), - dropout=dropout, - seed=9, + else: + model = chain( + chain(FeatureExtractor(cols), concatenate(*embeddings)), + Maxout(width, concat_size, dropout=dropout, normalize=True) ) - - if pretrained_vectors: - glove = StaticVectors( - vectors_name=pretrained_vectors, - nO=width, - column=columns.index(ID), - dropout=dropout, - ) - - with Model.define_operators({">>": chain, "|": concatenate}): - if not use_subwords and not pretrained_vectors: - embed_layer = norm - else: - if use_subwords and pretrained_vectors: - concat_columns = glove | norm | prefix | suffix | shape - elif use_subwords: - concat_columns = norm | prefix | suffix | shape - else: - concat_columns = glove | norm - - embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) - - return embed_layer - + return model + @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): @@ -219,7 +92,7 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): +def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: int): cnn = chain( expand_window(window_size=window_size), Maxout( @@ -249,133 +122,9 @@ def MishWindowEncoder(width, window_size, depth): @registry.architectures.register("spacy.TorchBiLSTMEncoder.v1") -def TorchBiLSTMEncoder(width, depth): - import torch.nn - - # TODO FIX - from thinc.api import PyTorchRNNWrapper - +def BiLSTMEncoder(width, depth, dropout): if depth == 0: return noop() return with_padded( - PyTorchRNNWrapper(torch.nn.LSTM(width, width // 2, depth, bidirectional=True)) + PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout) ) - - -def 
build_Tok2Vec_model( - width: int, - embed_size: int, - pretrained_vectors: Optional[str], - window_size: int, - maxout_pieces: int, - subword_features: bool, - char_embed: bool, - nM: int, - nC: int, - conv_depth: int, - bilstm_depth: int, - dropout: float, -) -> Model: - if char_embed: - subword_features = False - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed( - nO=width, nV=embed_size, column=cols.index(NORM), dropout=None, seed=0 - ) - if subword_features: - prefix = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(PREFIX), - dropout=None, - seed=1, - ) - suffix = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(SUFFIX), - dropout=None, - seed=2, - ) - shape = HashEmbed( - nO=width, - nV=embed_size // 2, - column=cols.index(SHAPE), - dropout=None, - seed=3, - ) - else: - prefix, suffix, shape = (None, None, None) - if pretrained_vectors is not None: - glove = StaticVectors( - vectors=pretrained_vectors.data, - nO=width, - column=cols.index(ID), - dropout=dropout, - ) - - if subword_features: - columns = 5 - embed = uniqued( - (glove | norm | prefix | suffix | shape) - >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - else: - columns = 2 - embed = uniqued( - (glove | norm) - >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - elif subword_features: - columns = 4 - embed = uniqued( - concatenate(norm, prefix, suffix, shape) - >> Maxout( - nO=width, nI=width * columns, nP=3, dropout=0.0, normalize=True, - ), - column=cols.index(ORTH), - ) - elif char_embed: - embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) | FeatureExtractor( - cols - ) >> with_array(norm) - reduce_dimensions = Maxout( - nO=width, nI=nM * nC + width, nP=3, dropout=0.0, normalize=True, - ) - else: - embed = norm - - convolution = residual( - expand_window(window_size=window_size) - >> Maxout( - nO=width, - nI=width * ((window_size * 2) + 1), - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ) - ) - if char_embed: - tok2vec = embed >> with_array( - reduce_dimensions >> convolution ** conv_depth, pad=conv_depth - ) - else: - tok2vec = FeatureExtractor(cols) >> with_array( - embed >> convolution ** conv_depth, pad=conv_depth - ) - - if bilstm_depth >= 1: - tok2vec = tok2vec >> PyTorchLSTM( - nO=width, nI=width, depth=bilstm_depth, bi=True - ) - if tok2vec.has_dim("nO") is not False: - tok2vec.set_dim("nO", width) - tok2vec.set_ref("embed", embed) - return tok2vec From 034d803b7a4f0118b1d981554e62a1cf0a0da721 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 13:52:05 +0200 Subject: [PATCH 11/55] Update ptb config --- .../ptb-joint-pos-dep/defaults.cfg | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index d694ceac8..48741c433 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -64,7 +64,7 @@ min_action_freq = 1 @architectures = "spacy.Tagger.v1" [components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" +@architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model:width} [components.parser.model] @@ -74,16 +74,21 @@ hidden_width = 64 maxout_pieces = 3 [components.parser.model.tok2vec] 
-@architectures = "spacy.Tok2VecTensors.v1" +@architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model:width} [components.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${training:vectors} +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.HashEmbed.v1" width = 96 +rows = 2000 +also_use_subwords = true +also_use_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncode.v1" depth = 4 window_size = 1 -embed_size = 2000 maxout_pieces = 3 -subword_features = true -dropout = null From fe0cdcd461f4c77db4ce120b3dee2d960d83d605 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 13:59:46 +0200 Subject: [PATCH 12/55] Fixes --- spacy/ml/models/tok2vec.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 4bcd61625..4c4bd0d22 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -50,7 +50,8 @@ def HashEmbed( width, rows if feature == NORM else rows // 2, column=cols.index(feature), - seed=seed + seed=seed, + dropout=0.0 ) if also_embed_subwords: @@ -67,14 +68,14 @@ def HashEmbed( model = chain( concatenate( chain(FeatureExtractor(cols), concatenate(*embeddings)), - StaticVectors(width, dropout=dropout) + StaticVectors(width, dropout=0.0) ), - Maxout(width, dropout=dropout, normalize=True) + Maxout(width, pieces=3, dropout=0.0, normalize=True) ) else: model = chain( chain(FeatureExtractor(cols), concatenate(*embeddings)), - Maxout(width, concat_size, dropout=dropout, normalize=True) + Maxout(width, pieces=3, dropout=0.0, normalize=True) ) return model From 099e9331c50e92ba96bd3cc23498a023492965a7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:51:40 +0200 Subject: [PATCH 13/55] Fix tok2vec --- spacy/ml/models/tok2vec.py | 45 ++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 4c4bd0d22..448f9d1d0 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,13 +1,15 @@ from typing import Optional, List from thinc.api import chain, clone, concatenate, with_array, with_padded -from thinc.api import Model, noop -from thinc.api import FeatureExtractor, HashEmbed, StaticVectors -from thincapi import expand_window, residual, Maxout, Mish +from thinc.api import Model, noop, list2ragged, ragged2list +from thinc.api import FeatureExtractor, HashEmbed +from thinc.api import expand_window, residual, Maxout, Mish from thinc.types import Floats2d +from ...tokens import Doc from ... 
import util from ...util import registry from ...ml import _character_embed +from ..staticvectors import StaticVectors from ...pipeline.tok2vec import Tok2VecListener from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE @@ -21,20 +23,19 @@ def tok2vec_listener_v1(width, upstream="*"): @registry.architectures.register("spacy.Tok2Vec.v1") def Tok2Vec( embed: Model[List[Doc], List[Floats2d]], - encode: Model[List[Floats2d], List[Floats2d] + encode: Model[List[Floats2d], List[Floats2d]] ) -> Model[List[Doc], List[Floats2d]]: - tok2vec = with_array( - chain(embed, encode), - pad=encode.attrs.get("receptive_field", 0) - ) + + receptive_field = encode.attrs.get("receptive_field", 0) + tok2vec = chain(embed, with_array(encode, pad=receptive_field)) tok2vec.set_dim("nO", encode.get_dim("nO")) tok2vec.set_ref("embed", embed) tok2vec.set_ref("encode", encode) return tok2vec -@registry.architectures.register("spacy.HashEmbed.v1") -def HashEmbed( +@registry.architectures.register("spacy.MultiHashEmbed.v1") +def MultiHashEmbed( width: int, rows: int, also_embed_subwords: bool, @@ -56,9 +57,9 @@ def HashEmbed( if also_embed_subwords: embeddings = [ - make_hash_embed(NORM) - make_hash_embed(PREFIX) - make_hash_embed(SUFFIX) + make_hash_embed(NORM), + make_hash_embed(PREFIX), + make_hash_embed(SUFFIX), make_hash_embed(SHAPE) ] else: @@ -67,15 +68,25 @@ def HashEmbed( if also_use_static_vectors: model = chain( concatenate( - chain(FeatureExtractor(cols), concatenate(*embeddings)), + chain( + FeatureExtractor(cols), + list2ragged(), + with_array(concatenate(*embeddings)) + ), StaticVectors(width, dropout=0.0) ), - Maxout(width, pieces=3, dropout=0.0, normalize=True) + with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), + ragged2list() ) else: model = chain( - chain(FeatureExtractor(cols), concatenate(*embeddings)), - Maxout(width, pieces=3, dropout=0.0, normalize=True) + chain( + FeatureExtractor(cols), + list2ragged(), + with_array(concatenate(*embeddings)) + ), + with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), + ragged2list() ) return model From 9987ea9e4ddd920a8035c097b3360a80560352cb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:52:02 +0200 Subject: [PATCH 14/55] Fix Tok2Vec begin_training --- spacy/pipeline/tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5bda12d1b..5caaf432f 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -196,7 +196,7 @@ class Tok2Vec(Pipe): DOCS: https://spacy.io/api/tok2vec#begin_training """ - docs = [Doc(Vocab(), words=["hello"])] + docs = [Doc(self.vocab, words=["hello"])] self.model.initialize(X=docs) link_vectors_to_models(self.vocab) From acc64e138aa4d334e58933f885debe763952b3e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:52:20 +0200 Subject: [PATCH 15/55] Add import --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index de6d9831b..72e68463b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 +7,7 @@ import importlib.util import re from pathlib import Path import thinc -from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer +from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model import functools import itertools import numpy.random From 984754e3be65ddd0ed3ab77835b62ca67bb1266a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:52:30 +0200 Subject: 
[PATCH 16/55] Update config --- .../ptb-joint-pos-dep/defaults.cfg | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 48741c433..5850eaf3a 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -4,16 +4,16 @@ patience = 10000 eval_frequency = 200 dropout = 0.2 init_tok2vec = null -vectors = null +vectors = "tmp/fasttext_vectors/vocab" max_epochs = 100 orth_variant_level = 0.0 gold_preproc = true max_length = 0 -scores = ["tag_acc", "dep_uas", "dep_las"] +scores = ["tag_acc", "dep_uas", "dep_las", "speed"] score_weights = {"dep_las": 0.8, "tag_acc": 0.2} limit = 0 seed = 0 -accumulate_gradient = 2 +accumulate_gradient = 1 discard_oversize = false raw_text = null tag_map = null @@ -22,7 +22,7 @@ base_model = null eval_batch_size = 128 use_pytorch_for_gpu_memory = false -batch_by = "padded" +batch_by = "words" [training.batch_size] @schedules = "compounding.v1" @@ -65,7 +65,7 @@ min_action_freq = 1 [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model:width} +width = ${components.tok2vec.model.encode:width} [components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" @@ -75,20 +75,21 @@ maxout_pieces = 3 [components.parser.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model:width} +width = ${components.tok2vec.model.encode:width} [components.tok2vec.model] @architectures = "spacy.Tok2Vec.v1" [components.tok2vec.model.embed] -@architectures = "spacy.HashEmbed.v1" -width = 96 +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} rows = 2000 -also_use_subwords = true -also_use_static_vectors = false +also_embed_subwords = true +also_use_static_vectors = true [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncode.v1" +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 depth = 4 window_size = 1 maxout_pieces = 3 From 44d350dc9476aa897cbf8d9502f0b5c87a2efa89 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:52:46 +0200 Subject: [PATCH 17/55] Use spaCy's StaticVectors --- spacy/ml/models/textcat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index e5f4af2fb..a64a2487a 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -5,7 +5,6 @@ from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_ from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued from thinc.api import Relu, residual, expand_window, FeatureExtractor -from ..spacy_vectors import SpacyVectors from ... 
import util from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry From 475d7c1c7c4520b280fad01e2a3c8db5d60a594b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 15:52:55 +0200 Subject: [PATCH 18/55] Fix StaticVectors class --- spacy/ml/staticvectors.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index 4c9e53563..ce2c7efff 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -37,15 +37,14 @@ def forward( if not len(docs): return _handle_empty(model.ops, model.get_dim("nO")) key_attr = model.attrs["key_attr"] - W = cast(Floats2d, model.get_param("W")) + W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) V = cast(Floats2d, docs[0].vocab.vectors.data) mask = _get_drop_mask(model.ops, W.shape[0], model.attrs.get("dropout_rate")) - rows = model.ops.flatten( [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs] ) output = Ragged( - model.ops.gemm(V[rows], W, trans2=True), + model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True), model.ops.asarray([len(doc) for doc in docs], dtype="i") ) if mask is not None: @@ -54,7 +53,14 @@ def forward( def backprop(d_output: Ragged) -> List[Doc]: if mask is not None: d_output.data *= mask - model.inc_grad("W", model.ops.gemm(d_output.data, V[rows], trans1=True)) + model.inc_grad( + "W", + model.ops.gemm( + d_output.data, + model.ops.as_contig(V[rows]), + trans1=True + ) + ) return [] return output, backprop From df95e2af64a6c3d2862e4317a450e8e694e2d406 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 21:56:02 +0200 Subject: [PATCH 19/55] Add load_vectors_into_model util --- spacy/util.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 72e68463b..4e3a8d203 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -189,6 +189,23 @@ def get_module_path(module: ModuleType) -> Path: return Path(sys.modules[module.__module__].__file__).parent +def load_vectors_into_model( + nlp: "Language", + name: Union[str, Path], + *, + add_strings=True +) -> None: + """Load word vectors from an installed model or path into a model instance.""" + vectors_nlp = load_model(name) + nlp.vocab.vectors = vectors_nlp.vocab.vectors + if add_strings: + # I guess we should add the strings from the vectors_nlp model? + # E.g. if someone does a similarity query, they might expect the strings. 
+ for key in nlp.vocab.vectors.key2row: + if key in vectors_nlp.strings: + nlp.vocab.strings.add(vectors_nlp.strings[key]) + + def load_model( name: Union[str, Path], disable: Iterable[str] = tuple(), From 30dd96c540ec07f3289649e8f946547407721eba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 21:56:28 +0200 Subject: [PATCH 20/55] Load vectors in Language.from_config --- spacy/language.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index 9dd8a347e..9fde419b3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1408,6 +1408,8 @@ class Language: nlp = cls( create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer, ) + if config["training"]["vectors"] is not None: + util.load_vectors_into_model(nlp, config["training"]["vectors"]) pipeline = config.get("components", {}) for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: From 7299419fe4cb68459bb300ad1c6c2b4885861db0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 21:59:30 +0200 Subject: [PATCH 21/55] Dont load vectors in Language.from_config --- spacy/language.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9fde419b3..3511a7691 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1408,8 +1408,10 @@ class Language: nlp = cls( create_tokenizer=create_tokenizer, create_lemmatizer=create_lemmatizer, ) - if config["training"]["vectors"] is not None: - util.load_vectors_into_model(nlp, config["training"]["vectors"]) + # Note that we don't load vectors here, instead they get loaded explicitly + # inside stuff like the spacy train function. If we loaded them here, + # then we would load them twice at runtime: once when we make from config, + # and then again when we load from disk. pipeline = config.get("components", {}) for pipe_name in config["nlp"]["pipeline"]: if pipe_name not in pipeline: From 7852a68a7530b36c512a54163a1511afe102b625 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 21:59:51 +0200 Subject: [PATCH 22/55] Fix load_vectors_into_model function --- spacy/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 4e3a8d203..7a26011f1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -202,8 +202,8 @@ def load_vectors_into_model( # I guess we should add the strings from the vectors_nlp model? # E.g. if someone does a similarity query, they might expect the strings. 
for key in nlp.vocab.vectors.key2row: - if key in vectors_nlp.strings: - nlp.vocab.strings.add(vectors_nlp.strings[key]) + if key in vectors_nlp.vocab.strings: + nlp.vocab.strings.add(vectors_nlp.vocab.strings[key]) def load_model( From 2aff3c4b5aff4f5e17fa67d07424580609719fad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:00:24 +0200 Subject: [PATCH 23/55] Load vectors in 'spacy train' --- spacy/cli/train.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index fbe3a5013..e152ae8ea 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -80,16 +80,20 @@ def train( msg.info("Using CPU") msg.info(f"Loading config and nlp from: {config_path}") config = Config().from_disk(config_path) + if config.get("training", {}).get("seed") is not None: + fix_random_seed(config["training"]["seed"]) with show_validation_error(): nlp, config = util.load_model_from_config(config, overrides=config_overrides) if config["training"]["base_model"]: - base_nlp = util.load_model(config["training"]["base_model"]) # TODO: do something to check base_nlp against regular nlp described in config? - nlp = base_nlp + # If everything matches it will look something like: + # base_nlp = util.load_model(config["training"]["base_model"]) + # nlp = base_nlp + raise NotImplementedError("base_model not supported yet.") + if config["training"]["vectors"] is not None: + util.load_vectors_into_model(nlp, config["training"]["vectors"]) verify_config(nlp) raw_text, tag_map, morph_rules, weights_data = load_from_paths(config) - if config["training"]["seed"] is not None: - fix_random_seed(config["training"]["seed"]) if config["training"]["use_pytorch_for_gpu_memory"]: # It feels kind of weird to not have a default for this. 
use_pytorch_for_gpu_memory() From 0c17ea4c851d2d5996447f1da8d6de2b601e5ec7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:02:34 +0200 Subject: [PATCH 24/55] Format --- spacy/ml/models/tok2vec.py | 32 ++++++++++++++------------------ spacy/ml/staticvectors.py | 14 +++++--------- spacy/util.py | 9 +++------ 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 448f9d1d0..f9183e709 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -23,7 +23,7 @@ def tok2vec_listener_v1(width, upstream="*"): @registry.architectures.register("spacy.Tok2Vec.v1") def Tok2Vec( embed: Model[List[Doc], List[Floats2d]], - encode: Model[List[Floats2d], List[Floats2d]] + encode: Model[List[Floats2d], List[Floats2d]], ) -> Model[List[Doc], List[Floats2d]]: receptive_field = encode.attrs.get("receptive_field", 0) @@ -36,14 +36,12 @@ def Tok2Vec( @registry.architectures.register("spacy.MultiHashEmbed.v1") def MultiHashEmbed( - width: int, - rows: int, - also_embed_subwords: bool, - also_use_static_vectors: bool + width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool ): cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] - + seed = 7 + def make_hash_embed(feature): nonlocal seed seed += 1 @@ -52,15 +50,15 @@ def MultiHashEmbed( rows if feature == NORM else rows // 2, column=cols.index(feature), seed=seed, - dropout=0.0 + dropout=0.0, ) - + if also_embed_subwords: embeddings = [ make_hash_embed(NORM), make_hash_embed(PREFIX), make_hash_embed(SUFFIX), - make_hash_embed(SHAPE) + make_hash_embed(SHAPE), ] else: embeddings = [make_hash_embed(NORM)] @@ -71,25 +69,25 @@ def MultiHashEmbed( chain( FeatureExtractor(cols), list2ragged(), - with_array(concatenate(*embeddings)) + with_array(concatenate(*embeddings)), ), - StaticVectors(width, dropout=0.0) + StaticVectors(width, dropout=0.0), ), with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), - ragged2list() + ragged2list(), ) else: model = chain( chain( FeatureExtractor(cols), list2ragged(), - with_array(concatenate(*embeddings)) + with_array(concatenate(*embeddings)), ), with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), - ragged2list() + ragged2list(), ) return model - + @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): @@ -137,6 +135,4 @@ def MishWindowEncoder(width, window_size, depth): def BiLSTMEncoder(width, depth, dropout): if depth == 0: return noop() - return with_padded( - PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout) - ) + return with_padded(PyTorchLSTM(width, width, bi=True, depth=depth, dropout=dropout)) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index ce2c7efff..41afdbf80 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -15,7 +15,7 @@ def StaticVectors( *, dropout: Optional[float] = None, init_W: Callable = glorot_uniform_init, - key_attr: str="ORTH" + key_attr: str = "ORTH" ) -> Model[List[Doc], Ragged]: """Embed Doc objects with their vocab's vectors table, applying a learned linear projection to control the dimensionality. 
If a dropout rate is @@ -45,21 +45,17 @@ def forward( ) output = Ragged( model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True), - model.ops.asarray([len(doc) for doc in docs], dtype="i") + model.ops.asarray([len(doc) for doc in docs], dtype="i"), ) if mask is not None: output.data *= mask - + def backprop(d_output: Ragged) -> List[Doc]: if mask is not None: d_output.data *= mask model.inc_grad( "W", - model.ops.gemm( - d_output.data, - model.ops.as_contig(V[rows]), - trans1=True - ) + model.ops.gemm(d_output.data, model.ops.as_contig(V[rows]), trans1=True), ) return [] @@ -78,7 +74,7 @@ def init( nM = X[0].vocab.vectors.data.shape[1] if Y is not None: nO = Y.data.shape[1] - + if nM is None: raise ValueError( "Cannot initialize StaticVectors layer: nM dimension unset. " diff --git a/spacy/util.py b/spacy/util.py index 7a26011f1..898e1c2c3 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -190,10 +190,7 @@ def get_module_path(module: ModuleType) -> Path: def load_vectors_into_model( - nlp: "Language", - name: Union[str, Path], - *, - add_strings=True + nlp: "Language", name: Union[str, Path], *, add_strings=True ) -> None: """Load word vectors from an installed model or path into a model instance.""" vectors_nlp = load_model(name) @@ -1205,12 +1202,12 @@ class DummyTokenizer: def link_vectors_to_models( vocab: "Vocab", - models: List[Model]=[], + models: List[Model] = [], *, vectors_name_attr="vectors_name", vectors_attr="vectors", key2row_attr="key2row", - default_vectors_name="spacy_pretrained_vectors" + default_vectors_name="spacy_pretrained_vectors", ) -> None: """Supply vectors data to models.""" vectors = vocab.vectors From 1784c95827f7a2fe8f8df88facced72af73cc961 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:17:47 +0200 Subject: [PATCH 25/55] Clean up link_vectors_to_models unused stuff --- spacy/cli/project/assets.py | 1 - spacy/language.py | 4 +--- spacy/pipeline/morphologizer.pyx | 1 - spacy/pipeline/multitask.pyx | 3 --- spacy/pipeline/pipe.pyx | 4 +--- spacy/pipeline/senter.pyx | 1 - spacy/pipeline/simple_ner.py | 1 - spacy/pipeline/tagger.pyx | 1 - spacy/pipeline/textcat.py | 1 - spacy/pipeline/tok2vec.py | 3 +-- spacy/syntax/_parser_model.pyx | 2 +- spacy/syntax/nn_parser.pyx | 3 +-- spacy/tests/regression/test_issue2501-3000.py | 2 -- spacy/util.py | 23 ------------------- spacy/vocab.pyx | 7 +----- 15 files changed, 6 insertions(+), 51 deletions(-) diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py index 1bd28cb7e..e42935e2f 100644 --- a/spacy/cli/project/assets.py +++ b/spacy/cli/project/assets.py @@ -11,7 +11,6 @@ from ...util import ensure_path, working_dir from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum - # TODO: find a solution for caches # CACHES = [ # Path.home() / ".torch", diff --git a/spacy/language.py b/spacy/language.py index 3511a7691..4b7651d65 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -21,7 +21,7 @@ from .vocab import Vocab, create_vocab from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .gold import Example from .scorer import Scorer -from .util import link_vectors_to_models, create_default_optimizer, registry +from .util import create_default_optimizer, registry from .util import SimpleFrozenDict, combine_score_weights from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES @@ -1049,7 +1049,6 @@ class Language: if 
self.vocab.vectors.data.shape[1] >= 1: ops = get_current_ops() self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - link_vectors_to_models(self.vocab) if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd @@ -1082,7 +1081,6 @@ class Language: ops = get_current_ops() if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data) - link_vectors_to_models(self.vocab) if sgd is None: sgd = create_default_optimizer() self._optimizer = sgd diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a6be129ba..56ef44cb9 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -149,7 +149,6 @@ class Morphologizer(Tagger): self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.set_output(len(self.labels)) self.model.initialize() - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 4945afe4f..97826aaa6 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -11,7 +11,6 @@ from .tagger import Tagger from ..language import Language from ..syntax import nonproj from ..attrs import POS, ID -from ..util import link_vectors_to_models from ..errors import Errors @@ -91,7 +90,6 @@ class MultitaskObjective(Tagger): if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() - link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd @@ -179,7 +177,6 @@ class ClozeMultitask(Pipe): pass def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None): - link_vectors_to_models(self.vocab) self.model.initialize() X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) self.model.output_layer.begin_training(X) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index f8ca28724..e4f7989b8 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -3,7 +3,7 @@ import srsly from ..tokens.doc cimport Doc -from ..util import link_vectors_to_models, create_default_optimizer +from ..util import create_default_optimizer from ..errors import Errors from .. 
import util @@ -145,8 +145,6 @@ class Pipe: DOCS: https://spacy.io/api/pipe#begin_training """ self.model.initialize() - if hasattr(self, "vocab"): - link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 743ceb32b..568e6031b 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -138,7 +138,6 @@ class SentenceRecognizer(Tagger): """ self.set_output(len(self.labels)) self.model.initialize() - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index ec7ab6b7a..9b9872b77 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -168,7 +168,6 @@ class SimpleNER(Pipe): self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - util.link_vectors_to_models(self.vocab) self.loss_func = SequenceCategoricalCrossentropy( names=self.get_tag_names(), normalize=True, missing_value=None ) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c52a7889b..b3f996acb 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -318,7 +318,6 @@ class Tagger(Pipe): self.model.initialize(X=doc_sample) # Get batch of example docs, example outputs to call begin_training(). # This lets the model infer shapes. - util.link_vectors_to_models(self.vocab) if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 2aaa4a769..c235a2594 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -356,7 +356,6 @@ class TextCategorizer(Pipe): docs = [Doc(Vocab(), words=["hello"])] truths, _ = self._examples_to_truth(examples) self.set_output(len(self.labels)) - util.link_vectors_to_models(self.vocab) self.model.initialize(X=docs, Y=truths) if sgd is None: sgd = self.create_optimizer() diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5caaf432f..5e9e5b40e 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -7,7 +7,7 @@ from ..tokens import Doc from ..vocab import Vocab from ..language import Language from ..errors import Errors -from ..util import link_vectors_to_models, minibatch +from ..util import minibatch default_model_config = """ @@ -198,7 +198,6 @@ class Tok2Vec(Pipe): """ docs = [Doc(self.vocab, words=["hello"])] self.model.initialize(X=docs) - link_vectors_to_models(self.vocab) class Tok2VecListener(Model): diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 7acee5efd..eedd84bac 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -21,7 +21,7 @@ from .transition_system cimport Transition from ..compat import copy_array from ..errors import Errors, TempErrors -from ..util import link_vectors_to_models, create_default_optimizer +from ..util import create_default_optimizer from .. import util from . 
import nonproj diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 5313ec9bd..a0ee13a0a 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -29,7 +29,7 @@ from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition -from ..util import link_vectors_to_models, create_default_optimizer, registry +from ..util import create_default_optimizer, registry from ..compat import copy_array from ..errors import Errors, Warnings from .. import util @@ -456,7 +456,6 @@ cdef class Parser: self.model.initialize() if pipeline is not None: self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) - link_vectors_to_models(self.vocab) return sgd def to_disk(self, path, exclude=tuple()): diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index ac0867189..cf4e402e2 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -9,7 +9,6 @@ from spacy.matcher import Matcher from spacy.tokens import Doc, Span from spacy.vocab import Vocab from spacy.compat import pickle -from spacy.util import link_vectors_to_models import numpy import random @@ -190,7 +189,6 @@ def test_issue2871(): _ = vocab[word] # noqa: F841 vocab.set_vector(word, vector_data[0]) vocab.vectors.name = "dummy_vectors" - link_vectors_to_models(vocab) assert vocab["dog"].rank == 0 assert vocab["cat"].rank == 1 assert vocab["SUFFIX"].rank == 2 diff --git a/spacy/util.py b/spacy/util.py index 898e1c2c3..677f5e8e0 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1200,29 +1200,6 @@ class DummyTokenizer: return self -def link_vectors_to_models( - vocab: "Vocab", - models: List[Model] = [], - *, - vectors_name_attr="vectors_name", - vectors_attr="vectors", - key2row_attr="key2row", - default_vectors_name="spacy_pretrained_vectors", -) -> None: - """Supply vectors data to models.""" - vectors = vocab.vectors - if vectors.name is None: - vectors.name = default_vectors_name - if vectors.data.size != 0: - warnings.warn(Warnings.W020.format(shape=vectors.data.shape)) - - for model in models: - for node in model.walk(): - if node.attrs.get(vectors_name_attr) == vectors.name: - node.attrs[vectors_attr] = Unserializable(vectors.data) - node.attrs[key2row_attr] = Unserializable(vectors.key2row) - - def create_default_optimizer() -> Optimizer: # TODO: Do we still want to allow env_opt? learn_rate = env_opt("learn_rate", 0.001) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f41ad2356..b7337b92e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -16,7 +16,7 @@ from .errors import Errors from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors -from .util import link_vectors_to_models, registry +from .util import registry from .lookups import Lookups, load_lookups from . 
import util from .lang.norm_exceptions import BASE_NORMS @@ -344,7 +344,6 @@ cdef class Vocab: synonym = self.strings[syn_keys[i][0]] score = scores[i][0] remap[word] = (synonym, score) - link_vectors_to_models(self) return remap def get_vector(self, orth, minn=None, maxn=None): @@ -476,8 +475,6 @@ cdef class Vocab: if "vectors" not in exclude: if self.vectors is not None: self.vectors.from_disk(path, exclude=["strings"]) - if self.vectors.name is not None: - link_vectors_to_models(self) if "lookups" not in exclude: self.lookups.from_disk(path) if "lexeme_norm" in self.lookups: @@ -537,8 +534,6 @@ cdef class Vocab: ) self.length = 0 self._by_orth = PreshMap() - if self.vectors.name is not None: - link_vectors_to_models(self) return self def _reset_cache(self, keys, strings): From c35d6282fcd1d74f209e34b0f90c09cbe2882ded Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:43:06 +0200 Subject: [PATCH 26/55] Add previous HashEmbedCNN tok2vec to make transition easier --- spacy/ml/models/tok2vec.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index f9183e709..881f25a3b 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -20,8 +20,37 @@ def tok2vec_listener_v1(width, upstream="*"): return tok2vec +@registry.architectures.register("spacy.HashEmbedCNN.v1") +def build_hash_embed_cnn_tok2vec( + *, + width: int, + depth: int, + embed_size: int, + window_size: int, + maxout_pieces: int, + subword_features: bool, + dropout: Optional[float], + pretrained_vectors: Optional[bool] +) -> Model[List[Doc], List[Floats2d]]: + """Build spaCy's 'standard' tok2vec layer, which uses hash embedding + with subword features and a CNN with layer-normalized maxout.""" + return build_Tok2Vec_model( + embed=MultiHashEmbed( + width=width, + rows=embed_size, + also_embed_subwords=subword_features, + also_use_static_vectors=bool(pretrained_vectors), + ), + encode=MaxoutWindowEncoder( + width=width, + depth=depth, + window_size=window_size, + maxout_pieces=maxout_pieces + ) + ) + @registry.architectures.register("spacy.Tok2Vec.v1") -def Tok2Vec( +def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], encode: Model[List[Floats2d], List[Floats2d]], ) -> Model[List[Doc], List[Floats2d]]: @@ -62,7 +91,7 @@ def MultiHashEmbed( ] else: embeddings = [make_hash_embed(NORM)] - + concat_size = width * (len(embeddings) + also_use_static_vectors) if also_use_static_vectors: model = chain( concatenate( @@ -73,7 +102,7 @@ def MultiHashEmbed( ), StaticVectors(width, dropout=0.0), ), - with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), + with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), ragged2list(), ) else: @@ -83,7 +112,7 @@ def MultiHashEmbed( list2ragged(), with_array(concatenate(*embeddings)), ), - with_array(Maxout(width, nP=3, dropout=0.0, normalize=True)), + with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), ragged2list(), ) return model From 20e9098e3f527fadff62aa31bb6342bc14763e91 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:43:19 +0200 Subject: [PATCH 27/55] Update tests --- .../tests/serialize/test_serialize_config.py | 24 ++--- spacy/tests/test_models.py | 94 ++++++++++--------- 2 files changed, 61 insertions(+), 57 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 90a79994e..25673b8c4 100644 
--- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -68,18 +68,18 @@ dropout = null @registry.architectures.register("my_test_parser") def my_parser(): tok2vec = build_Tok2Vec_model( - width=321, - embed_size=5432, - pretrained_vectors=None, - window_size=3, - maxout_pieces=4, - subword_features=True, - char_embed=True, - nM=64, - nC=8, - conv_depth=2, - bilstm_depth=0, - dropout=None, + MultiHashEmbed( + width=321, + embed_size=5432, + also_embed_subwords=True, + also_use_static_vectors=False + ), + MaxoutWindowEncoder( + width=321, + window_size=3, + maxout_pieces=4, + depth=2 + ) ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index fc1988fcd..4c38ea6c6 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -5,12 +5,32 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate from numpy.testing import assert_array_equal import numpy -from spacy.ml.models import build_Tok2Vec_model +from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES +def get_textcat_kwargs(): + return { + "width": 64, + "embed_size": 2000, + "pretrained_vectors": None, + "exclusive_classes": False, + "ngram_size": 1, + "window_size": 1, + "conv_depth": 2, + "dropout": None, + "nO": 7, + } + +def get_textcat_cnn_kwargs(): + return { + "tok2vec": test_tok2vec(), + "exclusive_classes": False, + "nO": 13, + } + def get_all_params(model): params = [] for node in model.walk(): @@ -35,50 +55,34 @@ def get_gradient(model, Y): raise ValueError(f"Could not get gradient for type {type(Y)}") +def get_tok2vec_kwargs(): + # This actually creates models, so seems best to put it in a function. 
+ return { + "embed": MultiHashEmbed( + width=32, + rows=500, + also_embed_subwords=True, + also_use_static_vectors=False + ), + "encode": MaxoutWindowEncoder( + width=32, + depth=2, + maxout_pieces=2, + window_size=1, + ) + } + + def test_tok2vec(): - return build_Tok2Vec_model(**TOK2VEC_KWARGS) - - -TOK2VEC_KWARGS = { - "width": 96, - "embed_size": 2000, - "subword_features": True, - "char_embed": False, - "conv_depth": 4, - "bilstm_depth": 0, - "maxout_pieces": 4, - "window_size": 1, - "dropout": 0.1, - "nM": 0, - "nC": 0, - "pretrained_vectors": None, -} - -TEXTCAT_KWARGS = { - "width": 64, - "embed_size": 2000, - "pretrained_vectors": None, - "exclusive_classes": False, - "ngram_size": 1, - "window_size": 1, - "conv_depth": 2, - "dropout": None, - "nO": 7, -} - -TEXTCAT_CNN_KWARGS = { - "tok2vec": test_tok2vec(), - "exclusive_classes": False, - "nO": 13, -} + return build_Tok2Vec_model(**get_tok2vec_kwargs()) @pytest.mark.parametrize( "seed,model_func,kwargs", [ - (0, build_Tok2Vec_model, TOK2VEC_KWARGS), - (0, build_text_classifier, TEXTCAT_KWARGS), - (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS), + (0, build_Tok2Vec_model, get_tok2vec_kwargs()), + (0, build_text_classifier, get_textcat_kwargs()), + (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), ], ) def test_models_initialize_consistently(seed, model_func, kwargs): @@ -96,9 +100,9 @@ def test_models_initialize_consistently(seed, model_func, kwargs): @pytest.mark.parametrize( "seed,model_func,kwargs,get_X", [ - (0, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), - (0, build_text_classifier, TEXTCAT_KWARGS, get_docs), - (0, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), + (0, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) def test_models_predict_consistently(seed, model_func, kwargs, get_X): @@ -131,9 +135,9 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): @pytest.mark.parametrize( "seed,dropout,model_func,kwargs,get_X", [ - (0, 0.2, build_Tok2Vec_model, TOK2VEC_KWARGS, get_docs), - (0, 0.2, build_text_classifier, TEXTCAT_KWARGS, get_docs), - (0, 0.2, build_simple_cnn_text_classifier, TEXTCAT_CNN_KWARGS, get_docs), + (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), + (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), + (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), ], ) def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): From 6a6b09bd32f6e687246f1162a645040e90019570 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 22:59:42 +0200 Subject: [PATCH 28/55] Update morphologizer model --- spacy/pipeline/morphologizer.pyx | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 56ef44cb9..e76b7fb77 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -22,17 +22,23 @@ default_model_config = """ @architectures = "spacy.Tagger.v1" [model.tok2vec] -@architectures = "spacy.HashCharEmbedCNN.v1" -pretrained_vectors = null +@architectures = "spacy.Tok2Vec.v1" + +[model.tok2vec.embed] +@architectures = "spacy.CharacterEmbed.v1" width = 128 -depth = 4 -embed_size = 7000 -window_size = 1 -maxout_pieces = 3 +rows = 7000 nM = 64 nC = 8 -dropout = null + +[model.tok2vec.encode] 
+@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 128 +depth = 4 +window_size = 1 +maxout_pieces = 3 """ + DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"] From 00de30bcc28379ffb28be4d0b0c28ce9391eabb8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 23:06:30 +0200 Subject: [PATCH 29/55] Update CharacterEmbed function --- spacy/ml/models/tok2vec.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 881f25a3b..acd9dc0b0 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -119,15 +119,16 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): - norm = HashEmbed( - nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout, seed=5 +def CharacterEmbed(width: int, rows: int, nM: int, nC: int): + model = concatenate( + _character_embed.CharacterEmbed(nM=nM, nC=nC), + chain( + FeatureExtractor([NORM]), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) + ) ) - chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) - with Model.define_operators({">>": chain, "|": concatenate}): - embed_layer = chr_embed | features >> with_array(norm) - embed_layer.set_dim("nO", nM * nC + width) - return embed_layer + model.set_dim("nO", nM * nC + width) + return model @registry.architectures.register("spacy.MaxoutWindowEncoder.v1") From c7d1ece3ebf5e4fb45d14faf106a0aba7b179ee2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 28 Jul 2020 23:06:46 +0200 Subject: [PATCH 30/55] Update tests --- .../tests/serialize/test_serialize_config.py | 1 + spacy/tests/test_tok2vec.py | 52 ++++++++++--------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 25673b8c4..ef5c7f8f4 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -5,6 +5,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.util import registry, deep_merge_configs, load_model_from_config from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model +from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder from ..util import make_tempdir diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 32f4c5774..6b7170fe3 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,6 +1,7 @@ import pytest from spacy.ml.models.tok2vec import build_Tok2Vec_model +from spacy.ml.models.tok2vec import MultiHashEmbed, MaxoutWindowEncoder from spacy.vocab import Vocab from spacy.tokens import Doc @@ -13,18 +14,18 @@ def test_empty_doc(): vocab = Vocab() doc = Doc(vocab, words=[]) tok2vec = build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors=None, - conv_depth=4, - bilstm_depth=0, - window_size=1, - maxout_pieces=3, - subword_features=True, - char_embed=False, - nM=64, - nC=8, - dropout=None, + MultiHashEmbed( + width=width, + rows=embed_size, + also_use_static_vectors=False, + also_embed_subwords=True + ), + MaxoutWindowEncoder( + width=width, + depth=4, + window_size=1, + maxout_pieces=3 + ) ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update([doc]) @@ -38,18 +39,18 @@ def test_empty_doc(): def test_tok2vec_batch_sizes(batch_size, width, embed_size): batch = get_batch(batch_size) tok2vec = 
build_Tok2Vec_model( - width, - embed_size, - pretrained_vectors=None, - conv_depth=4, - bilstm_depth=0, - window_size=1, - maxout_pieces=3, - subword_features=True, - char_embed=False, - nM=64, - nC=8, - dropout=None, + MultiHashEmbed( + width=width, + rows=embed_size, + also_use_static_vectors=False, + also_embed_subwords=True + ), + MaxoutWindowEncoder( + width=width, + depth=4, + window_size=1, + maxout_pieces=3, + ) ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) @@ -59,6 +60,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): # fmt: off +@pytest.mark.xfail(reason="TODO: Update for new signature") @pytest.mark.parametrize( "tok2vec_config", [ @@ -75,7 +77,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): # fmt: on def test_tok2vec_configs(tok2vec_config): docs = get_batch(3) - tok2vec = build_Tok2Vec_model(**tok2vec_config) + tok2vec = build_Tok2Vec_model_from_old_args(**tok2vec_config) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) From 97d36515747640b11e8447a6177ab867353b0915 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 13:38:13 +0200 Subject: [PATCH 31/55] Fix stray link_vectors_to_models call --- spacy/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 4b7651d65..0ec29f3b1 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1615,8 +1615,6 @@ def _fix_pretrained_vectors_name(nlp: Language) -> None: nlp.vocab.vectors.name = vectors_name else: raise ValueError(Errors.E092) - if nlp.vocab.vectors.size != 0: - link_vectors_to_models(nlp.vocab) for name, proc in nlp.pipeline: if not hasattr(proc, "cfg"): continue From 5ae862857108db67d331ed68e703b034436b9e08 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 13:38:30 +0200 Subject: [PATCH 32/55] Fix CharacterEmbed layer --- spacy/ml/_character_embed.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py index 57fbf73b3..ab0cb85c7 100644 --- a/spacy/ml/_character_embed.py +++ b/spacy/ml/_character_embed.py @@ -1,16 +1,18 @@ +from typing import List from thinc.api import Model +from thinc.types import Floats2d +from ..tokens import Doc -def CharacterEmbed(nM, nC): +def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]: # nM: Number of dimensions per character. nC: Number of characters. 
- nO = nM * nC if (nM is not None and nC is not None) else None return Model( "charembed", forward, init=init, - dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256}, + dims={"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256}, params={"E": None}, - ).initialize() + ) def init(model, X=None, Y=None): From 07b47eaac8bc169fdf88677e0660b34ea5f24d7a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 13:38:41 +0200 Subject: [PATCH 33/55] Update tok2vec layer --- spacy/ml/models/tok2vec.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index acd9dc0b0..d81c9f918 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -107,11 +107,9 @@ def MultiHashEmbed( ) else: model = chain( - chain( - FeatureExtractor(cols), - list2ragged(), - with_array(concatenate(*embeddings)), - ), + FeatureExtractor(cols), + list2ragged(), + with_array(concatenate(*embeddings)), with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), ragged2list(), ) @@ -120,14 +118,18 @@ def MultiHashEmbed( @registry.architectures.register("spacy.CharacterEmbed.v1") def CharacterEmbed(width: int, rows: int, nM: int, nC: int): - model = concatenate( - _character_embed.CharacterEmbed(nM=nM, nC=nC), - chain( - FeatureExtractor([NORM]), - with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) - ) + model = chain( + concatenate( + chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), + chain( + FeatureExtractor([NORM]), + list2ragged(), + with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) + ) + ), + with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), + ragged2list() ) - model.set_dim("nO", nM * nC + width) return model @@ -153,8 +155,12 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth: def MishWindowEncoder(width, window_size, depth): cnn = chain( expand_window(window_size=window_size), - Mish(nO=width, nI=width * ((window_size * 2) + 1)), - LayerNorm(width), + Mish( + nO=width, + nI=width * ((window_size * 2) + 1), + dropout=0.0, + normalize=True + ), ) model = clone(residual(cnn), depth) model.set_dim("nO", width) From f0cf4a2dca7cd2685d0842dbe5111d541288d661 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 13:47:37 +0200 Subject: [PATCH 34/55] Update tests --- .../tests/serialize/test_serialize_config.py | 4 +-- spacy/tests/test_tok2vec.py | 29 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ef5c7f8f4..ce35add42 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -41,7 +41,7 @@ factory = "tagger" @architectures = "spacy.Tagger.v1" [components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" +@architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model:width} """ @@ -71,7 +71,7 @@ def my_parser(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=321, - embed_size=5432, + rows=5432, also_embed_subwords=True, also_use_static_vectors=False ), diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 6b7170fe3..76b5e64df 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,7 +1,8 @@ import pytest from spacy.ml.models.tok2vec import build_Tok2Vec_model -from spacy.ml.models.tok2vec import 
MultiHashEmbed, MaxoutWindowEncoder +from spacy.ml.models.tok2vec import MultiHashEmbed, CharacterEmbed +from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder from spacy.vocab import Vocab from spacy.tokens import Doc @@ -60,26 +61,26 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): # fmt: off -@pytest.mark.xfail(reason="TODO: Update for new signature") @pytest.mark.parametrize( - "tok2vec_config", + "width,embed_arch,embed_config,encode_arch,encode_config", [ - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on -def test_tok2vec_configs(tok2vec_config): +def test_tok2vec_configs(width, embed_arch, embed_config, encode_arch, encode_config): + embed_config["width"] = width + encode_config["width"] = width docs = get_batch(3) - tok2vec = build_Tok2Vec_model_from_old_args(**tok2vec_config) + tok2vec = build_Tok2Vec_model( + embed_arch(**embed_config), + encode_arch(**encode_config) + ) tok2vec.initialize(docs) vectors, backprop = tok2vec.begin_update(docs) assert len(vectors) == len(docs) - assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"]) + assert vectors[0].shape == (len(docs[0]), width) backprop(vectors) From 4bbbb41bf8d70a3acfd45d9da2172c4401fc5452 Mon Sep 17 00:00:00 2001 From: Matthew 
Honnibal Date: Wed, 29 Jul 2020 13:48:34 +0200 Subject: [PATCH 35/55] Update config --- examples/experiments/ptb-joint-pos-dep/defaults.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg index 5850eaf3a..eed76cb7b 100644 --- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg +++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg @@ -4,7 +4,7 @@ patience = 10000 eval_frequency = 200 dropout = 0.2 init_tok2vec = null -vectors = "tmp/fasttext_vectors/vocab" +vectors = null max_epochs = 100 orth_variant_level = 0.0 gold_preproc = true @@ -85,7 +85,7 @@ width = ${components.tok2vec.model.encode:width} width = ${components.tok2vec.model.encode:width} rows = 2000 also_embed_subwords = true -also_use_static_vectors = true +also_use_static_vectors = false [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" From 80b18124d20de79c5ef9c794ae496394c401edcb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 14:03:35 +0200 Subject: [PATCH 36/55] Fix docstring [ci skip] --- spacy/pipeline/pipe.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index f8ca28724..05f900e08 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -32,7 +32,9 @@ class Pipe: raise NotImplementedError def __call__(self, Doc doc): - """Add context-sensitive embeddings to the Doc.tensor attribute. + """Apply the pipe to one document. The document is modified in place, + and returned. This usually happens under the hood when the nlp object + is called on a text and all components are applied to the Doc. docs (Doc): The Doc to preocess. RETURNS (Doc): The processed Doc. From 8d56260d922bb566d010bb09d9cd524e4c82a58d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 14:07:13 +0200 Subject: [PATCH 37/55] Fix docstrings [ci skip] --- spacy/pipeline/pipe.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 05f900e08..3500d6cf5 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -76,7 +76,7 @@ class Pipe: """Modify a batch of documents, using pre-computed scores. docs (Iterable[Doc]): The documents to modify. - tokvecses: The tensors to set, produced by Pipe.predict. + scores: The scores to assign. DOCS: https://spacy.io/api/pipe#predict """ From 6e2623d3f8b9379ac44ec9079ddb724ecb80f47d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 14:08:05 +0200 Subject: [PATCH 38/55] Fix docstring [ci skip] --- spacy/pipeline/pipe.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 3500d6cf5..ab80aa32e 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -78,7 +78,7 @@ class Pipe: docs (Iterable[Doc]): The documents to modify. scores: The scores to assign. 
- DOCS: https://spacy.io/api/pipe#predict + DOCS: https://spacy.io/api/pipe#set_annotations """ raise NotImplementedError From ff0bc05da82b2b5b033c3dcc27fe499305517dd4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 14:09:37 +0200 Subject: [PATCH 39/55] Fix docstrings [ci skip] --- spacy/pipeline/entity_linker.py | 2 +- spacy/pipeline/morphologizer.pyx | 2 +- spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/tagger.pyx | 2 +- spacy/pipeline/textcat.py | 2 +- spacy/pipeline/tok2vec.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 4165dab83..45713108a 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -387,7 +387,7 @@ class EntityLinker(Pipe): docs (Iterable[Doc]): The documents to modify. kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict. - DOCS: https://spacy.io/api/entitylinker#predict + DOCS: https://spacy.io/api/entitylinker#set_annotations """ count_ents = len([ent for doc in docs for ent in doc.ents]) if count_ents != len(kb_ids): diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a6be129ba..2f2601f3f 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -160,7 +160,7 @@ class Morphologizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Morphologizer.predict. - DOCS: https://spacy.io/api/morphologizer#predict + DOCS: https://spacy.io/api/morphologizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 743ceb32b..58f6f04b6 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -76,7 +76,7 @@ class SentenceRecognizer(Tagger): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by SentenceRecognizer.predict. - DOCS: https://spacy.io/api/sentencerecognizer#predict + DOCS: https://spacy.io/api/sentencerecognizer#set_annotations """ if isinstance(docs, Doc): docs = [docs] diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c52a7889b..c96f9c029 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -145,7 +145,7 @@ class Tagger(Pipe): docs (Iterable[Doc]): The documents to modify. batch_tag_ids: The IDs to set, produced by Tagger.predict. - DOCS: https://spacy.io/api/tagger#predict + DOCS: https://spacy.io/api/tagger#set_annotations """ if isinstance(docs, Doc): docs = [docs] diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 2aaa4a769..856f3632a 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -163,7 +163,7 @@ class TextCategorizer(Pipe): docs (Iterable[Doc]): The documents to modify. scores: The scores to set, produced by TextCategorizer.predict. - DOCS: https://spacy.io/api/textcategorizer#predict + DOCS: https://spacy.io/api/textcategorizer#set_annotations """ for i, doc in enumerate(docs): for j, label in enumerate(self.labels): diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 5bda12d1b..ea6e90169 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -109,7 +109,7 @@ class Tok2Vec(Pipe): docs (Iterable[Doc]): The documents to modify. tokvecses: The tensors to set, produced by Tok2Vec.predict. 
- DOCS: https://spacy.io/api/tok2vec#predict + DOCS: https://spacy.io/api/tok2vec#set_annotations """ for doc, tokvecs in zip(docs, tokvecses): assert tokvecs.shape[0] == len(doc) From 105cf2996785961a2c6bf717c3d80736daaa1c60 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:23:13 +0200 Subject: [PATCH 40/55] Fix DocBin --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 0a5fd0c59..bc371199a 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -50,7 +50,7 @@ class DocBin: self, attrs: Iterable[str] = ALL_ATTRS, store_user_data: bool = False, - docs=Iterable[Doc], + docs: Iterable[Doc]=[], ) -> None: """Create a DocBin object to hold serialized annotations. From b5bbfec591b9cb659bf51add783e0935fbee452b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:26:44 +0200 Subject: [PATCH 41/55] Update config --- examples/experiments/onto-joint/defaults.cfg | 83 +++++++++++--------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index 95c2f28bd..d37929ff1 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -20,20 +20,20 @@ seed = 0 accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. -scores = ["speed", "tags_acc", "uas", "las", "ents_f"] +eval_batch_size = 128 score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} -# These settings are invalid for the transformer models. init_tok2vec = null discard_oversize = false -omit_extra_lookups = false batch_by = "words" -use_gpu = -1 raw_text = null tag_map = null +vectors = null +base_model = null +morph_rules = null [training.batch_size] @schedules = "compounding.v1" -start = 1000 +start = 100 stop = 1000 compound = 1.001 @@ -46,74 +46,79 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 1e-8 -#learn_rate = 0.001 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.001 +learn_rate = 0.001 [nlp] lang = "en" -base_model = null -vectors = null +load_vocab_data = false +pipeline = ["tok2vec", "ner", "tagger", "parser"] -[nlp.pipeline] +[nlp.tokenizer] +@tokenizers = "spacy.Tokenizer.v1" -[nlp.pipeline.tok2vec] +[nlp.lemmatizer] +@lemmatizers = "spacy.Lemmatizer.v1" + +[components] + +[components.tok2vec] factory = "tok2vec" - -[nlp.pipeline.ner] +[components.ner] factory = "ner" learn_tokens = false min_action_freq = 1 -[nlp.pipeline.tagger] +[components.tagger] factory = "tagger" -[nlp.pipeline.parser] +[components.parser] factory = "parser" learn_tokens = false min_action_freq = 30 -[nlp.pipeline.tagger.model] +[components.tagger.model] @architectures = "spacy.Tagger.v1" -[nlp.pipeline.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} -[nlp.pipeline.parser.model] +[components.parser.model] @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 8 hidden_width = 128 maxout_pieces = 2 use_upper = true -[nlp.pipeline.parser.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.parser.model.tok2vec] 
+@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} -[nlp.pipeline.ner.model] +[components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" nr_feature_tokens = 3 hidden_width = 128 maxout_pieces = 2 use_upper = true -[nlp.pipeline.ner.model.tok2vec] -@architectures = "spacy.Tok2VecTensors.v1" -width = ${nlp.pipeline.tok2vec.model:width} +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} -[nlp.pipeline.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = ${nlp:vectors} -width = 128 +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +rows = 2000 +also_embed_subwords = true +also_use_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 depth = 4 window_size = 1 -embed_size = 7000 maxout_pieces = 3 -subword_features = true -dropout = ${training:dropout} From 9e1b11dd8158b938798ae5ba7384623b8ed535a1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:35:36 +0200 Subject: [PATCH 42/55] Update vectors in textcat --- spacy/ml/models/textcat.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index a64a2487a..139917581 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -9,6 +9,7 @@ from ... import util from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...util import registry from ..extract_ngrams import extract_ngrams +from ..staticvectors import StaticVectors @registry.architectures.register("spacy.TextCatCNN.v1") @@ -101,13 +102,7 @@ def build_text_classifier( ) if pretrained_vectors: - nlp = util.load_model(pretrained_vectors) - vectors = nlp.vocab.vectors - vector_dim = vectors.data.shape[1] - - static_vectors = SpacyVectors(vectors) >> with_array( - Linear(width, vector_dim) - ) + static_vectors = StaticVectors(width) vector_layer = trained_vectors | static_vectors vectors_width = width * 2 else: @@ -158,14 +153,10 @@ def build_text_classifier( @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): - nlp = util.load_model(pretrained_vectors) - vectors = nlp.vocab.vectors - vector_dim = vectors.data.shape[1] - # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" with Model.define_operators({">>": chain, "**": clone}): model = ( - SpacyVectors(vectors) + StaticVectors(width) >> list2ragged() >> with_ragged(0, Linear(width, vector_dim)) >> ParametricAttention(width) From c99a65307037440406a3b35d04440bb671e931d8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:38:15 +0200 Subject: [PATCH 43/55] Adjust textcat model --- spacy/ml/models/textcat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 139917581..53200c165 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -158,7 +158,6 @@ def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): model = ( StaticVectors(width) >> list2ragged() - >> with_ragged(0, Linear(width, vector_dim)) >> ParametricAttention(width) >> reduce_sum() >> residual(Relu(width, width)) ** 2 From 
142b58be92dbc1ee63d3424f7afaf4fe44cab417 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:45:09 +0200 Subject: [PATCH 44/55] Fix import --- spacy/ml/models/tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index d81c9f918..1460b3005 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -2,7 +2,7 @@ from typing import Optional, List from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list from thinc.api import FeatureExtractor, HashEmbed -from thinc.api import expand_window, residual, Maxout, Mish +from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM from thinc.types import Floats2d from ...tokens import Doc From 62266fb8286e0918366b9488b4aa4a38b9d49437 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 14:49:49 +0200 Subject: [PATCH 45/55] Fix broken type annotation --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 0a5fd0c59..192067ed4 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -50,7 +50,7 @@ class DocBin: self, attrs: Iterable[str] = ALL_ATTRS, store_user_data: bool = False, - docs=Iterable[Doc], + docs: Iterable[Doc] = tuple(), ) -> None: """Create a DocBin object to hold serialized annotations. From 2af741d7e3b96be4e24319c2e8284fa168c6ab99 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:56:01 +0200 Subject: [PATCH 46/55] Fix train arg --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e152ae8ea..b0bc145ff 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -246,7 +246,7 @@ def create_evaluation_callback( ) -> Callable[[], Tuple[float, Dict[str, float]]]: def evaluate() -> Tuple[float, Dict[str, float]]: dev_examples = corpus.dev_dataset( - nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True + nlp, gold_preproc=cfg["gold_preproc"] ) dev_examples = list(dev_examples) n_words = sum(len(ex.predicted) for ex in dev_examples) From ebdb3f5f04e32d05ed59046b4655a4bc6869bc79 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 14:56:11 +0200 Subject: [PATCH 47/55] Fix config --- examples/experiments/onto-joint/defaults.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg index d37929ff1..0e0d4d4c3 100644 --- a/examples/experiments/onto-joint/defaults.cfg +++ b/examples/experiments/onto-joint/defaults.cfg @@ -21,7 +21,7 @@ accumulate_gradient = 1 use_pytorch_for_gpu_memory = false # Control how scores are printed and checkpoints are evaluated. 
eval_batch_size = 128 -score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2} +score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2} init_tok2vec = null discard_oversize = false batch_by = "words" From b0f57a0cac93d2fe3862cb7e33bdf75fbed1d121 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 15:14:07 +0200 Subject: [PATCH 48/55] Update docs and consistency --- spacy/language.py | 13 +- spacy/pipeline/entity_linker.py | 6 +- spacy/pipeline/entityruler.py | 10 +- spacy/pipeline/morphologizer.pyx | 8 +- spacy/pipeline/pipe.pyx | 8 +- spacy/pipeline/sentencizer.pyx | 8 +- spacy/pipeline/senter.pyx | 8 +- spacy/pipeline/tagger.pyx | 8 +- spacy/tokenizer.pyx | 8 +- spacy/tokens/doc.pyx | 28 +-- spacy/vocab.pyx | 8 +- website/docs/api/dependencyparser.md | 40 +-- website/docs/api/doc.md | 40 +-- website/docs/api/entitylinker.md | 20 +- website/docs/api/entityrecognizer.md | 40 +-- website/docs/api/language.md | 40 +-- website/docs/api/morphologizer.md | 40 +-- website/docs/api/pipe.md | 40 +-- website/docs/api/sentencerecognizer.md | 40 +-- website/docs/api/tagger.md | 40 +-- website/docs/api/textcategorizer.md | 40 +-- website/docs/api/tok2vec.md | 40 +-- website/docs/api/tokenizer.md | 40 +-- website/docs/api/transformer.md | 331 +++++++++++++++++++++++-- website/docs/api/vocab.md | 40 +-- 25 files changed, 646 insertions(+), 298 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9dd8a347e..ef185a7eb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -49,6 +49,7 @@ class BaseDefaults: overwritten by language subclasses by defining their own subclasses of Language.Defaults. """ + config: Config = Config() tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS prefixes: Optional[List[Union[str, Pattern]]] = TOKENIZER_PREFIXES @@ -67,6 +68,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: """Registered function to create a tokenizer. Returns a factory that takes the nlp object and returns a Tokenizer instance using the language detaults. """ + def tokenizer_factory(nlp: "Language") -> Tokenizer: prefixes = nlp.Defaults.prefixes suffixes = nlp.Defaults.suffixes @@ -1432,7 +1434,9 @@ class Language: nlp.resolved = resolved return nlp - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the current state to a directory. If a model is loaded, this will include the model. @@ -1461,7 +1465,7 @@ class Language: util.to_disk(path, serializers, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "Language": """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the @@ -1512,7 +1516,7 @@ class Language: self._link_components() return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the current state to a binary string. exclude (list): Names of components or serialization fields to exclude. @@ -1534,7 +1538,7 @@ class Language: return util.to_bytes(serializers, exclude) def from_bytes( - self, bytes_data: bytes, exclude: Iterable[str] = tuple() + self, bytes_data: bytes, *, exclude: Iterable[str] = tuple() ) -> "Language": """Load state from a binary string. 
@@ -1583,6 +1587,7 @@ class FactoryMeta: created whenever a component is defined and stored on the Language class for each component instance and factory instance. """ + factory: str default_config: Optional[Dict[str, Any]] = None # noqa: E704 assigns: Iterable[str] = tuple() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 45713108a..cc4e7b159 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -400,7 +400,9 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = kb_id - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -417,7 +419,7 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityLinker": """Load the pipe from disk. Modifies the object in place and returns it. diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index d6ce86e78..8f280547e 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -315,7 +315,7 @@ class EntityRuler: return Scorer.score_spans(examples, "ents", **kwargs) def from_bytes( - self, patterns_bytes: bytes, exclude: Iterable[str] = tuple() + self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a bytestring. @@ -339,7 +339,7 @@ class EntityRuler: self.add_patterns(cfg) return self - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: """Serialize the entity ruler patterns to a bytestring. RETURNS (bytes): The serialized patterns. @@ -355,7 +355,7 @@ class EntityRuler: return srsly.msgpack_dumps(serial) def from_disk( - self, path: Union[str, Path], exclude: Iterable[str] = tuple() + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() ) -> "EntityRuler": """Load the entity ruler from a file. Expects a file containing newline-delimited JSON (JSONL) with one entry per line. @@ -391,7 +391,9 @@ class EntityRuler: from_disk(path, deserializers_patterns, {}) return self - def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = tuple()) -> None: + def to_disk( + self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() + ) -> None: """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 2f2601f3f..4cf1580d3 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -230,7 +230,7 @@ class Morphologizer(Tagger): "morph", **kwargs)) return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -244,7 +244,7 @@ class Morphologizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. 
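The recurring change in this patch — inserting `*` so that `exclude` becomes keyword-only on `to_disk`, `from_disk`, `to_bytes` and `from_bytes` — affects any caller that passed `exclude` positionally. A minimal sketch of the call pattern the new signatures expect, assuming a pipeline created with `spacy.blank` and a placeholder output path:

import spacy

nlp = spacy.blank("en")
# After this patch `exclude` must be passed by keyword; a positional call
# such as nlp.to_disk("/tmp/pipeline", ["vocab"]) raises a TypeError.
nlp.to_disk("/tmp/pipeline", exclude=["vocab"])
nlp = spacy.blank("en").from_disk("/tmp/pipeline", exclude=["vocab"])

# The same applies to the byte-based serializers.
data = nlp.to_bytes(exclude=["vocab"])
nlp = spacy.blank("en").from_bytes(data, exclude=["vocab"])
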
@@ -267,7 +267,7 @@ class Morphologizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -282,7 +282,7 @@ class Morphologizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index ab80aa32e..c378b42f7 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -180,7 +180,7 @@ class Pipe: """ return {} - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -195,7 +195,7 @@ class Pipe: serialize["vocab"] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -218,7 +218,7 @@ class Pipe: util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -232,7 +232,7 @@ class Pipe: serialize["model"] = lambda p: self.model.to_disk(p) util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 8203249d7..31208ea2c 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -162,7 +162,7 @@ class Sentencizer(Pipe): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the sentencizer to a bytestring. RETURNS (bytes): The serialized object. @@ -171,7 +171,7 @@ class Sentencizer(Pipe): """ return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the sentencizer from a bytestring. bytes_data (bytes): The data to load. @@ -183,7 +183,7 @@ class Sentencizer(Pipe): self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the sentencizer to disk. DOCS: https://spacy.io/api/sentencizer#to_disk @@ -193,7 +193,7 @@ class Sentencizer(Pipe): srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the sentencizer from disk. 
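As the `to_bytes` hunk above shows, the `Sentencizer` only persists its punctuation characters, so a byte round-trip is tiny. A sketch, assuming the `"sentencizer"` factory name:

```python
# Sketch: Sentencizer serialization is just the punct_chars setting.
import spacy

nlp = spacy.blank("en")
sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": [".", "!", "?"]})
data = sentencizer.to_bytes()
sentencizer2 = spacy.blank("en").add_pipe("sentencizer")
sentencizer2.from_bytes(data)
assert sentencizer2.punct_chars == sentencizer.punct_chars
```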
DOCS: https://spacy.io/api/sentencizer#from_disk diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 58f6f04b6..e09805e33 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -157,7 +157,7 @@ class SentenceRecognizer(Tagger): del results["sents_per_type"] return results - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -171,7 +171,7 @@ class SentenceRecognizer(Tagger): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -194,7 +194,7 @@ class SentenceRecognizer(Tagger): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -209,7 +209,7 @@ class SentenceRecognizer(Tagger): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c96f9c029..28c46d1cd 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -370,7 +370,7 @@ class Tagger(Pipe): scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) return scores - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. exclude (Iterable[str]): String names of serialization fields to exclude. @@ -388,7 +388,7 @@ class Tagger(Pipe): serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load the pipe from a bytestring. bytes_data (bytes): The serialized pipe. @@ -424,7 +424,7 @@ class Tagger(Pipe): util.from_bytes(bytes_data, deserialize, exclude) return self - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Serialize the pipe to disk. path (str / Path): Path to a directory. @@ -443,7 +443,7 @@ class Tagger(Pipe): } util.to_disk(path, serialize, exclude) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Load the pipe from disk. Modifies the object in place and returns it. path (str / Path): Path to a directory. diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 858a93ce5..bffbf5829 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -728,7 +728,7 @@ cdef class Tokenizer: with path.open("wb") as file_: file_.write(self.to_bytes(**kwargs)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. 
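`Tokenizer.from_disk` now takes an explicit keyword-only `exclude` instead of `**kwargs`, matching the other serialization methods. A quick sketch of the byte-level round-trip:

```python
# Sketch: tokenizer serialization with the keyword-only exclude argument.
import spacy

nlp = spacy.blank("en")
tok_bytes = nlp.tokenizer.to_bytes(exclude=["vocab"])
nlp.tokenizer.from_bytes(tok_bytes, exclude=["vocab"])
```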
@@ -741,10 +741,10 @@ cdef class Tokenizer: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **kwargs) + self.from_bytes(bytes_data, exclude=exclude) return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -763,7 +763,7 @@ cdef class Tokenizer: } return util.to_bytes(serializers, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0ba5abb52..2fcc0983b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -987,20 +987,20 @@ cdef class Doc: other.c = &tokens[PADDING] return other - def to_disk(self, path, **kwargs): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (str / Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or Path-like objects. - exclude (list): String names of serialization fields to exclude. + exclude (Iterable[str]): String names of serialization fields to exclude. DOCS: https://spacy.io/api/doc#to_disk """ path = util.ensure_path(path) with path.open("wb") as file_: - file_.write(self.to_bytes(**kwargs)) + file_.write(self.to_bytes(exclude=exclude)) - def from_disk(self, path, **kwargs): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -1014,9 +1014,9 @@ cdef class Doc: path = util.ensure_path(path) with path.open("rb") as file_: bytes_data = file_.read() - return self.from_bytes(bytes_data, **kwargs) + return self.from_bytes(bytes_data, exclude=exclude) - def to_bytes(self, exclude=tuple(), **kwargs): + def to_bytes(self, *, exclude=tuple()): """Serialize, i.e. export the document contents to a binary string. exclude (list): String names of serialization fields to exclude. @@ -1025,9 +1025,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs)) + return srsly.msgpack_dumps(self.to_dict(exclude=exclude)) - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. @@ -1036,13 +1036,9 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#from_bytes """ - return self.from_dict( - srsly.msgpack_loads(bytes_data), - exclude=exclude, - **kwargs - ) + return self.from_dict(srsly.msgpack_loads(bytes_data), exclude=exclude) - def to_dict(self, exclude=tuple(), **kwargs): + def to_dict(self, *, exclude=tuple()): """Export the document contents to a dictionary for serialization. exclude (list): String names of serialization fields to exclude. @@ -1090,14 +1086,14 @@ cdef class Doc: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_dict(serializers, exclude) - def from_dict(self, msg, exclude=tuple(), **kwargs): + def from_dict(self, msg, *, exclude=tuple()): """Deserialize, i.e. import the document contents from a binary string. data (bytes): The string to load from. exclude (list): String names of serialization fields to exclude. RETURNS (Doc): Itself. 
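The `Doc` methods above go one step further and drop `**kwargs` entirely, so `exclude` is the only serialization option and must be passed by keyword. A sketch of the resulting round-trip:

```python
# Sketch: Doc round-trip with the keyword-only exclude argument.
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("This is a sentence.")
doc_bytes = doc.to_bytes(exclude=["user_data"])
doc2 = Doc(nlp.vocab).from_bytes(doc_bytes, exclude=["user_data"])
assert doc2.text == doc.text
```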
- DOCS: https://spacy.io/api/doc#from_bytes + DOCS: https://spacy.io/api/doc#from_dict """ if self.length != 0: raise ValueError(Errors.E033.format(length=self.length)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f41ad2356..7713ec528 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -439,7 +439,7 @@ cdef class Vocab: orth = self.strings.add(orth) return orth in self.vectors - def to_disk(self, path, exclude=tuple()): + def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. path (unicode or Path): A path to a directory, which will be created if @@ -459,7 +459,7 @@ cdef class Vocab: if "lookups" not in "exclude" and self.lookups is not None: self.lookups.to_disk(path) - def from_disk(self, path, exclude=tuple()): + def from_disk(self, path, *, exclude=tuple()): """Loads state from a directory. Modifies the object in place and returns it. @@ -488,7 +488,7 @@ cdef class Vocab: self._by_orth = PreshMap() return self - def to_bytes(self, exclude=tuple()): + def to_bytes(self, *, exclude=tuple()): """Serialize the current state to a binary string. exclude (list): String names of serialization fields to exclude. @@ -509,7 +509,7 @@ cdef class Vocab: } return util.to_bytes(getters, exclude) - def from_bytes(self, bytes_data, exclude=tuple()): + def from_bytes(self, bytes_data, *, exclude=tuple()): """Load state from a binary string. bytes_data (bytes): The data to load from. diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md index f6ed7492d..a18e9e582 100644 --- a/website/docs/api/dependencyparser.md +++ b/website/docs/api/dependencyparser.md @@ -290,10 +290,11 @@ Serialize the pipe to disk. > parser.to_disk("/path/to/parser") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## DependencyParser.from_disk {#from_disk tag="method"} @@ -306,11 +307,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > parser.from_disk("/path/to/parser") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | ## DependencyParser.to_bytes {#to_bytes tag="method"} @@ -323,10 +325,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | ## DependencyParser.from_bytes {#from_bytes tag="method"} @@ -340,11 +343,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > parser.from_bytes(parser_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | ## DependencyParser.labels {#labels tag="property"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 69608c958..a9499f6d4 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -385,10 +385,11 @@ Save the current state to a directory. > doc.to_disk("/path/to/doc") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Doc.from_disk {#from_disk tag="method" new="2"} @@ -402,11 +403,12 @@ Loads state from a directory. Modifies the object in place and returns it. 
> doc = Doc(Vocab()).from_disk("/path/to/doc") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The modified `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The modified `Doc` object. | ## Doc.to_bytes {#to_bytes tag="method"} @@ -419,10 +421,11 @@ Serialize, i.e. export the document contents to a binary string. > doc_bytes = doc.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | ## Doc.from_bytes {#from_bytes tag="method"} @@ -438,11 +441,12 @@ Deserialize, i.e. import the document contents from a binary string. > assert doc.text == doc2.text > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `data` | bytes | The string to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Doc` | The `Doc` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `data` | bytes | The string to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Doc` | The `Doc` object. | ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index c29f0326c..2a1ba94d2 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -265,10 +265,11 @@ Serialize the pipe to disk. > entity_linker.to_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityLinker.from_disk {#from_disk tag="method"} @@ -281,11 +282,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > entity_linker.from_disk("/path/to/entity_linker") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index b1d40a9c3..b5b549a04 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -289,10 +289,11 @@ Serialize the pipe to disk. > ner.to_disk("/path/to/ner") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## EntityRecognizer.from_disk {#from_disk tag="method"} @@ -305,11 +306,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > ner.from_disk("/path/to/ner") > ``` -| Name | Type | Description | -| ----------- | ------------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. 
| +| Name | Type | Description | +| -------------- | ------------------ | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | ## EntityRecognizer.to_bytes {#to_bytes tag="method"} @@ -322,10 +324,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | ## EntityRecognizer.from_bytes {#from_bytes tag="method"} @@ -339,11 +342,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > ner.from_bytes(ner_bytes) > ``` -| Name | Type | Description | -| ------------ | ------------------ | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | +| Name | Type | Description | +| -------------- | ------------------ | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | ## EntityRecognizer.labels {#labels tag="property"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 0f7797d7f..7e25106d1 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -645,10 +645,11 @@ the model**. > nlp.to_disk("/path/to/models") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. 
| ## Language.from_disk {#from_disk tag="method" new="2"} @@ -670,11 +671,12 @@ loaded object. > nlp = English().from_disk("/path/to/en_model") > ``` -| Name | Type | Description | -| ----------- | ------------ | ----------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The modified `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The modified `Language` object. | ## Language.to_bytes {#to_bytes tag="method"} @@ -686,10 +688,11 @@ Serialize the current state to a binary string. > nlp_bytes = nlp.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ----------------------------------------------------------------------------------------- | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Language` object. | ## Language.from_bytes {#from_bytes tag="method"} @@ -707,11 +710,12 @@ available to the loaded object. > nlp2.from_bytes(nlp_bytes) > ``` -| Name | Type | Description | -| ------------ | ---------- | ----------------------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Language` | The `Language` object. | +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Language` | The `Language` object. | ## Attributes {#attributes} diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md index a153bd51c..ac7146543 100644 --- a/website/docs/api/morphologizer.md +++ b/website/docs/api/morphologizer.md @@ -276,10 +276,11 @@ Serialize the pipe to disk. > morphologizer.to_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Morphologizer.from_disk {#from_disk tag="method"} @@ -292,11 +293,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > morphologizer.from_disk("/path/to/morphologizer") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | ## Morphologizer.to_bytes {#to_bytes tag="method"} @@ -309,10 +311,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | ## Morphologizer.from_bytes {#from_bytes tag="method"} @@ -326,11 +329,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > morphologizer.from_bytes(morphologizer_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Morphologizer` | The `Morphologizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Morphologizer` | The `Morphologizer` object. 
| ## Morphologizer.labels {#labels tag="property"} diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index a2d055d88..99d06c79f 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -306,10 +306,11 @@ Serialize the pipe to disk. > pipe.to_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Pipe.from_disk {#from_disk tag="method"} @@ -322,11 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > pipe.from_disk("/path/to/pipe") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Pipe` | The modified pipe. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The modified pipe. | ## Pipe.to_bytes {#to_bytes tag="method"} @@ -339,10 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the pipe. | ## Pipe.from_bytes {#from_bytes tag="method"} @@ -356,11 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > pipe.from_bytes(pipe_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| -| **RETURNS** | `Pipe` | The pipe. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Pipe` | The pipe. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md index f7d2ac00f..fdc950bb0 100644 --- a/website/docs/api/sentencerecognizer.md +++ b/website/docs/api/sentencerecognizer.md @@ -291,10 +291,11 @@ Serialize the pipe to disk. > senter.to_disk("/path/to/senter") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## SentenceRecognizer.from_disk {#from_disk tag="method"} @@ -307,11 +308,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > senter.from_disk("/path/to/senter") > ``` -| Name | Type | Description | -| ----------- | -------------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | -------------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | ## SentenceRecognizer.to_bytes {#to_bytes tag="method"} @@ -324,10 +326,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. 
| +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | ## SentenceRecognizer.from_bytes {#from_bytes tag="method"} @@ -341,11 +344,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > senter.from_bytes(senter_bytes) > ``` -| Name | Type | Description | -| ------------ | -------------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | +| Name | Type | Description | +| -------------- | -------------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index cc7401016..37ef13453 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -307,10 +307,11 @@ Serialize the pipe to disk. > tagger.to_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tagger.from_disk {#from_disk tag="method"} @@ -323,11 +324,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tagger.from_disk("/path/to/tagger") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The modified `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. 
| +| **RETURNS** | `Tagger` | The modified `Tagger` object. | ## Tagger.to_bytes {#to_bytes tag="method"} @@ -340,10 +342,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tagger` object. | ## Tagger.from_bytes {#from_bytes tag="method"} @@ -357,11 +360,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tagger.from_bytes(tagger_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tagger` | The `Tagger` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tagger` | The `Tagger` object. | ## Tagger.labels {#labels tag="property"} diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md index c0dd07c1e..1efd5831c 100644 --- a/website/docs/api/textcategorizer.md +++ b/website/docs/api/textcategorizer.md @@ -325,10 +325,11 @@ Serialize the pipe to disk. > textcat.to_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## TextCategorizer.from_disk {#from_disk tag="method"} @@ -341,11 +342,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > textcat.from_disk("/path/to/textcat") > ``` -| Name | Type | Description | -| ----------- | ----------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. 
| -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | ## TextCategorizer.to_bytes {#to_bytes tag="method"} @@ -358,10 +360,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | ## TextCategorizer.from_bytes {#from_bytes tag="method"} @@ -375,11 +378,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > textcat.from_bytes(textcat_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | +| Name | Type | Description | +| -------------- | ----------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | ## TextCategorizer.labels {#labels tag="property"} diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md index 11167c428..f810793ce 100644 --- a/website/docs/api/tok2vec.md +++ b/website/docs/api/tok2vec.md @@ -227,10 +227,11 @@ Serialize the pipe to disk. > tok2vec.to_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. 
Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tok2Vec.from_disk {#from_disk tag="method"} @@ -243,11 +244,12 @@ Load the pipe from disk. Modifies the object in place and returns it. > tok2vec.from_disk("/path/to/tok2vec") > ``` -| Name | Type | Description | -| ----------- | --------------- | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | ## Tok2Vec.to_bytes {#to_bytes tag="method"} @@ -260,10 +262,11 @@ Load the pipe from disk. Modifies the object in place and returns it. Serialize the pipe to a bytestring. -| Name | Type | Description | -| ----------- | --------------- | ------------------------------------------------------------------------- | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | ## Tok2Vec.from_bytes {#from_bytes tag="method"} @@ -277,11 +280,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. > tok2vec.from_bytes(tok2vec_bytes) > ``` -| Name | Type | Description | -| ------------ | --------------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | ## Serialization fields {#serialization-fields} diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 02023cf9f..23b6e4f3f 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -158,10 +158,11 @@ Serialize the tokenizer to disk. 
> tokenizer.to_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Tokenizer.from_disk {#from_disk tag="method"} @@ -174,11 +175,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it. > tokenizer.from_disk("/path/to/tokenizer") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | ## Tokenizer.to_bytes {#to_bytes tag="method"} @@ -191,10 +193,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it. Serialize the tokenizer to a bytestring. -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | ## Tokenizer.from_bytes {#from_bytes tag="method"} @@ -209,11 +212,12 @@ it. > tokenizer.from_bytes(tokenizer_bytes) > ``` -| Name | Type | Description | -| ------------ | ----------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. 
| +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | ## Attributes {#attributes} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index aab02fe68..e89ecb6b7 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -51,11 +51,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | ------------------------------- | ------------------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | | [`null_annotation_setter`](/api/transformer#null_annotation_setter) | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -69,8 +69,14 @@ https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/p > # Construction via add_pipe with default model > trf = nlp.add_pipe("transformer") > -> # Construction via add_pipe with custom model -> config = {"model": {"@architectures": "my_transformer"}} +> # Construction via add_pipe with custom config +> config = { +> "model": { +> "@architectures": "spacy-transformers.TransformerModel.v1", +> "name": "bert-base-uncased", +> "tokenizer_config": {"use_fast": True} +> } +> } > trf = nlp.add_pipe("transformer", config=config) > > # Construction from class @@ -82,26 +88,313 @@ Create a new pipeline instance. In your application, you would normally use a shortcut for this and instantiate the component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). -| Name | Type | Description | -| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | -| `annotation_setter` | `Callable` | | -| _keyword-only_ | | | -| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | -| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. 
| +| Name | Type | Description | +| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared vocabulary. | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | +| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. Defaults to `null_annotation_setter`, a function that does nothing. | +| _keyword-only_ | | | +| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | +| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | - +## Transformer.\_\_call\_\_ {#call tag="method"} + +Apply the pipe to one document. The document is modified in place, and returned. +This usually happens under the hood when the `nlp` object is called on a text +and all pipeline components are applied to the `Doc` in order. Both +[`__call__`](/api/transformer#call) and [`pipe`](/api/transformer#pipe) delegate +to the [`predict`](/api/transformer#predict) and +[`set_annotations`](/api/transformer#set_annotations) methods. + +> #### Example +> +> ```python +> doc = nlp("This is a sentence.") +> trf = nlp.add_pipe("transformer") +> # This usually happens under the hood +> processed = transformer(doc) +> ``` + +| Name | Type | Description | +| ----------- | ----- | ------------------------ | +| `doc` | `Doc` | The document to process. | +| **RETURNS** | `Doc` | The processed document. | + +## Transformer.pipe {#pipe tag="method"} + +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and +[`pipe`](/api/transformer#pipe) delegate to the +[`predict`](/api/transformer#predict) and +[`set_annotations`](/api/transformer#set_annotations) methods. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> for doc in trf.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ----------------------------------------------------- | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| _keyword-only_ | | | +| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | The processed documents in order. | + +## Transformer.begin_training {#begin_training tag="method"} + +Initialize the pipe for training, using data examples if available. Returns an +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = trf.begin_training(pipeline=nlp.pipeline) +> ``` + +| Name | Type | Description | +| -------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | +| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. 
| +| _keyword-only_ | | | +| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Transformer.predict {#predict tag="method"} + +Apply the pipeline's model to a batch of docs, without modifying them. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> scores = trf.predict([doc1, doc2]) +> ``` + +| Name | Type | Description | +| ----------- | --------------- | ----------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to predict. | +| **RETURNS** | - | The model's prediction for each document. | + +## Transformer.set_annotations {#set_annotations tag="method"} + +Modify a batch of documents, using pre-computed scores. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> scores = trf.predict(docs) +> trf.set_annotations(docs, scores) +> ``` + +| Name | Type | Description | +| -------- | --------------- | ----------------------------------------------------- | +| `docs` | `Iterable[Doc]` | The documents to modify. | +| `scores` | - | The scores to set, produced by `Transformer.predict`. | + +## Transformer.update {#update tag="method"} + +Learn from a batch of documents and gold-standard information, updating the +pipe's model. Delegates to [`predict`](/api/transformer#predict). + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = nlp.begin_training() +> losses = trf.update(examples, sgd=optimizer) +> ``` + +| Name | Type | Description | +| ----------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | +| _keyword-only_ | | | +| `drop` | float | The dropout rate. | +| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | +| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | +| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | +| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | + +## Transformer.create_optimizer {#create_optimizer tag="method"} + +Create an optimizer for the pipeline component. + +> #### Example +> +> ```python +> trf = nlp.add_pipe("transformer") +> optimizer = trf.create_optimizer() +> ``` + +| Name | Type | Description | +| ----------- | --------------------------------------------------- | -------------- | +| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | + +## Transformer.use_params {#use_params tag="method, contextmanager"} + +Modify the pipe's model, to use the given parameter values. At the end of the +context, the original parameters are restored. 
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> with trf.use_params(optimizer.averages):
+>     trf.to_disk("/best_model")
+> ```
+
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
+
+## Transformer.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf.to_disk("/path/to/transformer")
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+
+## Transformer.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf.from_disk("/path/to/transformer")
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Transformer` | The modified `Transformer` object. |
+
+## Transformer.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> trf = nlp.add_pipe("transformer")
+> trf_bytes = trf.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Type | Description |
+| -------------- | --------------- | ------------------------------------------------------------------------- |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | bytes | The serialized form of the `Transformer` object. |
+
+## Transformer.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> trf_bytes = trf.to_bytes()
+> trf = nlp.add_pipe("transformer")
+> trf.from_bytes(trf_bytes)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | ------------------------------------------------------------------------- |
+| `bytes_data` | bytes | The data to load from. |
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Transformer` | The `Transformer` object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = trf.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. 
| ## TransformerData {#transformerdata tag="dataclass"} +Transformer tokens and outputs for one `Doc` object. + +| Name | Type | Description | +| --------- | -------------------------------------------------- | ----------------------------------------- | +| `tokens` | `Dict` | | +| `tensors` | `List[FloatsXd]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `width` | int | | + +### TransformerData.empty {#transformerdata-emoty tag="classmethod"} + + + +| Name | Type | Description | +| ----------- | ----------------- | -------------- | +| **RETURNS** | `TransformerData` | | + ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} + + +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=batchencoding#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | + +### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} + + + +| Name | Type | Description | +| ----------- | ---------------------- | -------------- | +| `arrays` | `List[List[Floats3d]]` | | +| **RETURNS** | `FullTransformerBatch` | | + +### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} + +Split a `TransformerData` object that represents a batch into a list with one +`TransformerData` per `Doc`. + +| Name | Type | Description | +| ----------- | ----------------------- | -------------- | +| **RETURNS** | `List[TransformerData]` | | + ## Custom attributes {#custom-attributes} The component sets the following [custom extension attributes](/usage/processing-pipeline#custom-components-attributes): -| Name | Type | Description | -| -------------- | ----------------- | -------------- | -| `Doc.trf_data` | `TransformerData` | | +| Name | Type | Description | +| -------------- | ----------------------------------------------------- | ---------------------------------------------------- | +| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index c68af2047..d5c9b0ff0 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -230,10 +230,11 @@ Save the current state to a directory. > nlp.vocab.to_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. 
| +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | ## Vocab.from_disk {#from_disk tag="method" new="2"} @@ -246,11 +247,12 @@ Loads state from a directory. Modifies the object in place and returns it. > vocab = Vocab().from_disk("/path/to/vocab") > ``` -| Name | Type | Description | -| ----------- | ------------ | -------------------------------------------------------------------------- | -| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The modified `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The modified `Vocab` object. | ## Vocab.to_bytes {#to_bytes tag="method"} @@ -262,10 +264,11 @@ Serialize the current state to a binary string. > vocab_bytes = nlp.vocab.to_bytes() > ``` -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------------------- | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | bytes | The serialized form of the `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Vocab` object. | ## Vocab.from_bytes {#from_bytes tag="method"} @@ -280,11 +283,12 @@ Load state from a binary string. > vocab.from_bytes(vocab_bytes) > ``` -| Name | Type | Description | -| ------------ | ------- | ------------------------------------------------------------------------- | -| `bytes_data` | bytes | The data to load from. | -| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | -| **RETURNS** | `Vocab` | The `Vocab` object. | +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Vocab` | The `Vocab` object. 
| ## Attributes {#attributes} From f7adc9d3b713ea473469e2371f5ce816bdc7e406 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2020 17:10:06 +0200 Subject: [PATCH 49/55] Start rewriting vectors docs --- website/docs/usage/vectors-embeddings.md | 156 ++++++++++------------- 1 file changed, 68 insertions(+), 88 deletions(-) diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md index 7725068ec..8f6315901 100644 --- a/website/docs/usage/vectors-embeddings.md +++ b/website/docs/usage/vectors-embeddings.md @@ -5,54 +5,82 @@ menu: - ['Other Embeddings', 'embeddings'] --- - - ## Word vectors and similarity -> #### Training word vectors -> -> Dense, real valued vectors representing distributional similarity information -> are now a cornerstone of practical NLP. The most common way to train these -> vectors is the [Word2vec](https://en.wikipedia.org/wiki/Word2vec) family of -> algorithms. If you need to train a word2vec model, we recommend the -> implementation in the Python library -> [Gensim](https://radimrehurek.com/gensim/). +An old idea in linguistics is that you can "know a word by the company it +keeps": that is, word meanings can be understood relationally, based on their +patterns of usage. This idea inspired a branch of NLP research known as +"distributional semantics" that has aimed to compute databases of lexical knowledge +automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) family of +algorithms are a key milestone in this line of research. For simplicity, we +will refer to a distributional word representation as a "word vector", and +algorithms that computes word vectors (such as GloVe, FastText, etc) as +"word2vec algorithms". -import Vectors101 from 'usage/101/\_vectors-similarity.md' +Word vector tables are included in some of the spaCy model packages we +distribute, and you can easily create your own model packages with word vectors +you train or download yourself. In some cases you can also add word vectors to +an existing pipeline, although each pipeline can only have a single word +vectors table, and a model package that already has word vectors is unlikely to +work correctly if you replace the vectors with new ones. - +## What's a word vector? -### Customizing word vectors {#custom} +For spaCy's purposes, a "word vector" is a 1-dimensional slice from +a 2-dimensional _vectors table_, with a deterministic mapping from word types +to rows in the table. -Word vectors let you import knowledge from raw text into your model. The -knowledge is represented as a table of numbers, with one row per term in your -vocabulary. If two terms are used in similar contexts, the algorithm that learns -the vectors should assign them **rows that are quite similar**, while words that -are used in different contexts will have quite different values. This lets you -use the row-values assigned to the words as a kind of dictionary, to tell you -some things about what the words in your text mean. +```python +def what_is_a_word_vector( + word_id: int, + key2row: Dict[int, int], + vectors_table: Floats2d, + *, + default_row: int=0 +) -> Floats1d: + return vectors_table[key2row.get(word_id, default_row)] +``` -Word vectors are particularly useful for terms which **aren't well represented -in your labelled training data**. For instance, if you're doing named entity -recognition, there will always be lots of names that you don't have examples of. 
-For instance, imagine your training data happens to contain some examples of the -term "Microsoft", but it doesn't contain any examples of the term "Symantec". In -your raw text sample, there are plenty of examples of both terms, and they're -used in similar contexts. The word vectors make that fact available to the -entity recognition model. It still won't see examples of "Symantec" labelled as -a company. However, it'll see that "Symantec" has a word vector that usually -corresponds to company terms, so it can **make the inference**. +word2vec algorithms try to produce vectors tables that let you estimate useful +relationships between words using simple linear algebra operations. For +instance, you can often find close synonyms of a word by finding the vectors +closest to it by cosine distance, and then finding the words that are mapped to +those neighboring vectors. Word vectors can also be useful as features in +statistical models. -In order to make best use of the word vectors, you want the word vectors table -to cover a **very large vocabulary**. However, most words are rare, so most of -the rows in a large word vectors table will be accessed very rarely, or never at -all. You can usually cover more than **95% of the tokens** in your corpus with -just **a few thousand rows** in the vector table. However, it's those **5% of -rare terms** where the word vectors are **most useful**. The problem is that -increasing the size of the vector table produces rapidly diminishing returns in -coverage over these rare terms. +The key difference between word vectors and contextual language models such as +ElMo, BERT and GPT-2 is that word vectors model _lexical types_, rather than +_tokens_. If you have a list of terms with no context around them, a model like +BERT can't really help you. BERT is designed to understand language in context, +which isn't what you have. A word vectors table will be a much better fit for +your task. However, if you do have words in context --- whole sentences or +paragraphs of running text --- word vectors will only provide a very rough +approximation of what the text is about. -### Converting word vectors for use in spaCy {#converting new="2.0.10"} +Word vectors are also very computationally efficient, as they map a word to a +vector with a single indexing operation. Word vectors are therefore useful as a +way to improve the accuracy of neural network models, especially models that +are small or have received little or no pretraining. In spaCy, word vector +tables are only used as static features. spaCy does not backpropagate gradients +to the pretrained word vectors table. The static vectors table is usually used +in combination with a smaller table of learned task-specific embeddings. + +## Using word vectors directly + +spaCy stores word vector information in the `vocab.vectors` attribute, so you +can access the whole vectors table from most spaCy objects. You can also access +the vector for a `Doc`, `Span`, `Token` or `Lexeme` instance via the `vector` +attribute. If your `Doc` or `Span` has multiple tokens, the average of the +word vectors will be returned, excluding any "out of vocabulary" entries that +have no vector available. If none of the words have a vector, a zeroed vector +will be returned. + +The `vector` attribute is a read-only numpy or cupy array (depending on whether +you've configured spaCy to use GPU memory), with dtype `float32`. The array is +read-only so that spaCy can avoid unnecessary copy operations where possible. 
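+
+As a quick, minimal sketch of the points above (assuming a package with word
+vectors such as `en_core_web_md` is installed), you can compare the vectors
+attached to individual tokens and to a whole `Doc`:
+
+```python
+### Accessing vectors
+import spacy
+
+nlp = spacy.load("en_core_web_md")  # assumes a model with word vectors
+doc = nlp("The cat sat on the mat.")
+
+# Each token exposes its row from the vectors table (zeros if it has no vector)
+for token in doc:
+    print(token.text, token.has_vector, token.vector_norm)
+
+# Doc and Span vectors default to the average over their tokens' vectors
+print(doc.vector.shape)
+print(doc[1].similarity(doc[5]))  # compare "cat" and "mat"
+```
+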
+You can modify the vectors via the `Vocab` or `Vectors` table. + +### Converting word vectors for use in spaCy Custom word vectors can be trained using a number of open-source libraries, such as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), @@ -151,20 +179,7 @@ This will create a spaCy model with vectors for the first 10,000 words in the vectors model. All other words in the vectors model are mapped to the closest vector among those retained. -### Adding vectors {#custom-vectors-add new="2"} - -spaCy's new [`Vectors`](/api/vectors) class greatly improves the way word -vectors are stored, accessed and used. The data is stored in two structures: - -- An array, which can be either on CPU or [GPU](#gpu). -- A dictionary mapping string-hashes to rows in the table. - -Keep in mind that the `Vectors` class itself has no -[`StringStore`](/api/stringstore), so you have to store the hash-to-string -mapping separately. If you need to manage the strings, you should use the -`Vectors` via the [`Vocab`](/api/vocab) class, e.g. `vocab.vectors`. To add -vectors to the vocabulary, you can use the -[`Vocab.set_vector`](/api/vocab#set_vector) method. +### Adding vectors ```python ### Adding vectors @@ -196,38 +211,3 @@ For more details on **adding hooks** and **overwriting** the built-in `Doc`, ### Storing vectors on a GPU {#gpu} -If you're using a GPU, it's much more efficient to keep the word vectors on the -device. You can do that by setting the [`Vectors.data`](/api/vectors#attributes) -attribute to a `cupy.ndarray` object if you're using spaCy or -[Chainer](https://chainer.org), or a `torch.Tensor` object if you're using -[PyTorch](http://pytorch.org). The `data` object just needs to support -`__iter__` and `__getitem__`, so if you're using another library such as -[TensorFlow](https://www.tensorflow.org), you could also create a wrapper for -your vectors data. 
- -```python -### spaCy, Thinc or Chainer -import cupy.cuda -from spacy.vectors import Vectors - -vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors(["dog", "cat", "orange"], vector_table) -with cupy.cuda.Device(0): - vectors.data = cupy.asarray(vectors.data) -``` - -```python -### PyTorch -import torch -from spacy.vectors import Vectors - -vector_table = numpy.zeros((3, 300), dtype="f") -vectors = Vectors(["dog", "cat", "orange"], vector_table) -vectors.data = torch.Tensor(vectors.data).cuda(0) -``` - -## Other embeddings {#embeddings} - - - - From 158d8c1e48961f8c962df01f72e5818f3ec2651d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 18:44:10 +0200 Subject: [PATCH 50/55] Update docs [ci skip] --- website/docs/api/architectures.md | 2 + website/docs/api/top-level.md | 25 ++ website/docs/api/transformer.md | 82 +++++- website/docs/images/pipeline_transformer.svg | 37 +++ website/docs/usage/transformers.md | 294 +++++++++++++------ 5 files changed, 347 insertions(+), 93 deletions(-) create mode 100644 website/docs/images/pipeline_transformer.svg diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index a87c2a1e8..43387b8ca 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -26,6 +26,8 @@ TODO: intro and how architectures work, link to ### spacy-transformers.TransformerModel.v1 {#TransformerModel} +### spacy-transformers.Tok2VecListener.v1 {#spacy-transformers.Tok2VecListener.v1} + ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index a463441c7..ede7f9e21 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -304,6 +304,31 @@ factories. | `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | | `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). | +### spacy-transformers registry {#registry-transformers} + +The following registries are added by the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package. +See the [`Transformer`](/api/transformer) API reference and +[usage docs](/usage/transformers) for details. + +> #### Example +> +> ```python +> import spacy_transformers +> +> @spacy_transformers.registry.annotation_setters("my_annotation_setter.v1") +> def configure_custom_annotation_setter(): +> def annotation_setter(docs, trf_data) -> None: +> # Set annotations on the docs +> +> return annotation_sette +> ``` + +| Registry name | Description | +| ------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [`span_getters`](/api/transformer#span_getters) | Registry for functions that take a batch of `Doc` objects and return a list of `Span` objects to process by the transformer, e.g. sentences. | +| [`annotation_setters`](/api/transformers#annotation_setters) | Registry for functions that create annotation setters. Annotation setters are functions that take a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. 
| + ## Training data and alignment {#gold source="spacy/gold"} ### gold.docs_to_json {#docs_to_json tag="function"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index e89ecb6b7..386f65a0a 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -31,8 +31,10 @@ attributes. We also calculate an alignment between the word-piece tokens and the spaCy tokenization, so that we can use the last hidden states to set the `Doc.tensor` attribute. When multiple word-piece tokens align to the same spaCy token, the spaCy token receives the sum of their values. To access the values, -you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. For -more details, see the [usage documentation](/usage/transformers). +you can use the custom [`Doc._.trf_data`](#custom-attributes) attribute. The +package also adds the function registries [`@span_getters`](#span_getters) and +[`@annotation_setters`](#annotation_setters) with several built-in registered +functions. For more details, see the [usage documentation](/usage/transformers). ## Config and implementation {#config} @@ -51,11 +53,11 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > ``` -| Setting | Type | Description | Default | -| ------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | -| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | -| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | +| Setting | Type | Description | Default | +| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | +| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | +| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. | `null_annotation_setter` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransformerModel](/api/architectures#TransformerModel) | ```python https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py @@ -390,6 +392,72 @@ Split a `TransformerData` object that represents a batch into a list with one | ----------- | ----------------------- | -------------- | | **RETURNS** | `List[TransformerData]` | | +## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} + +Span getters are functions that take a batch of [`Doc`](/api/doc) objects and +return a lists of [`Span`](/api/span) objects for each doc, to be processed by +the transformer. The returned spans can overlap. 
+ + Span getters can be referenced in the + +config's `[components.transformer.model.get_spans]` block to customize the +sequences processed by the transformer. You can also register custom span +getters using the `@registry.span_getters` decorator. + +> #### Example +> +> ```python +> @registry.span_getters("sent_spans.v1") +> def configure_get_sent_spans() -> Callable: +> def get_sent_spans(docs: Iterable[Doc]) -> List[List[Span]]: +> return [list(doc.sents) for doc in docs] +> +> return get_sent_spans +> ``` + +| Name | Type | Description | +| ----------- | ------------------ | ------------------------------------------------------------ | +| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | +| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer, one list per `Doc`. | + +The following built-in functions are available: + +| Name | Description | +| ------------------ | ------------------------------------------------------------------ | +| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | +| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. | +| `strided_spans.v1` | | + +## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} + +Annotation setters are functions that that take a batch of `Doc` objects and a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set +additional annotations on the `Doc`, e.g. to set custom or built-in attributes. +You can register custom annotation setters using the +`@registry.annotation_setters` decorator. + +> #### Example +> +> ```python +> @registry.annotation_setters("spacy-transformer.null_annotation_setter.v1") +> def configure_null_annotation_setter() -> Callable: +> def setter(docs: List[Doc], trf_data: FullTransformerBatch) -> None: +> pass +> +> return setter +> ``` + +| Name | Type | Description | +| ---------- | ---------------------- | ------------------------------------ | +| `docs` | `List[Doc]` | A batch of `Doc` objects. | +| `trf_data` | `FullTransformerBatch` | The transformers data for the batch. | + +The following built-in functions are available: + +| Name | Description | +| --------------------------------------------- | ------------------------------------- | +| `spacy-transformer.null_annotation_setter.v1` | Don't set any additional annotations. | + ## Custom attributes {#custom-attributes} The component sets the following diff --git a/website/docs/images/pipeline_transformer.svg b/website/docs/images/pipeline_transformer.svg new file mode 100644 index 000000000..cfbf470cc --- /dev/null +++ b/website/docs/images/pipeline_transformer.svg @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index d5ce4e891..791eaac37 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -1,10 +1,17 @@ --- title: Transformers teaser: Using transformer models like BERT in spaCy +menu: + - ['Installation', 'install'] + - ['Runtime Usage', 'runtime'] + - ['Training Usage', 'training'] --- +## Installation {#install hidden="true"} + spaCy v3.0 lets you use almost **any statistical model** to power your pipeline. 
-You can use models implemented in a variety of frameworks, including TensorFlow, +You can use models implemented in a variety of +[frameworks](https://thinc.ai/docs/usage-frameworks), including TensorFlow, PyTorch and MXNet. To keep things sane, spaCy expects models from these frameworks to be wrapped with a common interface, using our machine learning library [Thinc](https://thinc.ai). A transformer model is just a statistical @@ -15,34 +22,110 @@ that do the required plumbing. We also provide a pipeline component, [`Transformer`](/api/transformer), that lets you do multi-task learning and lets you save the transformer outputs for later use. - +To use transformers with spaCy, you need the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +installed. It takes care of all the setup behind the scenes, and makes sure the +transformer pipeline component is available to spaCy. -Try out a BERT-based model pipeline using this project template: swap in your -data, edit the settings and hyperparameters and train, evaluate, package and -visualize your model. +```bash +$ pip install spacy-transformers +``` - + - + + + +### Customizing the settings {#training-custom-settings} + +To change any of the settings, you can edit the `config.cfg` and re-run the +training. To change any of the functions, like the span getter, you can replace +the name of the referenced function – e.g. `@span_getters = "sent_spans.v1"` to +process sentences. You can also register your own functions using the +`span_getters` registry: + +> #### config.cfg +> +> ```ini +> [components.transformer.model.get_spans] +> @span_getters = "custom_sent_spans" +> ``` + ```python -from spacy_transformers import Transformer +### code.py +import spacy_transformers -trf = Transformer( - nlp.vocab, - TransformerModel( - "bert-base-cased", - get_spans=get_doc_spans, - tokenizer_config={"use_fast": True}, - ), - annotation_setter=null_annotation_setter, - max_batch_size=32, -) +@spacy_transformers.registry.span_getters("custom_sent_spans") +def configure_custom_sent_spans(): + # TODO: write custom example + def get_sent_spans(docs): + return [list(doc.sents) for doc in docs] + + return get_sent_spans ``` -The `components.transformer` block adds the `transformer` component to the -pipeline, and the `components.transformer.model` block describes the creation of -a Thinc [`Model`](https://thinc.ai/docs/api-model) object that will be passed -into the component. The block names a function registered in the -`@architectures` registry. This function will be looked up and called using the -provided arguments. You're not limited to just that function --- you can write -your own or use someone else's. The only limitation is that it must return an -object of type `Model[List[Doc], FullTransformerBatch]`: that is, a Thinc model -that takes a list of `Doc` objects, and returns a `FullTransformerBatch` object -with the transformer data. +To resolve the config during training, spaCy needs to know about your custom +function. You can make it available via the `--code` argument that can point to +a Python file: -The same idea applies to task models that power the downstream components. Most -of spaCy's built-in model creation functions support a `tok2vec` argument, which -should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This is -where we'll plug in our transformer model, using the `Tok2VecTransformer` layer, -which sneakily delegates to the `Transformer` pipeline component. 
+```bash +$ python -m spacy train ./train.spacy ./dev.spacy ./config.cfg --code ./code.py +``` + +### Customizing the model implementations {#training-custom-model} + +The [`Transformer`](/api/transformer) component expects a Thinc +[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` +argument. You're not limited to the implementation provided by +`spacy-transformers` – the only requirement is that your registered function +must return an object of type `Model[List[Doc], FullTransformerBatch]`: that is, +a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a +[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the +transformer data. + +> #### Model type annotations +> +> In the documentation and code base, you may come across type annotations and +> descriptions of [Thinc](https://thinc.ai) model types, like +> `Model[List[Doc], List[Floats2d]]`. This so-called generic type describes the +> layer and its input and output type – in this case, it takes a list of `Doc` +> objects as the input and list of 2-dimensional arrays of floats as the output. +> You can read more about defining Thinc +> models [here](https://thinc.ai/docs/usage-models). Also see the +> [type checking](https://thinc.ai/docs/usage-type-checking) for how to enable +> linting in your editor to see live feedback if your inputs and outputs don't +> match. + +The same idea applies to task models that power the **downstream components**. +Most of spaCy's built-in model creation functions support a `tok2vec` argument, +which should be a Thinc layer of type `Model[List[Doc], List[Floats2d]]`. This +is where we'll plug in our transformer model, using the +[Tok2VecListener](/api/architectures#Tok2VecListener) layer, which sneakily +delegates to the `Transformer` pipeline component. ```ini -[nlp] -lang = "en" -pipeline = ["ner"] - +### config.cfg (excerpt) {highlight="12"} [components.ner] factory = "ner" @@ -108,49 +255,24 @@ grad_factor = 1.0 @layers = "reduce_mean.v1" ``` -The `Tok2VecListener` layer expects a `pooling` layer, which needs to be of type -`Model[Ragged, Floats2d]`. This layer determines how the vector for each spaCy -token will be computed from the zero or more source rows the token is aligned -against. Here we use the `reduce_mean` layer, which averages the wordpiece rows. -We could instead use `reduce_last`, `reduce_max`, or a custom function you write -yourself. +The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a +[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops), which needs to +be of type `Model[Ragged, Floats2d]`. This layer determines how the vector for +each spaCy token will be computed from the zero or more source rows the token is +aligned against. Here we use the +[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which +averages the wordpiece rows. We could instead use `reduce_last`, +[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom +function you write yourself. + + You can have multiple components all listening to the same transformer model, and all passing gradients back to it. By default, all of the gradients will be -equally weighted. You can control this with the `grad_factor` setting, which +**equally weighted**. You can control this with the `grad_factor` setting, which lets you reweight the gradients from the different listeners. 
For instance, setting `grad_factor = 0` would disable gradients from one of the listeners, while `grad_factor = 2.0` would multiply them by 2. This is similar to having a custom learning rate for each component. Instead of a constant, you can also provide a schedule, allowing you to freeze the shared parameters at the start of training. - -### Runtime usage - -Transformer models can be used as drop-in replacements for other types of neural -networks, so your spaCy pipeline can include them in a way that's completely -invisible to the user. Users will download, load and use the model in the -standard way, like any other spaCy pipeline. - -Instead of using the transformers as subnetworks directly, you can also use them -via the [`Transformer`](/api/transformer) pipeline component. This sets the -[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, -which lets you access the transformers outputs at runtime via the -`doc._.trf_data` extension attribute. You can also customize how the -`Transformer` object sets annotations onto the `Doc`, by customizing the -`Transformer.annotation_setter` object. This callback will be called with the -raw input and output data for the whole batch, along with the batch of `Doc` -objects, allowing you to implement whatever you need. - -```python -import spacy - -nlp = spacy.load("en_core_trf_lg") -for doc in nlp.pipe(["some text", "some other text"]): - doc._.trf_data.tensors - tokvecs = doc._.trf_data.tensors[-1] -``` - -The `nlp` object in this example is just like any other spaCy pipeline - - --> From 6a5c853edb61557fa5c4d0ca90172e318afe951d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 18:45:12 +0200 Subject: [PATCH 51/55] Fix docs [ci skip] --- website/docs/api/architectures.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 43387b8ca..534f0bdf0 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -26,7 +26,7 @@ TODO: intro and how architectures work, link to ### spacy-transformers.TransformerModel.v1 {#TransformerModel} -### spacy-transformers.Tok2VecListener.v1 {#spacy-transformers.Tok2VecListener.v1} +### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener} ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} From 9f69afdd1e1a059ed855a7830318091bb9ab5271 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 19:09:44 +0200 Subject: [PATCH 52/55] Update docs [ci skip] --- website/docs/api/transformer.md | 19 ++-- website/docs/usage/transformers.md | 1 + website/docs/usage/vectors-embeddings.md | 107 +++++++++++++---------- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 386f65a0a..764b3dd88 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -394,14 +394,13 @@ Split a `TransformerData` object that represents a batch into a list with one ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} + + Span getters are functions that take a batch of [`Doc`](/api/doc) objects and return a lists of [`Span`](/api/span) objects for each doc, to be processed by -the transformer. The returned spans can overlap. 
- - Span getters can be referenced in the - -config's `[components.transformer.model.get_spans]` block to customize the -sequences processed by the transformer. You can also register custom span +the transformer. The returned spans can overlap. Span getters can be referenced +in the config's `[components.transformer.model.get_spans]` block to customize +the sequences processed by the transformer. You can also register custom span getters using the `@registry.span_getters` decorator. > #### Example @@ -415,10 +414,10 @@ getters using the `@registry.span_getters` decorator. > return get_sent_spans > ``` -| Name | Type | Description | -| ----------- | ------------------ | ------------------------------------------------------------ | -| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | -| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer, one list per `Doc`. | +| Name | Type | Description | +| ----------- | ------------------ | ---------------------------------------- | +| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | +| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. | The following built-in functions are available: diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index 791eaac37..0c98ad630 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -5,6 +5,7 @@ menu: - ['Installation', 'install'] - ['Runtime Usage', 'runtime'] - ['Training Usage', 'training'] +next: /usage/training --- ## Installation {#install hidden="true"} diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md index 8f6315901..823b30c20 100644 --- a/website/docs/usage/vectors-embeddings.md +++ b/website/docs/usage/vectors-embeddings.md @@ -1,34 +1,35 @@ --- -title: Word Vectors and Embeddings +title: Vectors and Embeddings menu: + - ["What's a Word Vector?", 'whats-a-vector'] - ['Word Vectors', 'vectors'] - ['Other Embeddings', 'embeddings'] +next: /usage/transformers --- -## Word vectors and similarity - An old idea in linguistics is that you can "know a word by the company it keeps": that is, word meanings can be understood relationally, based on their patterns of usage. This idea inspired a branch of NLP research known as -"distributional semantics" that has aimed to compute databases of lexical knowledge -automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) family of -algorithms are a key milestone in this line of research. For simplicity, we -will refer to a distributional word representation as a "word vector", and -algorithms that computes word vectors (such as GloVe, FastText, etc) as -"word2vec algorithms". +"distributional semantics" that has aimed to compute databases of lexical +knowledge automatically. The [Word2vec](https://en.wikipedia.org/wiki/Word2vec) +family of algorithms are a key milestone in this line of research. For +simplicity, we will refer to a distributional word representation as a "word +vector", and algorithms that computes word vectors (such as +[GloVe](https://nlp.stanford.edu/projects/glove/), +[FastText](https://fasttext.cc), etc.) as "Word2vec algorithms". -Word vector tables are included in some of the spaCy model packages we -distribute, and you can easily create your own model packages with word vectors -you train or download yourself. 
In some cases you can also add word vectors to -an existing pipeline, although each pipeline can only have a single word -vectors table, and a model package that already has word vectors is unlikely to -work correctly if you replace the vectors with new ones. +Word vector tables are included in some of the spaCy [model packages](/models) +we distribute, and you can easily create your own model packages with word +vectors you train or download yourself. In some cases you can also add word +vectors to an existing pipeline, although each pipeline can only have a single +word vectors table, and a model package that already has word vectors is +unlikely to work correctly if you replace the vectors with new ones. -## What's a word vector? +## What's a word vector? {#whats-a-vector} -For spaCy's purposes, a "word vector" is a 1-dimensional slice from -a 2-dimensional _vectors table_, with a deterministic mapping from word types -to rows in the table. +For spaCy's purposes, a "word vector" is a 1-dimensional slice from a +2-dimensional **vectors table**, with a deterministic mapping from word types to +rows in the table. ```python def what_is_a_word_vector( @@ -41,51 +42,55 @@ def what_is_a_word_vector( return vectors_table[key2row.get(word_id, default_row)] ``` -word2vec algorithms try to produce vectors tables that let you estimate useful +Word2vec algorithms try to produce vectors tables that let you estimate useful relationships between words using simple linear algebra operations. For instance, you can often find close synonyms of a word by finding the vectors closest to it by cosine distance, and then finding the words that are mapped to those neighboring vectors. Word vectors can also be useful as features in statistical models. +### Word vectors vs. contextual language models {#vectors-vs-language-models} + The key difference between word vectors and contextual language models such as -ElMo, BERT and GPT-2 is that word vectors model _lexical types_, rather than +ElMo, BERT and GPT-2 is that word vectors model **lexical types**, rather than _tokens_. If you have a list of terms with no context around them, a model like -BERT can't really help you. BERT is designed to understand language in context, -which isn't what you have. A word vectors table will be a much better fit for -your task. However, if you do have words in context --- whole sentences or -paragraphs of running text --- word vectors will only provide a very rough +BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a vector with a single indexing operation. Word vectors are therefore useful as a -way to improve the accuracy of neural network models, especially models that +way to **improve the accuracy** of neural network models, especially models that are small or have received little or no pretraining. In spaCy, word vector -tables are only used as static features. spaCy does not backpropagate gradients -to the pretrained word vectors table. The static vectors table is usually used -in combination with a smaller table of learned task-specific embeddings. +tables are only used as **static features**. 
spaCy does not backpropagate +gradients to the pretrained word vectors table. The static vectors table is +usually used in combination with a smaller table of learned task-specific +embeddings. -## Using word vectors directly +## Using word vectors directly {#vectors} -spaCy stores word vector information in the `vocab.vectors` attribute, so you -can access the whole vectors table from most spaCy objects. You can also access -the vector for a `Doc`, `Span`, `Token` or `Lexeme` instance via the `vector` -attribute. If your `Doc` or `Span` has multiple tokens, the average of the -word vectors will be returned, excluding any "out of vocabulary" entries that -have no vector available. If none of the words have a vector, a zeroed vector -will be returned. +spaCy stores word vector information in the +[`Vocab.vectors`](/api/vocab#attributes) attribute, so you can access the whole +vectors table from most spaCy objects. You can also access the vector for a +[`Doc`](/api/doc), [`Span`](/api/span), [`Token`](/api/token) or +[`Lexeme`](/api/lexeme) instance via the `vector` attribute. If your `Doc` or +`Span` has multiple tokens, the average of the word vectors will be returned, +excluding any "out of vocabulary" entries that have no vector available. If none +of the words have a vector, a zeroed vector will be returned. -The `vector` attribute is a read-only numpy or cupy array (depending on whether -you've configured spaCy to use GPU memory), with dtype `float32`. The array is -read-only so that spaCy can avoid unnecessary copy operations where possible. -You can modify the vectors via the `Vocab` or `Vectors` table. +The `vector` attribute is a **read-only** numpy or cupy array (depending on +whether you've configured spaCy to use GPU memory), with dtype `float32`. The +array is read-only so that spaCy can avoid unnecessary copy operations where +possible. You can modify the vectors via the `Vocab` or `Vectors` table. ### Converting word vectors for use in spaCy Custom word vectors can be trained using a number of open-source libraries, such as [Gensim](https://radimrehurek.com/gensim), [Fast Text](https://fasttext.cc), or Tomas Mikolov's original -[word2vec implementation](https://code.google.com/archive/p/word2vec/). Most +[Word2vec implementation](https://code.google.com/archive/p/word2vec/). Most word vector libraries output an easy-to-read text-based format, where each line consists of the word followed by its vector. For everyday use, we want to convert the vectors model into a binary format that loads faster and takes up @@ -165,11 +170,10 @@ the two words. In the example above, the vector for "Shore" was removed and remapped to the vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to -the vector of "leaving", which is identical. - -If you're using the [`init-model`](/api/cli#init-model) command, you can set the -`--prune-vectors` option to easily reduce the size of the vectors as you add -them to a spaCy model: +the vector of "leaving", which is identical. If you're using the +[`init-model`](/api/cli#init-model) command, you can set the `--prune-vectors` +option to easily reduce the size of the vectors as you add them to a spaCy +model: ```bash $ python -m spacy init-model /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 @@ -179,7 +183,7 @@ This will create a spaCy model with vectors for the first 10,000 words in the vectors model. All other words in the vectors model are mapped to the closest vector among those retained. 
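+
+To see the effect of pruning, you can inspect the resulting vectors table. The
+snippet below is only a rough sketch: it assumes the pruned model created by
+the command above, and which words end up sharing a row depends entirely on the
+vectors you pruned. Here we reuse "Shore" and "coast" from the example above:
+
+```python
+### Inspecting a pruned vectors table
+import spacy
+
+nlp = spacy.load("/tmp/la_vectors_web_md")
+vectors = nlp.vocab.vectors
+
+# After pruning, more keys (words) are mapped than there are rows in the table
+assert vectors.n_keys > vectors.shape[0]
+
+# A pruned word shares its row with the closest retained word
+row1 = vectors.key2row.get(nlp.vocab.strings["Shore"])
+row2 = vectors.key2row.get(nlp.vocab.strings["coast"])
+print(row1 is not None and row1 == row2)
+```
+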
-### Adding vectors +### Adding vectors {#adding-vectors} ```python ### Adding vectors @@ -209,5 +213,12 @@ For more details on **adding hooks** and **overwriting** the built-in `Doc`, + + +## Other embeddings {#embeddings} + + From 9c80cb673d843bc074341d04a144b702a30d3eb8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 19:41:34 +0200 Subject: [PATCH 53/55] Update docs [ci skip] --- website/docs/api/architectures.md | 45 ++++++++++++++++++++++ website/docs/api/transformer.md | 14 +++---- website/docs/usage/processing-pipelines.md | 22 ++++++----- website/docs/usage/transformers.md | 12 +++--- website/src/widgets/quickstart-install.js | 26 ++++++++++--- 5 files changed, 92 insertions(+), 27 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 534f0bdf0..95f7d0597 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -24,10 +24,55 @@ TODO: intro and how architectures work, link to ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"} +The following architectures are provided by the package +[`spacy-transformers`](https://github.com/explosion/spacy-transformers). See the +[usage documentation](/usage/transformers) for how to integrate the +architectures into your training config. + ### spacy-transformers.TransformerModel.v1 {#TransformerModel} + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy-transformers.TransformerModel.v1" +> name = "roberta-base" +> tokenizer_config = {"use_fast": true} +> +> [model.get_spans] +> @span_getters = "strided_spans.v1" +> window = 128 +> stride = 96 +> ``` + +| Name | Type | Description | +| ------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `name` | str | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). | +| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. | +| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). | + ### spacy-transformers.Tok2VecListener.v1 {#Tok2VecListener} + + +> #### Example Config +> +> ```ini +> [model] +> @architectures = "spacy-transformers.Tok2VecListener.v1" +> grad_factor = 1.0 +> +> [model.pooling] +> @layers = "reduce_mean.v1" +> ``` + +| Name | Type | Description | +| ------------- | ------------------------- | ---------------------------------------------------------------------------------------------- | +| `grad_factor` | float | Factor for weighting the gradient if multiple components listen to the same transformer model. | +| `pooling` | `Model[Ragged, Floats2d]` | Pooling layer to determine how the vector for each spaCy token will be computed. 
| + ## Parser & NER architectures {#parser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v1 {#TransitionBasedParser} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 764b3dd88..70128d225 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -366,13 +366,13 @@ Transformer tokens and outputs for one `Doc` object. -| Name | Type | Description | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | -| `spans` | `List[List[Span]]` | | -| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=batchencoding#transformers.BatchEncoding) | | -| `tensors` | `List[torch.Tensor]` | | -| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | -| `doc_data` | `List[TransformerData]` | | +| Name | Type | Description | +| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | +| `spans` | `List[List[Span]]` | | +| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | | +| `tensors` | `List[torch.Tensor]` | | +| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | | +| `doc_data` | `List[TransformerData]` | | ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 08e8e964f..56ade692a 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -220,15 +220,19 @@ available pipeline components and component functions. > ruler = nlp.add_pipe("entity_ruler") > ``` -| String name | Component | Description | -| --------------- | ------------------------------------------- | ----------------------------------------------------------------------------------------- | -| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | -| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | -| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | -| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. | -| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | -| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. | -| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| String name | Component | Description | +| --------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- | +| `tagger` | [`Tagger`](/api/tagger) | Assign part-of-speech-tags. | +| `parser` | [`DependencyParser`](/api/dependencyparser) | Assign dependency labels. | +| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | +| `entity_linker` | [`EntityLinker`](/api/entitylinker) | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. 
| +| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules and dictionaries. | +| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | +| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | +| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | +| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | | +| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index 0c98ad630..a7fd83ac6 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -101,7 +101,9 @@ evaluate, package and visualize your model. The `[components]` section in the [`config.cfg`](#TODO:) describes the pipeline components and the settings used to construct them, including their model implementation. Here's a config snippet for the -[`Transformer`](/api/transformer) component, along with matching Python code: +[`Transformer`](/api/transformer) component, along with matching Python code. In +this case, the `[components.transformer]` block describes the `transformer` +component: > #### Python equivalent > @@ -257,10 +259,10 @@ grad_factor = 1.0 ``` The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a -[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops), which needs to -be of type `Model[Ragged, Floats2d]`. This layer determines how the vector for -each spaCy token will be computed from the zero or more source rows the token is -aligned against. Here we use the +[pooling layer](https://thinc.ai/docs/api-layers#reduction-ops) as the argument +`pooling`, which needs to be of type `Model[Ragged, Floats2d]`. This layer +determines how the vector for each spaCy token will be computed from the zero or +more source rows the token is aligned against. Here we use the [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which averages the wordpiece rows. 
We could instead use `reduce_last`, [`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index 237567eb8..b2e72752a 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -36,13 +36,18 @@ const DATA = [ ], }, { - id: 'data', - title: 'Additional data', + id: 'addition', + title: 'Additions', multiple: true, options: [ + { + id: 'transformers', + title: 'Transformers', + help: 'Use transformers like BERT to train your spaCy models', + }, { id: 'lookups', - title: 'Lemmatization', + title: 'Lemmatizer data', help: 'Install additional lookup tables and rules for lemmatization', }, ], @@ -86,13 +91,22 @@ const QuickstartInstall = ({ id, title }) => ( set PYTHONPATH=C:\path\to\spaCy pip install -r requirements.txt - + + pip install -U spacy-lookups-transformers + + + pip install -U spacy-transformers + + + conda install -c conda-forge spacy-transformers + + pip install -U spacy-lookups-data - + pip install -U spacy-lookups-data - + conda install -c conda-forge spacy-lookups-data python setup.py build_ext --inplace From 3449c45fd905393d6e94866d66a00ec8d62f6880 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 Jul 2020 19:48:26 +0200 Subject: [PATCH 54/55] Update docs [ci skip] --- website/docs/usage/training.md | 13 ++++++++++++- website/docs/usage/transformers.md | 7 ++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 948a13086..12785b6de 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -243,7 +243,14 @@ compound = 1.001 ### Using transformer models like BERT {#transformers} - +spaCy v3.0 lets you use almost any statistical model to power your pipeline. You +can use models implemented in a variety of frameworks. A transformer model is +just a statistical model, so the +[`spacy-transformers`](https://github.com/explosion/spacy-transformers) package +actually has very little work to do: it just has to provide a few functions that +do the required plumbing. It also provides a pipeline component, +[`Transformer`](/api/transformer), that lets you do multi-task learning and lets +you save the transformer outputs for later use. @@ -253,6 +260,10 @@ visualize your model. +For more details on how to integrate transformer models into your training +config and customize the implementations, see the usage guide on +[training transformers](/usage/transformers#training). + ### Pretraining with spaCy {#pretraining} diff --git a/website/docs/usage/transformers.md b/website/docs/usage/transformers.md index a7fd83ac6..bab1b82d3 100644 --- a/website/docs/usage/transformers.md +++ b/website/docs/usage/transformers.md @@ -18,8 +18,8 @@ frameworks to be wrapped with a common interface, using our machine learning library [Thinc](https://thinc.ai). A transformer model is just a statistical model, so the [`spacy-transformers`](https://github.com/explosion/spacy-transformers) package -actually has very little work to do: we just have to provide a few functions -that do the required plumbing. We also provide a pipeline component, +actually has very little work to do: it just has to provide a few functions that +do the required plumbing. It also provides a pipeline component, [`Transformer`](/api/transformer), that lets you do multi-task learning and lets you save the transformer outputs for later use. 
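
The "save the transformer outputs for later use" part means the component
keeps its predictions on the `Doc`, so downstream components or your own code
can reuse them without recomputing. A rough sketch of what that looks like,
assuming a pipeline that includes the `Transformer` component (the package
name `en_core_web_trf` is only a placeholder) and that the output is stored
under the `doc._.trf_data` extension:

```python
### Reusing transformer outputs
import spacy

# Placeholder name: any pipeline with the Transformer component will do
nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple shares rose after the announcement.")
# The component stores its output on the Doc, so later components and your
# own code can reuse the activations instead of running the model again
trf_data = doc._.trf_data
print(trf_data.tensors[0].shape)  # hidden-state activations for the wordpieces
```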
@@ -201,7 +201,8 @@ def configure_custom_sent_spans(): To resolve the config during training, spaCy needs to know about your custom function. You can make it available via the `--code` argument that can point to -a Python file: +a Python file. For more details on training with custom code, see the +[training documentation](/usage/training#custom-code). ```bash $ python -m spacy train ./train.spacy ./dev.spacy ./config.cfg --code ./code.py From ca491722ad450d2c88388d5b3a4d70de39b857d5 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 30 Jul 2020 23:30:54 +0200 Subject: [PATCH 55/55] The Parser is now a Pipe (2) (#5844) * moving syntax folder to _parser_internals * moving nn_parser and transition_system * move nn_parser and transition_system out of internals folder * moving nn_parser code into transition_system file * rename transition_system to transition_parser * moving parser_model and _state to ml * move _state back to internals * The Parser now inherits from Pipe! * small code fixes * removing unnecessary imports * remove link_vectors_to_models * transition_system to internals folder * little bit more cleanup * newlines --- bin/ud/ud_train.py | 2 +- examples/training/conllu.py | 2 +- setup.py | 16 +++--- spacy/cli/debug_data.py | 2 +- spacy/gold/example.pyx | 2 +- .../_parser_model.pxd => ml/parser_model.pxd} | 6 +-- .../_parser_model.pyx => ml/parser_model.pyx} | 17 ++----- spacy/ml/tb_framework.py | 2 +- .../_parser_internals}/__init__.py | 0 .../_parser_internals}/_state.pxd | 15 +++--- .../_parser_internals}/_state.pyx | 0 .../_parser_internals}/arc_eager.pxd | 6 +-- .../_parser_internals}/arc_eager.pyx | 19 +++---- .../_parser_internals}/ner.pxd | 2 - .../_parser_internals}/ner.pyx | 15 +++--- .../_parser_internals}/nonproj.pxd | 0 .../_parser_internals}/nonproj.pyx | 4 +- .../_parser_internals}/stateclass.pxd | 8 +-- .../_parser_internals}/stateclass.pyx | 2 +- .../_parser_internals}/transition_system.pxd | 8 +-- .../_parser_internals}/transition_system.pyx | 12 ++--- spacy/pipeline/dep_parser.pyx | 8 +-- spacy/pipeline/entity_linker.py | 12 ++--- spacy/pipeline/multitask.pyx | 7 ++- spacy/pipeline/ner.pyx | 6 +-- .../__init__.pxd => pipeline/nn_parser.pyx} | 0 spacy/pipeline/pipe.pxd | 2 + spacy/pipeline/pipe.pyx | 4 +- spacy/pipeline/sentencizer.pyx | 6 +++ spacy/pipeline/senter.pyx | 2 +- spacy/pipeline/simple_ner.py | 3 -- spacy/pipeline/textcat.py | 8 +-- spacy/pipeline/tok2vec.py | 3 ++ .../transition_parser.pxd} | 15 +++--- .../transition_parser.pyx} | 50 ++++++++----------- spacy/tests/parser/test_arc_eager_oracle.py | 4 +- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/parser/test_neural_parser.py | 4 +- spacy/tests/parser/test_nonproj.py | 6 +-- 39 files changed, 124 insertions(+), 158 deletions(-) rename spacy/{syntax/_parser_model.pxd => ml/parser_model.pxd} (88%) rename spacy/{syntax/_parser_model.pyx => ml/parser_model.pyx} (97%) rename spacy/{syntax => pipeline/_parser_internals}/__init__.py (100%) rename spacy/{syntax => pipeline/_parser_internals}/_state.pxd (98%) rename spacy/{syntax => pipeline/_parser_internals}/_state.pyx (100%) rename spacy/{syntax => pipeline/_parser_internals}/arc_eager.pxd (65%) rename spacy/{syntax => pipeline/_parser_internals}/arc_eager.pyx (98%) rename spacy/{syntax => pipeline/_parser_internals}/ner.pxd (58%) rename spacy/{syntax => pipeline/_parser_internals}/ner.pyx (98%) rename spacy/{syntax => pipeline/_parser_internals}/nonproj.pxd (100%) rename spacy/{syntax => pipeline/_parser_internals}/nonproj.pyx (98%) rename 
spacy/{syntax => pipeline/_parser_internals}/stateclass.pxd (95%) rename spacy/{syntax => pipeline/_parser_internals}/stateclass.pyx (97%) rename spacy/{syntax => pipeline/_parser_internals}/transition_system.pxd (91%) rename spacy/{syntax => pipeline/_parser_internals}/transition_system.pyx (97%) rename spacy/{syntax/__init__.pxd => pipeline/nn_parser.pyx} (100%) create mode 100644 spacy/pipeline/pipe.pxd rename spacy/{syntax/nn_parser.pxd => pipeline/transition_parser.pxd} (62%) rename spacy/{syntax/nn_parser.pyx => pipeline/transition_parser.pyx} (95%) diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py index ac5987aa4..11ad564ec 100644 --- a/bin/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -16,7 +16,7 @@ from bin.ud import conll17_ud_eval from spacy.tokens import Token, Doc from spacy.gold import Example from spacy.util import compounding, minibatch, minibatch_by_words -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.matcher import Matcher from spacy import displacy from collections import defaultdict diff --git a/examples/training/conllu.py b/examples/training/conllu.py index ecc07ccf2..a398b0ae0 100644 --- a/examples/training/conllu.py +++ b/examples/training/conllu.py @@ -13,7 +13,7 @@ import spacy import spacy.util from spacy.tokens import Token, Doc from spacy.gold import Example -from spacy.syntax.nonproj import projectivize +from spacy.pipeline._parser_internals.nonproj import projectivize from collections import defaultdict from spacy.matcher import Matcher diff --git a/setup.py b/setup.py index 6d962ab59..af4cd0ec6 100755 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.kb", + "spacy.ml.parser_model", "spacy.morphology", "spacy.pipeline.dep_parser", "spacy.pipeline.morphologizer", @@ -40,14 +41,14 @@ MOD_NAMES = [ "spacy.pipeline.sentencizer", "spacy.pipeline.senter", "spacy.pipeline.tagger", - "spacy.syntax.stateclass", - "spacy.syntax._state", + "spacy.pipeline.transition_parser", + "spacy.pipeline._parser_internals.arc_eager", + "spacy.pipeline._parser_internals.ner", + "spacy.pipeline._parser_internals.nonproj", + "spacy.pipeline._parser_internals._state", + "spacy.pipeline._parser_internals.stateclass", + "spacy.pipeline._parser_internals.transition_system", "spacy.tokenizer", - "spacy.syntax.nn_parser", - "spacy.syntax._parser_model", - "spacy.syntax.nonproj", - "spacy.syntax.transition_system", - "spacy.syntax.arc_eager", "spacy.gold.gold_io", "spacy.tokens.doc", "spacy.tokens.span", @@ -57,7 +58,6 @@ MOD_NAMES = [ "spacy.matcher.matcher", "spacy.matcher.phrasematcher", "spacy.matcher.dependencymatcher", - "spacy.syntax.ner", "spacy.symbols", "spacy.vectors", ] diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 1ffceeca1..fa6f7a7d5 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -10,7 +10,7 @@ from thinc.api import Config from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli from ..gold import Corpus, Example -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj from ..language import Language from .. 
import util diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 9101cefce..84d9f1622 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -10,7 +10,7 @@ from .align import Alignment from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc from .iob_utils import spans_from_biluo_tags from ..errors import Errors, Warnings -from ..syntax import nonproj +from ..pipeline._parser_internals import nonproj cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): diff --git a/spacy/syntax/_parser_model.pxd b/spacy/ml/parser_model.pxd similarity index 88% rename from spacy/syntax/_parser_model.pxd rename to spacy/ml/parser_model.pxd index 15befb372..6582b3468 100644 --- a/spacy/syntax/_parser_model.pxd +++ b/spacy/ml/parser_model.pxd @@ -1,8 +1,6 @@ from libc.string cimport memset, memcpy -from libc.stdlib cimport calloc, free, realloc -from ..typedefs cimport weight_t, class_t, hash_t - -from ._state cimport StateC +from ..typedefs cimport weight_t, hash_t +from ..pipeline._parser_internals._state cimport StateC cdef struct SizesC: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/ml/parser_model.pyx similarity index 97% rename from spacy/syntax/_parser_model.pyx rename to spacy/ml/parser_model.pyx index eedd84bac..da937ca4f 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/ml/parser_model.pyx @@ -1,29 +1,18 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel cimport numpy as np from libc.math cimport exp -from libcpp.vector cimport vector from libc.string cimport memset, memcpy from libc.stdlib cimport calloc, free, realloc -from cymem.cymem cimport Pool -from thinc.extra.search cimport Beam from thinc.backends.linalg cimport Vec, VecVec cimport blis.cy import numpy import numpy.random -from thinc.api import Linear, Model, CupyOps, NumpyOps, use_ops, noop +from thinc.api import Model, CupyOps, NumpyOps -from ..typedefs cimport weight_t, class_t, hash_t -from ..tokens.doc cimport Doc -from .stateclass cimport StateClass -from .transition_system cimport Transition - -from ..compat import copy_array -from ..errors import Errors, TempErrors -from ..util import create_default_optimizer from .. import util -from . 
import nonproj +from ..typedefs cimport weight_t, class_t, hash_t +from ..pipeline._parser_internals.stateclass cimport StateClass cdef WeightsC get_c_weights(model) except *: diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py index 39d4b0a14..44f125a04 100644 --- a/spacy/ml/tb_framework.py +++ b/spacy/ml/tb_framework.py @@ -1,5 +1,5 @@ from thinc.api import Model, noop, use_ops, Linear -from ..syntax._parser_model import ParserStepModel +from .parser_model import ParserStepModel def TransitionModel(tok2vec, lower, upper, dropout=0.2, unseen_classes=set()): diff --git a/spacy/syntax/__init__.py b/spacy/pipeline/_parser_internals/__init__.py similarity index 100% rename from spacy/syntax/__init__.py rename to spacy/pipeline/_parser_internals/__init__.py diff --git a/spacy/syntax/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd similarity index 98% rename from spacy/syntax/_state.pxd rename to spacy/pipeline/_parser_internals/_state.pxd index fef4f0c92..0d0dd8c05 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -1,15 +1,14 @@ -from libc.string cimport memcpy, memset, memmove -from libc.stdlib cimport malloc, calloc, free +from libc.string cimport memcpy, memset +from libc.stdlib cimport calloc, free from libc.stdint cimport uint32_t, uint64_t from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from murmurhash.mrmr cimport hash64 -from ..vocab cimport EMPTY_LEXEME -from ..structs cimport TokenC, SpanC -from ..lexeme cimport Lexeme -from ..symbols cimport punct -from ..attrs cimport IS_SPACE -from ..typedefs cimport attr_t +from ...vocab cimport EMPTY_LEXEME +from ...structs cimport TokenC, SpanC +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...typedefs cimport attr_t cdef inline bint is_space_token(const TokenC* token) nogil: diff --git a/spacy/syntax/_state.pyx b/spacy/pipeline/_parser_internals/_state.pyx similarity index 100% rename from spacy/syntax/_state.pyx rename to spacy/pipeline/_parser_internals/_state.pyx diff --git a/spacy/syntax/arc_eager.pxd b/spacy/pipeline/_parser_internals/arc_eager.pxd similarity index 65% rename from spacy/syntax/arc_eager.pxd rename to spacy/pipeline/_parser_internals/arc_eager.pxd index a59be716a..e05a34f56 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/pipeline/_parser_internals/arc_eager.pxd @@ -1,8 +1,6 @@ -from cymem.cymem cimport Pool - from .stateclass cimport StateClass -from ..typedefs cimport weight_t, attr_t -from .transition_system cimport TransitionSystem, Transition +from ...typedefs cimport weight_t, attr_t +from .transition_system cimport Transition, TransitionSystem cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx similarity index 98% rename from spacy/syntax/arc_eager.pyx rename to spacy/pipeline/_parser_internals/arc_eager.pyx index 6e63859f0..7db8aae0f 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -1,24 +1,17 @@ # cython: profile=True, cdivision=True, infer_types=True -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool, Address from libc.stdint cimport int32_t from collections import defaultdict, Counter -import json -from ..typedefs cimport hash_t, attr_t -from ..strings cimport hash_string -from ..structs cimport TokenC -from ..tokens.doc cimport Doc, set_children_from_heads +from ...typedefs cimport hash_t, attr_t +from ...strings cimport hash_string +from ...structs cimport TokenC +from 
...tokens.doc cimport Doc, set_children_from_heads +from ...gold.example cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport move_cost_func_t, label_cost_func_t -from ..gold.example cimport Example - -from ..errors import Errors -from .nonproj import is_nonproj_tree -from . import nonproj - # Calculate cost as gold/not gold. We don't use scalar value anyway. cdef int BINARY_COSTS = 1 diff --git a/spacy/syntax/ner.pxd b/spacy/pipeline/_parser_internals/ner.pxd similarity index 58% rename from spacy/syntax/ner.pxd rename to spacy/pipeline/_parser_internals/ner.pxd index 989593a92..2264a1518 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/pipeline/_parser_internals/ner.pxd @@ -1,6 +1,4 @@ from .transition_system cimport TransitionSystem -from .transition_system cimport Transition -from ..typedefs cimport attr_t cdef class BiluoPushDown(TransitionSystem): diff --git a/spacy/syntax/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx similarity index 98% rename from spacy/syntax/ner.pyx rename to spacy/pipeline/_parser_internals/ner.pyx index c4125bbdf..2570ccdee 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/pipeline/_parser_internals/ner.pyx @@ -2,17 +2,14 @@ from collections import Counter from libc.stdint cimport int32_t from cymem.cymem cimport Pool -from ..typedefs cimport weight_t +from ...typedefs cimport weight_t, attr_t +from ...lexeme cimport Lexeme +from ...attrs cimport IS_SPACE +from ...gold.example cimport Example +from ...errors import Errors from .stateclass cimport StateClass from ._state cimport StateC -from .transition_system cimport Transition -from .transition_system cimport do_func_t -from ..lexeme cimport Lexeme -from ..attrs cimport IS_SPACE -from ..gold.iob_utils import biluo_tags_from_offsets -from ..gold.example cimport Example - -from ..errors import Errors +from .transition_system cimport Transition, do_func_t cdef enum: diff --git a/spacy/syntax/nonproj.pxd b/spacy/pipeline/_parser_internals/nonproj.pxd similarity index 100% rename from spacy/syntax/nonproj.pxd rename to spacy/pipeline/_parser_internals/nonproj.pxd diff --git a/spacy/syntax/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx similarity index 98% rename from spacy/syntax/nonproj.pyx rename to spacy/pipeline/_parser_internals/nonproj.pyx index 5ccb11f37..8f5fdaa71 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -5,9 +5,9 @@ scheme. 
""" from copy import copy -from ..tokens.doc cimport Doc, set_children_from_heads +from ...tokens.doc cimport Doc, set_children_from_heads -from ..errors import Errors +from ...errors import Errors DELIMITER = '||' diff --git a/spacy/syntax/stateclass.pxd b/spacy/pipeline/_parser_internals/stateclass.pxd similarity index 95% rename from spacy/syntax/stateclass.pxd rename to spacy/pipeline/_parser_internals/stateclass.pxd index 567982a3f..1d9f05538 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/pipeline/_parser_internals/stateclass.pxd @@ -1,12 +1,8 @@ -from libc.string cimport memcpy, memset - from cymem.cymem cimport Pool -cimport cython -from ..structs cimport TokenC, SpanC -from ..typedefs cimport attr_t +from ...structs cimport TokenC, SpanC +from ...typedefs cimport attr_t -from ..vocab cimport EMPTY_LEXEME from ._state cimport StateC diff --git a/spacy/syntax/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx similarity index 97% rename from spacy/syntax/stateclass.pyx rename to spacy/pipeline/_parser_internals/stateclass.pyx index e472e9861..880cf6cc5 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True import numpy -from ..tokens.doc cimport Doc +from ...tokens.doc cimport Doc cdef class StateClass: diff --git a/spacy/syntax/transition_system.pxd b/spacy/pipeline/_parser_internals/transition_system.pxd similarity index 91% rename from spacy/syntax/transition_system.pxd rename to spacy/pipeline/_parser_internals/transition_system.pxd index 836c08168..ba4c33814 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/pipeline/_parser_internals/transition_system.pxd @@ -1,11 +1,11 @@ from cymem.cymem cimport Pool -from ..typedefs cimport attr_t, weight_t -from ..structs cimport TokenC -from ..strings cimport StringStore +from ...typedefs cimport attr_t, weight_t +from ...structs cimport TokenC +from ...strings cimport StringStore +from ...gold.example cimport Example from .stateclass cimport StateClass from ._state cimport StateC -from ..gold.example cimport Example cdef struct Transition: diff --git a/spacy/syntax/transition_system.pyx b/spacy/pipeline/_parser_internals/transition_system.pyx similarity index 97% rename from spacy/syntax/transition_system.pyx rename to spacy/pipeline/_parser_internals/transition_system.pyx index 17166dcf5..7694e7f34 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/pipeline/_parser_internals/transition_system.pyx @@ -1,19 +1,17 @@ # cython: infer_types=True from __future__ import print_function -from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from collections import Counter import srsly -from ..typedefs cimport weight_t -from ..tokens.doc cimport Doc -from ..structs cimport TokenC +from ...typedefs cimport weight_t, attr_t +from ...tokens.doc cimport Doc +from ...structs cimport TokenC from .stateclass cimport StateClass -from ..typedefs cimport attr_t -from ..errors import Errors -from .. import util +from ...errors import Errors +from ... 
import util cdef weight_t MIN_SCORE = -90000 diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx index a952385b4..65ffbbe50 100644 --- a/spacy/pipeline/dep_parser.pyx +++ b/spacy/pipeline/dep_parser.pyx @@ -1,13 +1,13 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.arc_eager cimport ArcEager +from .transition_parser cimport Parser +from ._parser_internals.arc_eager cimport ArcEager from .functions import merge_subtokens from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..scorer import Scorer diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index cc4e7b159..742b349e5 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -222,9 +222,9 @@ class EntityLinker(Pipe): set_dropout_rate(self.model, drop) if not sentence_docs: warnings.warn(Warnings.W093.format(name="Entity Linker")) - return 0.0 + return losses sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss( + loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) bp_context(d_scores) @@ -235,7 +235,7 @@ class EntityLinker(Pipe): self.set_annotations(docs, predictions) return losses - def get_similarity_loss(self, examples: Iterable[Example], sentence_encodings): + def get_loss(self, examples: Iterable[Example], sentence_encodings): entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) @@ -247,7 +247,7 @@ class EntityLinker(Pipe): entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") if sentence_encodings.shape != entity_encodings.shape: err = Errors.E147.format( - method="get_similarity_loss", msg="gold entities do not match up" + method="get_loss", msg="gold entities do not match up" ) raise RuntimeError(err) gradients = self.distance.get_grad(sentence_encodings, entity_encodings) @@ -337,13 +337,13 @@ class EntityLinker(Pipe): final_kb_ids.append(candidates[0].entity_) else: random.shuffle(candidates) - # this will set all prior probabilities to 0 if they should be excluded from the model + # set all prior probabilities to 0 if incl_prior=False prior_probs = xp.asarray( [c.prior_prob for c in candidates] ) if not self.cfg.get("incl_prior"): prior_probs = xp.asarray( - [0.0 for c in candidates] + [0.0 for _ in candidates] ) scores = prior_probs # add in similarity from the context diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index 97826aaa6..d85030adb 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -1,7 +1,7 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional import numpy -from thinc.api import CosineDistance, to_categorical, to_categorical, Model, Config +from thinc.api import CosineDistance, to_categorical, Model, Config from thinc.api import set_dropout_rate from ..tokens.doc cimport Doc @@ -9,7 +9,7 @@ from ..tokens.doc cimport Doc from .pipe import Pipe from .tagger import Tagger from ..language import Language -from ..syntax import nonproj +from ._parser_internals import nonproj from ..attrs import POS, ID from ..errors import Errors @@ -219,3 +219,6 @@ class ClozeMultitask(Pipe): if losses is not None: losses[self.name] 
+= loss + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index 7ee4448fb..7f4fb8363 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -1,9 +1,9 @@ # cython: infer_types=True, profile=True, binding=True from typing import Optional, Iterable -from thinc.api import CosineDistance, to_categorical, get_array_module, Model, Config +from thinc.api import Model, Config -from ..syntax.nn_parser cimport Parser -from ..syntax.ner cimport BiluoPushDown +from .transition_parser cimport Parser +from ._parser_internals.ner cimport BiluoPushDown from ..language import Language from ..scorer import Scorer diff --git a/spacy/syntax/__init__.pxd b/spacy/pipeline/nn_parser.pyx similarity index 100% rename from spacy/syntax/__init__.pxd rename to spacy/pipeline/nn_parser.pyx diff --git a/spacy/pipeline/pipe.pxd b/spacy/pipeline/pipe.pxd new file mode 100644 index 000000000..bb97f79d0 --- /dev/null +++ b/spacy/pipeline/pipe.pxd @@ -0,0 +1,2 @@ +cdef class Pipe: + cdef public str name diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 196cdebdc..1a94905a2 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -8,7 +8,7 @@ from ..errors import Errors from .. import util -class Pipe: +cdef class Pipe: """This class is a base class and not instantiated directly. Trainable pipeline components like the EntityRecognizer or TextCategorizer inherit from it and it defines the interface that components should follow to @@ -17,8 +17,6 @@ class Pipe: DOCS: https://spacy.io/api/pipe """ - name = None - def __init__(self, vocab, model, name, **cfg): """Initialize a pipeline component. diff --git a/spacy/pipeline/sentencizer.pyx b/spacy/pipeline/sentencizer.pyx index 31208ea2c..be4351212 100644 --- a/spacy/pipeline/sentencizer.pyx +++ b/spacy/pipeline/sentencizer.pyx @@ -203,3 +203,9 @@ class Sentencizer(Pipe): cfg = srsly.read_json(path) self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self + + def get_loss(self, examples, scores): + raise NotImplementedError + + def add_label(self, label): + raise NotImplementedError diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index c6eb43661..f826f21de 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -109,7 +109,7 @@ class SentenceRecognizer(Tagger): for eg in examples: eg_truth = [] for x in eg.get_aligned("sent_start"): - if x == None: + if x is None: eg_truth.append(None) elif x == 1: eg_truth.append(labels[1]) diff --git a/spacy/pipeline/simple_ner.py b/spacy/pipeline/simple_ner.py index 9b9872b77..44e1182c1 100644 --- a/spacy/pipeline/simple_ner.py +++ b/spacy/pipeline/simple_ner.py @@ -131,8 +131,6 @@ class SimpleNER(Pipe): return losses def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]: - loss = 0 - d_scores = [] truths = [] for eg in examples: tags = eg.get_aligned("TAG", as_string=True) @@ -159,7 +157,6 @@ class SimpleNER(Pipe): if not hasattr(get_examples, "__call__"): gold_tuples = get_examples get_examples = lambda: gold_tuples - labels = _get_labels(get_examples()) for label in _get_labels(get_examples()): self.add_label(label) labels = self.labels diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 2c399defc..639ce5514 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -238,8 +238,11 @@ class TextCategorizer(Pipe): DOCS: https://spacy.io/api/textcategorizer#rehearse """ + + if losses is not None: + 
losses.setdefault(self.name, 0.0) if self._rehearsal_model is None: - return + return losses try: docs = [eg.predicted for eg in examples] except AttributeError: @@ -250,7 +253,7 @@ class TextCategorizer(Pipe): raise TypeError(err) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. - return + return losses set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update(docs) target = self._rehearsal_model(examples) @@ -259,7 +262,6 @@ class TextCategorizer(Pipe): if sgd is not None: self.model.finish_update(sgd) if losses is not None: - losses.setdefault(self.name, 0.0) losses[self.name] += (gradient ** 2).sum() return losses diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index b147cf177..31643a7d3 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -199,6 +199,9 @@ class Tok2Vec(Pipe): docs = [Doc(self.vocab, words=["hello"])] self.model.initialize(X=docs) + def add_label(self, label): + raise NotImplementedError + class Tok2VecListener(Model): """A layer that gets fed its answers from an upstream connection, diff --git a/spacy/syntax/nn_parser.pxd b/spacy/pipeline/transition_parser.pxd similarity index 62% rename from spacy/syntax/nn_parser.pxd rename to spacy/pipeline/transition_parser.pxd index 7840ec27a..e594a3098 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/pipeline/transition_parser.pxd @@ -1,16 +1,15 @@ -from .stateclass cimport StateClass -from .arc_eager cimport TransitionSystem +from cymem.cymem cimport Pool + from ..vocab cimport Vocab -from ..tokens.doc cimport Doc -from ..structs cimport TokenC -from ._state cimport StateC -from ._parser_model cimport WeightsC, ActivationsC, SizesC +from .pipe cimport Pipe +from ._parser_internals.transition_system cimport Transition, TransitionSystem +from ._parser_internals._state cimport StateC +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC -cdef class Parser: +cdef class Parser(Pipe): cdef readonly Vocab vocab cdef public object model - cdef public str name cdef public object _rehearsal_model cdef readonly TransitionSystem moves cdef readonly object cfg diff --git a/spacy/syntax/nn_parser.pyx b/spacy/pipeline/transition_parser.pyx similarity index 95% rename from spacy/syntax/nn_parser.pyx rename to spacy/pipeline/transition_parser.pyx index a0ee13a0a..b14a55cb4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -1,42 +1,32 @@ # cython: infer_types=True, cdivision=True, boundscheck=False -cimport cython.parallel +from __future__ import print_function +from cymem.cymem cimport Pool cimport numpy as np from itertools import islice -from cpython.ref cimport PyObject, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno -from libc.math cimport exp from libcpp.vector cimport vector -from libc.string cimport memset, memcpy +from libc.string cimport memset from libc.stdlib cimport calloc, free -from cymem.cymem cimport Pool -from thinc.backends.linalg cimport Vec, VecVec -from thinc.api import chain, clone, Linear, list2array, NumpyOps, CupyOps, use_ops -from thinc.api import get_array_module, zero_init, set_dropout_rate -from itertools import islice import srsly + +from ._parser_internals.stateclass cimport StateClass +from ..ml.parser_model cimport alloc_activations, free_activations +from ..ml.parser_model cimport predict_states, arg_max_if_valid +from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss +from ..ml.parser_model cimport 
get_c_weights, get_c_sizes + +from ..tokens.doc cimport Doc +from ..errors import Errors, Warnings +from .. import util +from ..util import create_default_optimizer + +from thinc.api import set_dropout_rate import numpy.random import numpy import warnings -from ..tokens.doc cimport Doc -from ..typedefs cimport weight_t, class_t, hash_t -from ._parser_model cimport alloc_activations, free_activations -from ._parser_model cimport predict_states, arg_max_if_valid -from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss -from ._parser_model cimport get_c_weights, get_c_sizes -from .stateclass cimport StateClass -from ._state cimport StateC -from .transition_system cimport Transition -from ..util import create_default_optimizer, registry -from ..compat import copy_array -from ..errors import Errors, Warnings -from .. import util -from . import nonproj - - -cdef class Parser: +cdef class Parser(Pipe): """ Base class of the DependencyParser and EntityRecognizer. """ @@ -107,7 +97,7 @@ cdef class Parser: @property def tok2vec(self): - '''Return the embedding and convolutional layer of the model.''' + """Return the embedding and convolutional layer of the model.""" return self.model.get_ref("tok2vec") @property @@ -138,13 +128,13 @@ cdef class Parser: raise NotImplementedError def init_multitask_objectives(self, get_examples, pipeline, **cfg): - '''Setup models for secondary objectives, to benefit from multi-task + """Setup models for secondary objectives, to benefit from multi-task learning. This method is intended to be overridden by subclasses. For instance, the dependency parser can benefit from sharing an input representation with a label prediction model. These auxiliary models are discarded after training. - ''' + """ pass def use_params(self, params): diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 77e142215..fd1880030 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -4,8 +4,8 @@ from spacy import registry from spacy.gold import Example from spacy.pipeline import DependencyParser from spacy.tokens import Doc -from spacy.syntax.nonproj import projectivize -from spacy.syntax.arc_eager import ArcEager +from spacy.pipeline._parser_internals.nonproj import projectivize +from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 4a6bf73a5..013ae6b7e 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -5,7 +5,7 @@ from spacy.lang.en import English from spacy.language import Language from spacy.lookups import Lookups -from spacy.syntax.ner import BiluoPushDown +from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.gold import Example from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index feae52f7f..6594c7e78 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -3,8 +3,8 @@ import pytest from spacy import registry from spacy.gold import Example from spacy.vocab import Vocab -from spacy.syntax.arc_eager import ArcEager -from spacy.syntax.nn_parser import Parser +from spacy.pipeline._parser_internals.arc_eager import ArcEager +from spacy.pipeline.transition_parser import Parser from spacy.tokens.doc import Doc from 
thinc.api import Model from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 496ec7e03..5bdebd0ca 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,7 +1,7 @@ import pytest -from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc -from spacy.syntax.nonproj import is_nonproj_tree -from spacy.syntax import nonproj +from spacy.pipeline._parser_internals.nonproj import ancestors, contains_cycle, is_nonproj_arc +from spacy.pipeline._parser_internals.nonproj import is_nonproj_tree +from spacy.pipeline._parser_internals import nonproj from ..util import get_doc
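
For downstream code, the import moves in the test files above double as a
migration guide: everything that used to live under `spacy.syntax` is now
found under `spacy.pipeline`. A small before-and-after sketch based on the
paths touched in this patch:

```python
# Old locations, removed by this patch:
#   from spacy.syntax.nonproj import projectivize
#   from spacy.syntax.arc_eager import ArcEager
#   from spacy.syntax.nn_parser import Parser

# New locations introduced by this patch:
from spacy.pipeline._parser_internals.nonproj import projectivize
from spacy.pipeline._parser_internals.arc_eager import ArcEager
from spacy.pipeline.transition_parser import Parser
```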