From 251b3eb4e5c688e076f4e761a43ffbab9ea793b9 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 5 Oct 2020 14:59:13 +0200
Subject: [PATCH 01/18] add initialize method for entity_ruler

---
 spacy/errors.py               |  2 ++
 spacy/pipeline/entityruler.py | 30 +++++++++++++++++++++++++++++-
 spacy/training/initialize.py  |  2 +-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 20edf45b5..18abb6bba 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -456,6 +456,8 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E900 = ("Patterns for component '{name}' not initialized. This can be fixed "
+            "by calling 'add_patterns' or 'initialize'.")
     E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. "
             "Try checking whitespace and delimiters. See "
             "https://nightly.spacy.io/api/cli#convert")
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 9166a69b8..a4bc098fb 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,7 +1,8 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
 from collections import defaultdict
 from pathlib import Path
 import srsly
+from spacy.training import Example
 
 from ..language import Language
 from ..errors import Errors
@@ -133,6 +134,7 @@
 
         DOCS: https://nightly.spacy.io/api/entityruler#call
         """
+        self._require_patterns()
         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -183,6 +185,27 @@ class EntityRuler:
                 all_labels.add(l)
         return tuple(all_labels)
 
+    def initialize(
+        self,
+        get_examples: Callable[[], Iterable[Example]],
+        *,
+        nlp: Optional[Language] = None,
+        patterns_path: Optional[Path] = None
+    ):
+        """Initialize the pipe for training.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        patterns_path: Path to serialized patterns. 
+ + DOCS (TODO): https://nightly.spacy.io/api/entityruler#initialize + """ + if patterns_path: + patterns = srsly.read_jsonl(patterns_path) + self.add_patterns(patterns) + + @property def ent_ids(self) -> Tuple[str, ...]: """All entity ids present in the match patterns `id` properties @@ -292,6 +315,11 @@ class EntityRuler: self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) + def _require_patterns(self) -> None: + """Raise an error if the component has no patterns.""" + if not self.patterns or list(self.patterns) == [""]: + raise ValueError(Errors.E900.format(name=self.name)) + def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index bbdf4f62b..7c84caf95 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -49,7 +49,7 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": nlp.resume_training(sgd=optimizer) with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) - logger.info("Initialized pipeline components") + logger.info(f"Initialized pipeline components: {nlp.pipe_names}") return nlp From 65abd777796b6850117180dc90399c5fb7f02ce3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 16:23:33 +0200 Subject: [PATCH 02/18] add finish_update to Pipe --- spacy/language.py | 2 +- spacy/pipeline/entity_linker.py | 2 +- spacy/pipeline/multitask.pyx | 2 +- spacy/pipeline/pipe.pyx | 13 ++++++++++++- spacy/pipeline/tagger.pyx | 4 ++-- spacy/pipeline/textcat.py | 4 ++-- spacy/pipeline/tok2vec.py | 2 +- spacy/pipeline/transition_parser.pyx | 4 ++-- website/docs/api/pipe.md | 18 ++++++++++++++++++ 9 files changed, 40 insertions(+), 11 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9fdde03d5..be5886efa 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1094,7 +1094,7 @@ class Language: and hasattr(proc, "model") and proc.model not in (True, False, None) ): - proc.model.finish_update(sgd) + proc.finish_update(sgd) return losses def rehearse( diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index b67a15d32..2a5f3962d 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -238,7 +238,7 @@ class EntityLinker(Pipe): ) bp_context(d_scores) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss if set_annotations: self.set_annotations(docs, predictions) diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index ba351f16e..fa304b842 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -209,7 +209,7 @@ class ClozeMultitask(Pipe): loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) if losses is not None: losses[self.name] += loss return losses diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 41ca23ace..585cdc780 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -132,7 +132,7 @@ cdef class Pipe: loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd not in (None, False): - self.model.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss if set_annotations: docs = [eg.predicted for eg in examples] @@ -245,6 +245,17 @@ 
cdef class Pipe: with self.model.use_params(params): yield + def finish_update(self, sgd): + """Update parameters using the current parameter gradients. + The Optimizer instance contains the functionality to perform + the stochastic gradient descent. + + sgd (thinc.api.Optimizer): The optimizer. + + DOCS: https://nightly.spacy.io/api/pipe#finish_update + """ + self.model.finish_update(sgd) + def score(self, examples, **kwargs): """Score a batch of examples. diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 6cb582b36..5122e8ea9 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -203,7 +203,7 @@ class Tagger(Pipe): loss, d_tag_scores = self.get_loss(examples, tag_scores) bp_tag_scores(d_tag_scores) if sgd not in (None, False): - self.model.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss if set_annotations: @@ -238,7 +238,7 @@ class Tagger(Pipe): target = self._rehearsal_model(examples) gradient = guesses - target backprop(gradient) - self.model.finish_update(sgd) + self.finish_update(sgd) if losses is not None: losses.setdefault(self.name, 0.0) losses[self.name] += (gradient**2).sum() diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index fc60ebf89..a37212e9e 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -212,7 +212,7 @@ class TextCategorizer(Pipe): loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss if set_annotations: docs = [eg.predicted for eg in examples] @@ -256,7 +256,7 @@ class TextCategorizer(Pipe): gradient = scores - target bp_scores(gradient) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) if losses is not None: losses[self.name] += (gradient ** 2).sum() return losses diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 89f9df757..0f309326e 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -188,7 +188,7 @@ class Tok2Vec(Pipe): accumulate_gradient(one_d_tokvecs) d_docs = bp_tokvecs(d_tokvecs) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) return d_docs batch_id = Tok2VecListener.get_batch_id(docs) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index bcaa8e8d4..2ad0acd3a 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -315,7 +315,7 @@ cdef class Parser(Pipe): backprop_tok2vec(golds) if sgd not in (None, False): - self.model.finish_update(sgd) + self.finish_update(sgd) if set_annotations: docs = [eg.predicted for eg in examples] self.set_annotations(docs, all_states) @@ -367,7 +367,7 @@ cdef class Parser(Pipe): # Do the backprop backprop_tok2vec(docs) if sgd is not None: - self.model.finish_update(sgd) + self.finish_update(sgd) losses[self.name] += loss / n_scores del backprop del backprop_tok2vec diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md index de35f9eb4..b98768dcf 100644 --- a/website/docs/api/pipe.md +++ b/website/docs/api/pipe.md @@ -294,6 +294,24 @@ context, the original parameters are restored. | -------- | -------------------------------------------------- | | `params` | The parameter values to use in the model. ~~dict~~ | +## Pipe.finish_update {#finish_update tag="method"} + +Update parameters using the current parameter gradients. Defaults to calling +[`self.model.finish_update`](https://thinc.ai/docs/api-model#finish_update). 
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.initialize()
+> losses = pipe.update(examples, sgd=None)
+> pipe.finish_update(optimizer)
+> ```
+
+| Name  | Description                           |
+| ----- | ------------------------------------- |
+| `sgd` | An optimizer. ~~Optional[Optimizer]~~ |
+
 ## Pipe.add_label {#add_label tag="method"}
 
 > #### Example

From dc06912c764991d2d6718919e5e96cae867a472d Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 5 Oct 2020 16:33:28 +0200
Subject: [PATCH 03/18] prevent loss keyerror for non-trainable components

---
 spacy/training/loggers.py | 17 ++++------------
 spacy/training/loop.py    |  5 +++--
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index f0ca7064a..467f1e36b 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -41,19 +41,10 @@ def console_logger(progress_bar: bool = False):
             if progress is not None:
                 progress.update(1)
             return
-        try:
-            losses = [
-                "{0:.2f}".format(float(info["losses"][pipe_name]))
-                for pipe_name in logged_pipes
-            ]
-        except KeyError as e:
-            raise KeyError(
-                Errors.E983.format(
-                    dict="scores (losses)",
-                    key=str(e),
-                    keys=list(info["losses"].keys()),
-                )
-            ) from None
+        losses = [
+            "{0:.2f}".format(float(info["losses"][pipe_name]))
+            for pipe_name in logged_pipes if pipe_name in info["losses"]
+        ]
 
         scores = []
         for col in score_cols:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 0d4414964..8f0aea6d4 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -184,7 +184,7 @@ def train_while_improving(
                 and hasattr(proc, "model")
                 and proc.model not in (True, False, None)
             ):
-                proc.model.finish_update(optimizer)
+                proc.finish_update(optimizer)
         optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:
@@ -287,7 +287,8 @@ def update_meta(
     if metric is not None:
         nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
     for pipe_name in nlp.pipe_names:
-        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
+        if pipe_name in info["losses"]:
+            nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]
 
 
 def create_before_to_disk_callback(

From 4e3ace4b8c32b1b8806874e2c3120989f9ddaba9 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 5 Oct 2020 17:43:42 +0200
Subject: [PATCH 04/18] is_trainable method

---
 spacy/language.py             |  7 +++++--
 spacy/pipeline/entityruler.py | 17 +++++++++--------
 spacy/pipeline/pipe.pyx       |  3 +++
 spacy/training/loggers.py     | 10 +++++++---
 spacy/training/loop.py        |  3 ++-
 5 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index be5886efa..c3c49d331 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1091,7 +1091,8 @@ class Language:
         for name, proc in self.pipeline:
             if (
                 name not in exclude
-                and hasattr(proc, "model")
+                and hasattr(proc, "is_trainable")
+                and proc.is_trainable()
                 and proc.model not in (True, False, None)
             ):
                 proc.finish_update(sgd)
@@ -1297,7 +1298,9 @@ class Language:
        for name, pipe in self.pipeline:
            kwargs = component_cfg.get(name, {})
            kwargs.setdefault("batch_size", batch_size)
-            if not hasattr(pipe, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if not hasattr(pipe, "pipe") or not hasattr(pipe, "is_trainable") or not pipe.is_trainable():
                docs = _pipe(docs, pipe, kwargs)
            else:
                docs = pipe.pipe(docs, **kwargs)
diff --git a/spacy/pipeline/entityruler.py 
b/spacy/pipeline/entityruler.py index a4bc098fb..e89dd8410 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -2,8 +2,9 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable from collections import defaultdict from pathlib import Path import srsly -from spacy.training import Example +from .pipe import Pipe +from ..training import Example from ..language import Language from ..errors import Errors from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList @@ -51,7 +52,7 @@ def make_entity_ruler( ) -class EntityRuler: +class EntityRuler(Pipe): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical `EntityRecognizer` to boost accuracy, or used on its own to implement a @@ -134,7 +135,6 @@ class EntityRuler: DOCS: https://nightly.spacy.io/api/entityruler#call """ - self._require_patterns() matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] @@ -315,11 +315,6 @@ class EntityRuler: self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) - def _require_patterns(self) -> None: - """Raise an error if the component has no patterns.""" - if not self.patterns or list(self.patterns) == [""]: - raise ValueError(Errors.E900.format(name=self.name)) - def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep @@ -348,6 +343,12 @@ class EntityRuler: validate_examples(examples, "EntityRuler.score") return Scorer.score_spans(examples, "ents", **kwargs) + def predict(self, docs): + pass + + def set_annotations(self, docs, scores): + pass + def from_bytes( self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() ) -> "EntityRuler": diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 585cdc780..70cc1e54e 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -228,6 +228,9 @@ cdef class Pipe: def is_resizable(self): return hasattr(self, "model") and "resize_output" in self.model.attrs + def is_trainable(self): + return hasattr(self, "model") and isinstance(self.model, Model) + def set_output(self, nO): if self.is_resizable(): self.model.attrs["resize_output"](self.model, nO) diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 467f1e36b..3a133a0df 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -17,8 +17,12 @@ def console_logger(progress_bar: bool = False): nlp: "Language", stdout: IO = sys.stdout, stderr: IO = sys.stderr ) -> Tuple[Callable[[Optional[Dict[str, Any]]], None], Callable[[], None]]: msg = Printer(no_print=True) - # we assume here that only components are enabled that should be trained & logged - logged_pipes = nlp.pipe_names + # ensure that only trainable components are logged + logged_pipes = [ + name + for name, proc in nlp.pipeline + if hasattr(proc, "is_trainable") and proc.is_trainable() + ] eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] @@ -43,7 +47,7 @@ def console_logger(progress_bar: bool = False): return losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) - for pipe_name in logged_pipes if pipe_name in info["losses"] + for pipe_name in logged_pipes ] scores = [] diff --git 
a/spacy/training/loop.py b/spacy/training/loop.py index 8f0aea6d4..12395e0b4 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -181,7 +181,8 @@ def train_while_improving( for name, proc in nlp.pipeline: if ( name not in exclude - and hasattr(proc, "model") + and hasattr(proc, "is_trainable") + and proc.is_trainable() and proc.model not in (True, False, None) ): proc.finish_update(optimizer) From 3ac3447eee4b417ab257068bb894474bd6d6c059 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 17:50:37 +0200 Subject: [PATCH 05/18] cleanup --- spacy/errors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 59da84890..9d9a716d2 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -456,8 +456,6 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master - E900 = ("Patterns for component '{name}' not initialized. This can be fixed " - "by calling 'add_patterns' or 'initialize'.") E902 = ("The sentence-per-line IOB/IOB2 file is not formatted correctly. " "Try checking whitespace and delimiters. See " "https://nightly.spacy.io/api/cli#convert") From 193e0d5a98e81a52730e1721bacb7b220e93affe Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 5 Oct 2020 18:04:08 +0200 Subject: [PATCH 06/18] add docs for entity_ruler.initialize --- spacy/pipeline/entityruler.py | 2 +- website/docs/api/entityruler.md | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index e89dd8410..cad6dbdbc 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -199,7 +199,7 @@ class EntityRuler(Pipe): nlp (Language): The current nlp object the component is part of. patterns_path: Path to serialized patterns. - DOCS (TODO): https://nightly.spacy.io/api/entityruler#initialize + DOCS: https://nightly.spacy.io/api/entityruler#initialize """ if patterns_path: patterns = srsly.read_jsonl(patterns_path) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 7b7e5b635..052047635 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -74,6 +74,30 @@ be a token pattern (list) or a phrase pattern (string). For example: | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ | | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ | +## EntityRuler.initialize {#initialize tag="method" new="3"} + +Initialize the component with patterns from a file. + +> #### Example +> +> ```python +> entity_ruler = nlp.add_pipe("entity_ruler") +> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path) +> ``` +> +> ```ini +> ### config.cfg +> [initialize.components.entity_ruler] +> patterns_path = "data/patterns/patterns.jsonl" +> ``` + +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | +| _keyword-only_ | | +| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | +| `labels` | Path to the .json file holding the serialized patterns. 
~~Path~~ |
+
 ## EntityRuler.\_\_len\_\_ {#len tag="method"}
 
 The number of all patterns added to the entity ruler.
@@ -256,6 +280,6 @@ Get all patterns that were added to the entity ruler.
 
 | Name | Description |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
-| `phrase_matcher`  | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~  |
+| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
 | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

From ff9ac39c88d8eac8e599041a63a69ff754690f5a Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 5 Oct 2020 22:50:14 +0200
Subject: [PATCH 07/18] read entity_ruler patterns with srsly.read_jsonl.v1

---
 spacy/language.py                           |  4 +++-
 spacy/pipeline/entityruler.py               |  9 ++++---
 spacy/pipeline/transition_parser.pyx        |  4 +++-
 spacy/schemas.py                            |  2 +-
 spacy/tests/pipeline/test_attributeruler.py |  2 +-
 spacy/tests/pipeline/test_entity_ruler.py   | 26 +++++++++++++++++++++
 website/docs/api/entityruler.md             |  9 ++++---
 7 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index c3c49d331..ba244617e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1410,7 +1410,9 @@ class Language:
             kwargs = component_cfg.get(name, {})
             # Allow component_cfg to overwrite the top-level kwargs.
             kwargs.setdefault("batch_size", batch_size)
-            if hasattr(proc, "pipe"):
+            # non-trainable components may have a pipe() implementation that refers to dummy
+            # predict and set_annotations methods
+            if hasattr(proc, "pipe") and hasattr(proc, "is_trainable") and proc.is_trainable():
                 f = functools.partial(proc.pipe, **kwargs)
             else:
                 # Apply the function, but yield the doc
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index cad6dbdbc..6ca586d05 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable
+from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence
 from collections import defaultdict
 from pathlib import Path
 import srsly
@@ -190,19 +190,18 @@ class EntityRuler(Pipe):
         get_examples: Callable[[], Iterable[Example]],
         *,
         nlp: Optional[Language] = None,
-        patterns_path: Optional[Path] = None
+        patterns: Optional[Sequence[PatternType]] = None,
     ):
         """Initialize the pipe for training.
 
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
-        patterns_path: Path to serialized patterns.
+        patterns (Optional[Iterable[PatternType]]): The list of patterns. 
DOCS: https://nightly.spacy.io/api/entityruler#initialize """ - if patterns_path: - patterns = srsly.read_jsonl(patterns_path) + if patterns: self.add_patterns(patterns) diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 2ad0acd3a..3b4406757 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -437,7 +437,9 @@ cdef class Parser(Pipe): for name, component in nlp.pipeline: if component is self: break - if hasattr(component, "pipe"): + # non-trainable components may have a pipe() implementation that refers to dummy + # predict and set_annotations methods + if hasattr(component, "pipe") and hasattr(component, "is_trainable") and component.is_trainable(): doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] diff --git a/spacy/schemas.py b/spacy/schemas.py index 591b7e134..f4d306fd7 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -119,7 +119,7 @@ def validate_init_settings( if types don't match or required values are missing. func (Callable): The initialize method of a given component etc. - settings (Dict[str, Any]): The settings from the repsective [initialize] block. + settings (Dict[str, Any]): The settings from the respective [initialize] block. section (str): Initialize section, for error message. name (str): Name of the block in the section. exclude (Iterable[str]): Parameter names to exclude from schema. diff --git a/spacy/tests/pipeline/test_attributeruler.py b/spacy/tests/pipeline/test_attributeruler.py index c967bcdcd..fedeb192f 100644 --- a/spacy/tests/pipeline/test_attributeruler.py +++ b/spacy/tests/pipeline/test_attributeruler.py @@ -121,7 +121,7 @@ def test_attributeruler_init_patterns(nlp, pattern_dicts): assert doc.has_annotation("LEMMA") assert doc.has_annotation("MORPH") nlp.remove_pipe("attribute_ruler") - # initialize with patterns from asset + # initialize with patterns from misc registry nlp.config["initialize"]["components"]["attribute_ruler"] = { "patterns": {"@misc": "attribute_ruler_patterns"} } diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index d70d0326e..96deab24b 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,4 +1,6 @@ import pytest + +from spacy import registry from spacy.tokens import Span from spacy.language import Language from spacy.pipeline import EntityRuler @@ -11,6 +13,7 @@ def nlp(): @pytest.fixture +@registry.misc("entity_ruler_patterns") def patterns(): return [ {"label": "HELLO", "pattern": "hello world"}, @@ -42,6 +45,29 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_init_patterns(nlp, patterns): + # initialize with patterns + ruler = nlp.add_pipe("entity_ruler") + assert len(ruler.labels) == 0 + ruler.initialize(lambda: [], patterns=patterns) + assert len(ruler.labels) == 4 + doc = nlp("hello world bye bye") + assert doc.ents[0].label_ == "HELLO" + assert doc.ents[1].label_ == "BYE" + nlp.remove_pipe("entity_ruler") + # initialize with patterns from misc registry + nlp.config["initialize"]["components"]["entity_ruler"] = { + "patterns": {"@misc": "entity_ruler_patterns"} + } + ruler = nlp.add_pipe("entity_ruler") + assert len(ruler.labels) == 0 + nlp.initialize() + assert len(ruler.labels) == 4 + doc = nlp("hello world bye bye") + assert doc.ents[0].label_ == "HELLO" + assert doc.ents[1].label_ == "BYE" + + def 
test_entity_ruler_existing(nlp, patterns): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index 052047635..b8aab2f50 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -82,13 +82,16 @@ Initialize the component with patterns from a file. > > ```python > entity_ruler = nlp.add_pipe("entity_ruler") -> entity_ruler.initialize(lambda: [], nlp=nlp, patterns_path=patterns_path) +> entity_ruler.initialize(lambda: [], nlp=nlp, patterns=patterns) > ``` > > ```ini > ### config.cfg > [initialize.components.entity_ruler] -> patterns_path = "data/patterns/patterns.jsonl" +> +> [initialize.components.entity_ruler.patterns] +> @readers = "srsly.read_jsonl.v1" +> path = "corpus/entity_ruler_patterns.jsonl > ``` | Name | Description | @@ -96,7 +99,7 @@ Initialize the component with patterns from a file. | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Not used by the `EntityRuler`. ~~Callable[[], Iterable[Example]]~~ | | _keyword-only_ | | | `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ | -| `labels` | Path to the .json file holding the serialized patterns. ~~Path~~ | +| `patterns` | The list of patterns. Defaults to `None`. ~~Optional[Sequence[Dict[str, Union[str, List[Dict[str, Any]]]]]]~~ | ## EntityRuler.\_\len\_\_ {#len tag="method"} From fd0f60e2bc6046454dd5624b71aaebf21364e76e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 6 Oct 2020 09:28:53 +0200 Subject: [PATCH 08/18] updates to data format for training and pretraining --- website/docs/api/data-formats.md | 58 ++++++++++++++++---------------- website/docs/usage/training.md | 2 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c1b9bfef4..a97dcd2f6 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -180,24 +180,24 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. 
See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. 
The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -205,17 +205,17 @@ This section is optional and defines settings and controls for [language model pretraining](/usage/embeddings-transformers#pretraining). It's used when you run [`spacy pretrain`](/api/cli#pretrain). -| Name | Description | -| -------------- | ------------------------------------------------------------------------------------------------------ | -| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | -| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | -| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | -| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | -| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ | -| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ | -| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ | +| Name | Description | +| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ | +| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ | +| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ | +| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `corpus` | Dot notation of the config location defining the corpus with raw text. Defaults to `corpora.pretrain`. ~~str~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `component` | Component name to identify the layer with the model to pretrain. Defaults to `"tok2vec"`. ~~str~~ | +| `layer` | The specific layer of the model to pretrain. If empty, the whole model will be used. 
~~str~~ | ### initialize {#config-initialize tag="section"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 1981f03b7..64b3b85ad 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training). If you don't want a component to be updated, you can **freeze** it by adding it to the `frozen_components` list in the `[training]` block. Frozen components are **not updated** during training and are included in the final trained pipeline -as-is. +as-is. They are also excluded when calling `nlp.initialize()`. > #### Note on frozen components > From 9b4cf7b0b6b614ff044ae610217a3a73dcf35851 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 6 Oct 2020 09:47:23 +0200 Subject: [PATCH 09/18] update output of debug config command --- spacy/cli/_util.py | 2 +- website/docs/api/cli.md | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index 373650172..60e400fb4 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -278,7 +278,7 @@ def show_validation_error( "fill-config' command to fill in all the defaults, if possible:", spaced=True, ) - print(f"{COMMAND} init fill-config {config_path} --base {config_path}\n") + print(f"{COMMAND} init fill-config {config_path} {config_path} \n") sys.exit(1) except InterpolationError as e: msg.fail("Config validation error", e, exits=1) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index e51e698dd..138b4b94b 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -300,17 +300,16 @@ $ python -m spacy debug config [config_path] [--code] [--show-functions] [--show ``` ✘ Config validation error +dropout field required +optimizer field required +optimize extra fields not permitted -training -> dropout field required -training -> optimizer field required -training -> optimize extra fields not permitted - -{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}} +{'seed': 0, 'accumulate_gradient': 1, 'dev_corpus': 'corpora.dev', 'train_corpus': 'corpora.train', 'gpu_allocator': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'before_to_disk': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'logger': {'@loggers': 'spacy.ConsoleLogger.v1', 'progress_bar': False}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}} If your config contains missing values, you can run the 'init fill-config' command to fill in all the defaults, if possible: -python -m spacy init fill-config 
tmp/starter-config_invalid.cfg --base tmp/starter-config_invalid.cfg +python -m spacy init fill-config tmp/starter-config_invalid.cfg tmp/starter-config_invalid.cfg ``` From 2e961817cbcf63afa1ee81ea8338850fc4cc157f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Oct 2020 10:23:01 +0200 Subject: [PATCH 10/18] Update docs [ci skip] --- website/docs/usage/training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 64b3b85ad..e63e25e52 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -378,7 +378,7 @@ weights and [resume training](/api/language#resume_training). If you don't want a component to be updated, you can **freeze** it by adding it to the `frozen_components` list in the `[training]` block. Frozen components are **not updated** during training and are included in the final trained pipeline -as-is. They are also excluded when calling `nlp.initialize()`. +as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize). > #### Note on frozen components > From 2fd7122074259bde66d79e2f6a289809d545777e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Oct 2020 10:31:48 +0200 Subject: [PATCH 11/18] Update docs [ci skip] --- website/docs/api/attributeruler.md | 4 ++-- website/docs/api/data-formats.md | 3 +++ website/docs/api/entityruler.md | 9 +++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/website/docs/api/attributeruler.md b/website/docs/api/attributeruler.md index b89759080..d60362a47 100644 --- a/website/docs/api/attributeruler.md +++ b/website/docs/api/attributeruler.md @@ -128,8 +128,8 @@ Get all patterns that have been added to the attribute ruler in the ## AttributeRuler.initialize {#initialize tag="method"} -Initialize the component with data. Typically called before training to load in -rules from a file. This method is typically called by +Initialize the component with data and used before training to load in rules +from a file. This method is typically called by [`Language.initialize`](/api/language#initialize) and lets you customize arguments it receives via the [`[initialize.components]`](/api/data-formats#config-initialize) block in the diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index a97dcd2f6..c4cc5b1e4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -224,6 +224,9 @@ It's used by [`Language.initialize`](/api/language#initialize) and typically called right before training (but not at runtime). The section allows you to specify local file paths or custom functions to load data resources from, without requiring them at runtime when you load the trained pipeline back in. +Also see the usage guides on the +[config lifecycle](/usage/training#config-lifecycle) and +[custom initialization](/usage/training#initialization). > #### Example > diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index b8aab2f50..76a4b3604 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -76,7 +76,12 @@ be a token pattern (list) or a phrase pattern (string). For example: ## EntityRuler.initialize {#initialize tag="method" new="3"} -Initialize the component with patterns from a file. +Initialize the component with data and used before training to load in rules +from a file. 
This method is typically called by +[`Language.initialize`](/api/language#initialize) and lets you customize +arguments it receives via the +[`[initialize.components]`](/api/data-formats#config-initialize) block in the +config. > #### Example > @@ -204,7 +209,7 @@ only the patterns are saved as JSONL. If a directory name is provided, a ## EntityRuler.from_disk {#from_disk tag="method"} -Load the entity ruler from a file. Expects either a file containing +Load the entity ruler from a path. Expects either a file containing newline-delimited JSON (JSONL) with one entry per line, or a directory containing a `patterns.jsonl` file and a `cfg` file with the component configuration. From 59982d5ef8155fe8a3b1d58b016d669c05426f0b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 6 Oct 2020 10:40:43 +0200 Subject: [PATCH 12/18] Add pip upgrade step to README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3e5e5febe..5d310492d 100644 --- a/README.md +++ b/README.md @@ -104,9 +104,11 @@ For detailed installation instructions, see the ### pip Using pip, spaCy releases are available as source packages and binary wheels (as -of `v2.0.13`). +of `v2.0.13`). Before you install spaCy and its dependencies, make sure that +your `pip`, `setuptools` and `wheel` are up to date. ```bash +pip install -U pip setuptools wheel pip install spacy ``` From aa9c9f3bf0acf88c596b006c157b5f56ed306aeb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 6 Oct 2020 11:21:17 +0200 Subject: [PATCH 13/18] Update Chinese usage for spacy-pkuseg --- website/docs/usage/models.md | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index dc41385f2..fe3ee6e04 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -98,10 +98,10 @@ The Chinese language class supports three word segmentation options, `char`, > # Jieba > cfg = {"segmenter": "jieba"} > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) -> # PKUSeg with "default" model provided by pkuseg +> # PKUSeg with "mixed" model provided by pkuseg > cfg = {"segmenter": "pkuseg"} > nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) -> nlp.tokenizer.initialize(pkuseg_model="default") +> nlp.tokenizer.initialize(pkuseg_model="mixed") > ``` ```ini @@ -115,7 +115,7 @@ segmenter = "char" | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `char` | **Character segmentation:** Character segmentation is the default segmentation option. It's enabled when you create a new `Chinese` language class or call `spacy.blank("zh")`. | | `jieba` | **Jieba:** to use [Jieba](https://github.com/fxsjy/jieba) for word segmentation, you can set the option `segmenter` to `"jieba"`. | -| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/lancopku/PKUSeg-python) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. 
| +| `pkuseg` | **PKUSeg**: As of spaCy v2.3.0, support for [PKUSeg](https://github.com/explosion/spacy-pkuseg) has been added to support better segmentation for Chinese OntoNotes and the provided [Chinese pipelines](/models/zh). Enable PKUSeg by setting tokenizer option `segmenter` to `"pkuseg"`. | @@ -133,10 +133,10 @@ runtime. The `initialize` method for the Chinese tokenizer class supports the following config settings for loading `pkuseg` models: -| Name | Description | -| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | -| `pkuseg_model` | Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ | -| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`. ~~str~~ | +| Name | Description | +| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `pkuseg_model` | Name of a model provided by `spacy-pkuseg` or the path to a local model directory. ~~str~~ | +| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. Defaults to `"default"`, the default provided dictionary. ~~str~~ | The initialization settings are typically provided in the [training config](/usage/training#config) and the data is loaded in before @@ -164,14 +164,17 @@ You can also initialize the tokenizer for a blank language class by calling its cfg = {"segmenter": "pkuseg"} nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}}) -# Load "default" model -nlp.tokenizer.initialize(pkuseg_model="default") +# Load spaCy's OntoNotes model +nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes") + +# Load pkuseg's "news" model +nlp.tokenizer.initialize(pkuseg_model="news") # Load local model nlp.tokenizer.initialize(pkuseg_model="/path/to/pkuseg_model") # Override the user directory -nlp.tokenizer.initialize(pkuseg_model="default", pkuseg_user_dict="/path/to/user_dict") +nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes", pkuseg_user_dict="/path/to/user_dict") ``` You can also modify the user dictionary on-the-fly: @@ -195,13 +198,13 @@ The [Chinese pipelines](/models/zh) provided by spaCy include a custom `pkuseg` model trained only on [Chinese OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19), since the models provided by `pkuseg` include data restricted to research use. 
For -research use, `pkuseg` provides models for several different domains -(`"default"`, `"news"` `"web"`, `"medicine"`, `"tourism"`) and for other uses, -`pkuseg` provides a simple -[training API](https://github.com/lancopku/pkuseg-python/blob/master/readme/readme_english.md#usage): +research use, `pkuseg` provides models for several different domains (`"mixed"` +(equivalent to `"default"` from `pkuseg` packages), `"news"` `"web"`, +`"medicine"`, `"tourism"`) and for other uses, `pkuseg` provides a simple +[training API](https://github.com/explosion/spacy-pkuseg/blob/master/readme/readme_english.md#usage): ```python -import pkuseg +import spacy_pkuseg as pkuseg from spacy.lang.zh import Chinese # Train pkuseg model From 2a17566da3c7c39dfb6639f00f0453d9e988cb8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Oct 2020 14:15:08 +0200 Subject: [PATCH 14/18] Update docs [ci skip] --- website/docs/images/layers-architectures.svg | 97 -------------------- website/docs/images/trainable_component.svg | 55 +++++++++++ website/docs/usage/layers-architectures.md | 6 +- website/docs/usage/processing-pipelines.md | 14 +-- website/docs/usage/spacy-101.md | 69 +++++++++++++- 5 files changed, 134 insertions(+), 107 deletions(-) delete mode 100644 website/docs/images/layers-architectures.svg create mode 100644 website/docs/images/trainable_component.svg diff --git a/website/docs/images/layers-architectures.svg b/website/docs/images/layers-architectures.svg deleted file mode 100644 index 22e705ba1..000000000 --- a/website/docs/images/layers-architectures.svg +++ /dev/null @@ -1,97 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/website/docs/images/trainable_component.svg b/website/docs/images/trainable_component.svg new file mode 100644 index 000000000..621ff90ef --- /dev/null +++ b/website/docs/images/trainable_component.svg @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md index 24c7bf1cf..7fa60e0f1 100644 --- a/website/docs/usage/layers-architectures.md +++ b/website/docs/usage/layers-architectures.md @@ -646,7 +646,9 @@ get_candidates = model.attrs["get_candidates"] To use our new relation extraction model as part of a custom [trainable component](/usage/processing-pipelines#trainable-components), we -create a subclass of [`Pipe`](/api/pipe) that holds the model: +create a subclass of [`Pipe`](/api/pipe) that holds the model. + +![Illustration of Pipe methods](../images/trainable_component.svg) ```python ### Pipeline component skeleton @@ -826,7 +828,7 @@ def __call__(self, Doc doc): Once our `Pipe` subclass is fully implemented, we can [register](/usage/processing-pipelines#custom-components-factories) the -component with the [`@Language.factory`](/api/lnguage#factory) decorator. This +component with the [`@Language.factory`](/api/language#factory) decorator. This assigns it a name and lets you create the component with [`nlp.add_pipe`](/api/language#add_pipe) and via the [config](/usage/training#config). 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index c8224dfc9..8b4e39ee9 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -1172,13 +1172,15 @@ doc = nlp("This is a text...") spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable components that have their own model instance, make predictions over `Doc` objects and can be updated using [`spacy train`](/api/cli#train). This lets you -plug fully custom machine learning components into your pipeline. You'll need -the following: +plug fully custom machine learning components into your pipeline. + +![Illustration of Pipe methods](../images/trainable_component.svg) + +You'll need the following: 1. **Model:** A Thinc [`Model`](https://thinc.ai/docs/api-model) instance. This - can be a model implemented in - [Thinc](/usage/layers-architectures#thinc), or a - [wrapped model](/usage/layers-architectures#frameworks) implemented in + can be a model implemented in [Thinc](/usage/layers-architectures#thinc), or + a [wrapped model](/usage/layers-architectures#frameworks) implemented in PyTorch, TensorFlow, MXNet or a fully custom solution. The model must take a list of [`Doc`](/api/doc) objects as input and can have any type of output. 2. **Pipe subclass:** A subclass of [`Pipe`](/api/pipe) that implements at least @@ -1283,7 +1285,7 @@ loss is calculated and to add evaluation scores to the training output. For more details on how to implement your own trainable components and model architectures, and plug existing models implemented in PyTorch or TensorFlow into your spaCy pipeline, see the usage guide on -[layers and model architectures](/usage/layers-architectures). +[layers and model architectures](/usage/layers-architectures#components). diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 5d7c7d7a5..c315c5f76 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -404,8 +404,73 @@ import Training101 from 'usage/101/\_training.md' To learn more about **training and updating** pipelines, how to create training -data and how to improve spaCy's named entity recognition models, see the usage -guides on [training](/usage/training). +data and how to improve spaCy's named models, see the usage guides on +[training](/usage/training). + + + +### Training config and lifecycle {#training-config} + +Training config files include all **settings and hyperparameters** for training +your pipeline. Instead of providing lots of arguments on the command line, you +only need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). +This also makes it easy to integrate custom models and architectures, written in +your framework of choice. A pipeline's `config.cfg` is considered the "single +source of truth", both at **training** and **runtime**. + +> ```ini +> ### config.cfg (excerpt) +> [training] +> accumulate_gradient = 3 +> +> [training.optimizer] +> @optimizers = "Adam.v1" +> +> [training.optimizer.learn_rate] +> @schedules = "warmup_linear.v1" +> warmup_steps = 250 +> total_steps = 20000 +> initial_rate = 0.01 +> ``` + +![Illustration of pipeline lifecycle](../images/lifecycle.svg) + + + +For more details on spaCy's **configuration system** and how to use it to +customize your pipeline components, component models, training settings and +hyperparameters, see the [training config](/usage/training#config) usage guide. 
+ + + +### Trainable components {#training-components} + +spaCy's [`Pipe`](/api/pipe) class helps you implement your own trainable +components that have their own model instance, make predictions over `Doc` +objects and can be updated using [`spacy train`](/api/cli#train). This lets you +plug fully custom machine learning components into your pipeline that can be +configured via a single training config. + +> #### config.cfg (excerpt) +> +> ```ini +> [components.my_component] +> factory = "my_component" +> +> [components.my_component.model] +> @architectures = "my_model.v1" +> width = 128 +> ``` + +![Illustration of Pipe methods](../images/trainable_component.svg) + + + +To learn more about how to implement your own **model architectures** and use +them to power custom **trainable components**, see the usage guides on the +[trainable component API](/usage/processing-pipelines#trainable-components) and +implementing [layers and architectures](/usage/layers-architectures#components) +for trainable components. From cfb9770a94980db9e385724568b434b7790e8bc2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 6 Oct 2020 14:15:41 +0200 Subject: [PATCH 15/18] Fix empty input into StaticVectors layer (#6211) * Add test for empty doc(s) * Fix empty check in staticvectors * Remove xfail * Update spacy/ml/staticvectors.py --- spacy/ml/staticvectors.py | 2 +- spacy/tests/test_models.py | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py index c77247d33..da731dadb 100644 --- a/spacy/ml/staticvectors.py +++ b/spacy/ml/staticvectors.py @@ -34,7 +34,7 @@ def StaticVectors( def forward( model: Model[List[Doc], Ragged], docs: List[Doc], is_train: bool ) -> Tuple[Ragged, Callable]: - if not len(docs): + if not sum(len(doc) for doc in docs): return _handle_empty(model.ops, model.get_dim("nO")) key_attr = model.attrs["key_attr"] W = cast(Floats2d, model.ops.as_contig(model.get_param("W"))) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 17408f7e8..8ca7f8b66 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -7,6 +7,7 @@ import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier +from spacy.ml.staticvectors import StaticVectors from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES @@ -185,3 +186,22 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X): model1 = get_updated_model() model2 = get_updated_model() assert_array_equal(get_all_params(model1), get_all_params(model2)) + + +@pytest.mark.parametrize( + "model_func,kwargs", + [ + (StaticVectors, {"nO": 128, "nM": 300}), + ] +) +def test_empty_docs(model_func, kwargs): + nlp = English() + model = model_func(**kwargs).initialize() + # Test the layer can be called successfully with 0, 1 and 2 empty docs. 
+ for n_docs in range(3): + docs = [nlp("") for _ in range(n_docs)] + # Test predict + _ = model.predict(docs) + # Test backprop + output, backprop = model.begin_update(docs) + _ = backprop(output) From fff3f8ccfaec48dcfdd6b19e6811070724d33c80 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 6 Oct 2020 14:16:05 +0200 Subject: [PATCH 16/18] Fix packaging pin (#6212) * pin packaging to >=20.0 * ignore spacy-pkuseg in requirements unit test --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/tests/package/test_requirements.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 29695e9b4..3f3886a60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,7 +18,7 @@ pydantic>=1.5.0,<2.0.0 pytokenizations # Official Python utilities setuptools -packaging +packaging>=20.0 importlib_metadata>=0.20; python_version < "3.8" typing_extensions>=3.7.4; python_version < "3.8" # Development dependencies diff --git a/setup.cfg b/setup.cfg index e77bda2fc..eef4fcf67 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,7 +55,7 @@ install_requires = pytokenizations # Official Python utilities setuptools - packaging + packaging>=20.0 importlib_metadata>=0.20; python_version < "3.8" typing_extensions>=3.7.4; python_version < "3.8" diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index 6cc8fa6a8..8145beba9 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -10,12 +10,14 @@ def test_build_dependencies(): "mock", "flake8", ] + # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ "fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core", + "spacy-pkuseg", ] # check requirements.txt From 1a500f9717bd92b2d376bde6aec387e3dfd92878 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 6 Oct 2020 14:19:07 +0200 Subject: [PATCH 17/18] Set version to v3.0.0a35 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 373d1d2b0..108689074 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a34" +__version__ = "3.0.0a35" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From ce14520789eae6123589423613ce513ae74ead1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 6 Oct 2020 14:35:17 +0200 Subject: [PATCH 18/18] Update docs [ci skip] --- website/docs/usage/v3.md | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md index a10fc6321..1024a2551 100644 --- a/website/docs/usage/v3.md +++ b/website/docs/usage/v3.md @@ -168,9 +168,13 @@ follow the same unified [`Model`](https://thinc.ai/docs/api-model) API and each `Model` can also be used as a sublayer of a larger network, allowing you to freely combine implementations from different frameworks into a single model. 
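Since the passage above describes combining wrapped PyTorch layers with native Thinc layers through the shared `Model` API, here is a brief sketch (an editor's illustration, not part of the patch; assumes `thinc` v8 and `torch` are installed):

```python
import numpy
import torch.nn
from thinc.api import PyTorchWrapper, Linear, chain

# Wrap a PyTorch module so it behaves like any other Thinc Model ...
wrapped = PyTorchWrapper(torch.nn.Linear(16, 8))
# ... and chain it freely with a native Thinc layer in a single network.
model = chain(wrapped, Linear(nO=4, nI=8))
model.initialize()
Y = model.predict(numpy.zeros((2, 16), dtype="f"))
assert Y.shape == (2, 4)
```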
+![Illustration of Pipe methods](../images/trainable_component.svg)
+
-- **Usage: ** [Layers and architectures](/usage/layers-architectures)
+- **Usage: ** [Layers and architectures](/usage/layers-architectures),
+  [Trainable component API](/usage/processing-pipelines#trainable-components),
+  [Trainable components and models](/usage/layers-architectures#components)
 - **Thinc: **
   [Wrapping PyTorch, TensorFlow & MXNet](https://thinc.ai/docs/usage-frameworks),
   [`Model` API](https://thinc.ai/docs/api-model)
@@ -503,36 +507,27 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Pipeline package symlinks, the `link` command and shortcut names are now
   deprecated. There can be many [different trained pipelines](/models) and not
   just one "English model", so you should always use the full package name like
-  [`en_core_web_sm`](/models/en) explicitly.
-- A pipeline's [`meta.json`](/api/data-formats#meta) is now only used to provide
-  meta information like the package name, author, license and labels. It's
-  **not** used to construct the processing pipeline anymore. This is all defined
-  in the [`config.cfg`](/api/data-formats#config), which also includes all
-  settings used to train the pipeline.
-- The [`train`](/api/cli#train) and [`pretrain`](/api/cli#pretrain) commands now
-  only take a `config.cfg` file containing the full
-  [training config](/usage/training#config).
+  `en_core_web_sm` explicitly.
+- A pipeline's `meta.json` is now only used to provide meta information like the
+  package name, author, license and labels. It's **not** used to construct the
+  processing pipeline anymore. This is all defined in the
+  [`config.cfg`](/api/data-formats#config), which also includes all settings
+  used to train the pipeline.
+- The `train`, `pretrain` and `debug data` commands now only take a
+  `config.cfg`.
 - [`Language.add_pipe`](/api/language#add_pipe) now takes the **string name** of
   the component factory instead of the component function.
 - **Custom pipeline components** now need to be decorated with the
   [`@Language.component`](/api/language#component) or
   [`@Language.factory`](/api/language#factory) decorator.
-- [`Language.update`](/api/language#update) now takes a batch of
-  [`Example`](/api/example) objects instead of raw texts and annotations, or
-  `Doc` and `GoldParse` objects.
-- The `Language.disable_pipes` context manager has been replaced by
-  [`Language.select_pipes`](/api/language#select_pipes), which can explicitly
-  disable or enable components.
 - The [`Language.update`](/api/language#update),
   [`Language.evaluate`](/api/language#evaluate) and
   [`Pipe.update`](/api/pipe#update) methods now all take batches of
   [`Example`](/api/example) objects instead of `Doc` and `GoldParse` objects, or
   raw text and a dictionary of annotations.
-- [`Language.initialize`](/api/language#initialize) and
-  [`Pipe.initialize`](/api/pipe#initialize) now take a function that returns a
-  sequence of `Example` objects to initialize the model instead of a list of
-  tuples.
-- The `begin_training` methods have been renamed to `initialize`.
+- The `begin_training` methods have been renamed to `initialize` and now take a
+  function that returns a sequence of `Example` objects to initialize the model
+  instead of a list of tuples.
 - [`Matcher.add`](/api/matcher#add) and
   [`PhraseMatcher.add`](/api/phrasematcher#add) now only accept a list of
   patterns as the second argument (instead of a variable number of arguments).
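To make the `Matcher.add` change in the last bullet above concrete, here is a short before/after sketch (an editor's illustration, not part of the patch):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]

# v2 (removed):  matcher.add("HelloWorld", None, pattern)
# v3: the second argument is a single list of patterns.
matcher.add("HelloWorld", [pattern])
matches = matcher(nlp("Hello world!"))
assert len(matches) == 1
```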
| Removed | Replacement | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | +| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe) | | `Language.begin_training`, `Pipe.begin_training`, ... | [`Language.initialize`](/api/language#initialize), [`Pipe.initialize`](/api/pipe#initialize), ... | | `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) | | `GoldParse` | [`Example`](/api/example) |
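As a closing illustration of the first row in the table above, a minimal migration sketch (an editor's addition, not part of the patch; assumes the `en_core_web_sm` pipeline is installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# v2 (removed):  with nlp.disable_pipes("ner"): ...
# v3: use the select_pipes context manager instead.
with nlp.select_pipes(disable=["ner"]):
    doc = nlp("Apple is looking at buying a U.K. startup.")
    assert not doc.ents  # NER is skipped inside the block

doc = nlp("Apple is looking at buying a U.K. startup.")
print(doc.ents)  # disabled components run again once the block exits
```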