Mirror of https://github.com/explosion/spaCy.git (synced 2024-11-11 12:18:04 +03:00)

Commit 4cb7125f7a: Merge branch 'master' into spacy.io
@@ -73,8 +73,13 @@ class Warnings:
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")

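The replacement W033 text now walks users through the `[initialize.lookups]` fix. As rough orientation only, a Python equivalent of that config block might look like the sketch below; it assumes `spacy-lookups-data` is installed and uses `load_lookups`, the helper behind the registered `spacy.LookupsDataLoader.v1` function, so treat the exact calls as illustrative rather than the canonical path.

```python
# Sketch: load the "lexeme_norm" table from spacy-lookups-data into a vocab,
# mirroring what the [initialize.lookups] block quoted in the warning does.
import spacy
from spacy.lookups import load_lookups

nlp = spacy.blank("da")  # any language listed in LEXEME_NORM_LANGS
lookups = load_lookups(nlp.lang, ["lexeme_norm"])
nlp.vocab.lookups.add_table("lexeme_norm", lookups.get_table("lexeme_norm"))
```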
@@ -1686,15 +1686,21 @@ class Language:
             )
         # Detect components with listeners that are not frozen consistently
         for name, proc in nlp.pipeline:
-            if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
-                for listener in proc.listening_components:
-                    # If it's a component sourced from another pipeline, we check if
-                    # the tok2vec listeners should be replaced with standalone tok2vec
-                    # models (e.g. so component can be frozen without its performance
-                    # degrading when other components/tok2vec are updated)
-                    paths = sourced.get(listener, {}).get("replace_listeners", [])
-                    if paths:
-                        nlp.replace_listeners(name, listener, paths)
+            # Remove listeners not in the pipeline
+            listener_names = getattr(proc, "listening_components", [])
+            unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
+            for listener_name in unused_listener_names:
+                for listener in proc.listener_map.get(listener_name, []):
+                    proc.remove_listener(listener, listener_name)
+
+            for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+                # If it's a component sourced from another pipeline, we check if
+                # the tok2vec listeners should be replaced with standalone tok2vec
+                # models (e.g. so component can be frozen without its performance
+                # degrading when other components/tok2vec are updated)
+                paths = sourced.get(listener, {}).get("replace_listeners", [])
+                if paths:
+                    nlp.replace_listeners(name, listener, paths)
         return nlp

     def replace_listeners(
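For orientation, the same `replace_listeners` mechanism can also be invoked directly on a loaded pipeline. A minimal sketch, assuming a standard pipeline layout with a shared `tok2vec` component and the default `model.tok2vec` listener path:

```python
# Sketch: give the tagger its own copy of the shared tok2vec so the shared
# component can be frozen or retrained without degrading the tagger.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
```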
@@ -202,6 +202,8 @@ cdef class Matcher:
         doclike (Doc or Span): The document to match over.
         as_spans (bool): Return Span objects with labels instead of (match_id,
             start, end) tuples.
+        allow_missing (bool): Whether to skip checks for missing annotation for
+            attributes included in patterns. Defaults to False.
         RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `match_id` is an integer. If as_spans is set
@@ -222,7 +224,7 @@ cdef class Matcher:
                 if attr == TAG:
                     pipe = "tagger"
                 elif attr in (POS, MORPH):
-                    pipe = "morphologizer"
+                    pipe = "morphologizer or tagger+attribute_ruler"
                 elif attr == LEMMA:
                     pipe = "lemmatizer"
                 elif attr == DEP:
@@ -194,7 +194,7 @@ cdef class PhraseMatcher:
                 if attr == TAG:
                     pipe = "tagger"
                 elif attr in (POS, MORPH):
-                    pipe = "morphologizer"
+                    pipe = "morphologizer or tagger+attribute_ruler"
                 elif attr == LEMMA:
                     pipe = "lemmatizer"
                 elif attr == DEP:
@@ -137,6 +137,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#initialize
         """
         validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
@@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
         DOCS: https://spacy.io/api/sentencerecognizer#initialize
         """
         validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
@@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#initialize
         """
         validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
@@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):

     def initialize(self, get_examples, nlp=None, labels=None):
         validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
         if labels is not None:
             actions = dict(labels)
         else:
@@ -155,7 +155,7 @@ class Corpus:
                 continue
             elif self.max_length == 0 or len(reference) < self.max_length:
                 yield self._make_example(nlp, reference, False)
-            elif reference.is_sentenced:
+            elif reference.has_annotation("SENT_START"):
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
@@ -166,7 +166,7 @@ class Corpus:
         self, nlp: "Language", reference_docs: Iterable[Doc]
     ) -> Iterator[Example]:
         for reference in reference_docs:
-            if reference.is_sentenced:
+            if reference.has_annotation("SENT_START"):
                 ref_sents = [sent.as_doc() for sent in reference.sents]
             else:
                 ref_sents = [reference]
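Both `Corpus` hunks replace the deprecated `is_sentenced` flag with `Doc.has_annotation("SENT_START")`. A small illustrative sketch of that check on a freshly tokenized `Doc`; the printed values assume no component has set sentence boundaries yet:

```python
# Sketch: has_annotation("SENT_START") reports whether sentence boundaries
# have been set on the Doc, which is what the Corpus code now relies on.
import spacy

nlp = spacy.blank("en")
doc = nlp("One sentence. Another sentence.")
print(doc.has_annotation("SENT_START"))  # False: nothing has set boundaries yet
doc[3].is_sent_start = True
print(doc.has_annotation("SENT_START"))  # True
```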
@@ -72,13 +72,16 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
-        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
-            for listener in proc.listening_components:
-                if listener in frozen_components and name not in frozen_components:
-                    logger.warning(Warnings.W087.format(name=name, listener=listener))
-                # We always check this regardless, in case user freezes tok2vec
-                if listener not in frozen_components and name in frozen_components:
-                    logger.warning(Warnings.W086.format(name=name, listener=listener))
+        for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+            # Don't warn about components not in the pipeline
+            if listener not in nlp.pipe_names:
+                continue
+
+            if listener in frozen_components and name not in frozen_components:
+                logger.warning(Warnings.W087.format(name=name, listener=listener))
+            # We always check this regardless, in case user freezes tok2vec
+            if listener not in frozen_components and name in frozen_components:
+                logger.warning(Warnings.W086.format(name=name, listener=listener))
     return nlp


@@ -59,7 +59,7 @@ if TYPE_CHECKING:

 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]

 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
@@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co

 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)


@@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-            and func.__module__ in sys.modules:  # method
-        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-        return hasattr(cls_func, attr)
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
     return False


@@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
     """
     if is_in_jupyter():
         from thinc.backends.cupy_ops import CupyOps
+
         if CupyOps.xp is not None:
             from thinc.backends import contextvars_eq_thread_ops
+
             if not contextvars_eq_thread_ops():
                 warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))
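The new `check_lexeme_norms` helper only emits W033 through `logger.debug`, so nothing is printed by default. A one-line sketch for surfacing it; the logger name is taken from the `util.py` hunk above:

```python
# Sketch: enable debug output on the "spacy" logger to see W033 when a
# tagger/morphologizer/senter/parser/NER component initializes without
# lexeme_norm tables.
import logging

logging.getLogger("spacy").setLevel(logging.DEBUG)
```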
@@ -77,7 +77,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude]

 | Name | Description |
 | ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
-| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~ |
+| `model` | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~ |
 | `--markdown`, `-md` | Print information as Markdown. ~~bool (flag)~~ |
 | `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~ |
 | `--exclude`, `-e` | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
@@ -259,7 +259,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 | Name | Description |
 | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
 | `input_file` | Input file. ~~Path (positional)~~ |
-| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~ |
+| `output_dir` | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~ |
 | `--converter`, `-c` <Tag variant="new">2</Tag> | Name of converter to use (see below). ~~str (option)~~ |
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
 | `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
@@ -642,7 +642,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
 | Name | Description |
 | ----------------- | ---------------------------------------------------------------------------------- |
 | `model` | A loadable spaCy pipeline (package name or path). ~~str (positional)~~ |
-| `inputs` | Optional path to input file, or `-` for standard input. ~~Path (positional)~~ |
+| `inputs` | Path to input file, or `-` for standard input. ~~Path (positional)~~ |
 | `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **PRINTS** | Profiling information for the pipeline. |
@@ -1191,14 +1191,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 > $ python -m spacy project dvc all
 > ```

 | Name | Description |
-| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
+| ----------------- | ------------------------------------------------------------------------------------------------------------- |
 | `project_dir` | Path to project directory. Defaults to current working directory. ~~Path (positional)~~ |
-| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
+| `workflow` | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
 | `--force`, `-F` | Force-updating config file. ~~bool (flag)~~ |
 | `--verbose`, `-V` | Print more output generated by DVC. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
 | **CREATES** | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow. |

 ## ray {#ray new="3"}

@@ -1236,7 +1236,7 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a
 | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
 | `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
+| `--output`, `-o` | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~ |
 | `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~ |
 | `--address`, `-a` | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~ |
 | `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
@@ -198,7 +198,6 @@ more efficient than processing texts one-by-one.
 | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
 | `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
-| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
 | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
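The `as_tuples` row in the table above describes a `(text, context)` round trip; here is a minimal sketch of that behavior (the texts and context dicts are invented for illustration):

```python
# Sketch: nlp.pipe with as_tuples=True yields (doc, context) pairs, keeping
# each document paired with whatever metadata was passed in.
import spacy

nlp = spacy.load("en_core_web_sm")
data = [
    ("Net income was $9.4 million", {"id": 1}),
    ("Revenue exceeded twelve billion dollars", {"id": 2}),
]
for doc, context in nlp.pipe(data, as_tuples=True):
    print(context["id"], [(ent.text, ent.label_) for ent in doc.ents])
```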
@@ -872,10 +871,10 @@ when loading a config with
 > replace_listeners = ["model.tok2vec"]
 > ```

 | Name | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~ |
 | `pipe_name` | Name of pipeline component to replace listeners for. ~~str~~ |
 | `listeners` | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced.~~Iterable[str]~~ |

 ## Language.meta {#meta tag="property"}
@@ -133,8 +133,8 @@ The L2 norm of the lexeme's vector representation.
 | `norm_` | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~ |
 | `lower` | Lowercase form of the word. ~~int~~ |
 | `lower_` | Lowercase form of the word. ~~str~~ |
-| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `shape` | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
 | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
 | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
 | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
@@ -120,12 +120,13 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```

 | Name | Description |
-| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
 | _keyword-only_ | |
 | `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~ |
+| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~ |
 | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |

 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

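For orientation, a small sketch of the two keyword arguments documented above (the pattern and text are invented for illustration):

```python
# Sketch: as_spans=True returns labeled Span objects instead of
# (match_id, start, end) tuples; allow_missing=True would additionally skip
# the missing-annotation checks for attributes used in the patterns.
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])
doc = nlp("Hello world, hello spaCy")
spans = matcher(doc, as_spans=True)
print([(span.text, span.label_) for span in spans])  # [('Hello world', 'HELLO_WORLD')]
```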
@@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy
 menu:
   - ['Quickstart', 'quickstart']
   - ['Conventions', 'conventions']
+  - ['Pipeline Design', 'design']
 ---

 <!-- TODO: include interactive demo -->
@@ -53,3 +54,146 @@ For a detailed compatibility overview, see the
 [`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
 This is also the source of spaCy's internal compatibility check, performed when
 you run the [`download`](/api/cli#download) command.
+
+## Pretrained pipeline design {#design}
+
+The spaCy v3 pretrained pipelines are designed to be efficient and configurable.
+For example, multiple components can share a common "token-to-vector" model and
+it's easy to swap out or disable the lemmatizer. The pipelines are designed to
+be efficient in terms of speed and size and work well when the pipeline is run
+in full.
+
+When modifying a pretrained pipeline, it's important to understand how the
+components **depend on** each other. Unlike spaCy v2, where the `tagger`,
+`parser` and `ner` components were all independent, some v3 components depend on
+earlier components in the pipeline. As a result, disabling or reordering
+components can affect the annotation quality or lead to warnings and errors.
+
+Main changes from spaCy v2 models:
+
+- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A
+  component like a tagger or parser can
+  [listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or
+  `transformer` rather than having its own separate tok2vec layer.
+- Rule-based exceptions move from individual components to the
+  `attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions
+  to the attribute ruler and the tag map and morph rules move from the tagger to
+  the attribute ruler.
+- The lemmatizer tables and processing move from the vocab and tagger to a
+  separate `lemmatizer` component.
+
+### CNN/CPU pipeline design
+
+In the `sm`/`md`/`lg` models:
+
+- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec`
+  component.
+- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no
+  `morphologizer`. The `attribute_ruler` additionally makes sure whitespace is
+  tagged consistently and copies `token.pos` to `token.tag` if there is no
+  tagger. For English, the attribute ruler can improve its mapping from
+  `token.tag` to `token.pos` if dependency parses from a `parser` are present,
+  but the parser is not required.
+- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian,
+  Norwegian and Spanish) requires `token.pos` annotation from either
+  `tagger`+`attribute_ruler` or `morphologizer`.
+- The `ner` component is independent with its own internal tok2vec layer.
+
+<!-- TODO: pretty diagram -->
+
+### Transformer pipeline design
+
+In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
+all listen to the `transformer` component. The `attribute_ruler` and
+`lemmatizer` have the same configuration as in the CNN models.
+
+<!-- TODO: pretty diagram -->
+
+### Modifying the default pipeline
+
+For faster processing, you may only want to run a subset of the components in a
+pretrained pipeline. The `disable` and `exclude` arguments to
+[`spacy.load`](/api/top-level#spacy.load) let you control which components are
+loaded and run. Disabled components are loaded in the background so it's
+possible to reenable them in the same pipeline in the future with
+[`nlp.enable_pipe`](/api/language/#enable_pipe). To skip loading a component
+completely, use `exclude` instead of `disable`.
+
+#### Disable part-of-speech tagging and lemmatization
+
+To disable part-of-speech tagging and lemmatization, disable the `tagger`,
+`morphologizer`, `attribute_ruler` and `lemmatizer` components.
+
+```python
+# Note: English doesn't include a morphologizer
+nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
+nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"])
+```
+
+<Infobox variant="warning" title="Rule-based lemmatizers require Token.pos">
+
+The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for
+Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable
+any of these components, you'll see lemmatizer warnings unless the lemmatizer is
+also disabled.
+
+</Infobox>
+
+#### Use senter rather than parser for fast sentence segmentation
+
+If you need fast sentence segmentation without dependency parses, disable the
+`parser` and use the `senter` component instead:
+
+```python
+nlp = spacy.load("en_core_web_sm")
+nlp.disable_pipe("parser")
+nlp.enable_pipe("senter")
+```
+
+The `senter` component is ~10× faster than the parser and more accurate
+than the rule-based `sentencizer`.
+
+#### Switch from rule-based to lookup lemmatization
+
+For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish
+pipelines, you can switch from the default rule-based lemmatizer to a lookup
+lemmatizer:
+
+```python
+# Requirements: pip install spacy-lookups-data
+nlp = spacy.load("en_core_web_sm")
+nlp.remove_pipe("lemmatizer")
+nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
+```
+
+#### Disable everything except NER
+
+For the non-transformer models, the `ner` component is independent, so you can
+disable everything else:
+
+```python
+nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
+```
+
+In the transformer models, `ner` listens to the `transformer` component, so you
+can disable all components related to tagging, parsing, and lemmatization.
+
+```python
+nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
+```
+
+#### Move NER to the end of the pipeline
+
+For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the
+end of the pipeline after `attribute_ruler` and `lemmatizer`:
+
+```python
+# load without NER
+nlp = spacy.load("en_core_web_sm", exclude=["ner"])
+
+# source NER from the same pipeline package as the last component
+nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))
+
+# insert the entity ruler
+nlp.add_pipe("entity_ruler", before="ner")
+```
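As a companion to the design notes above, a short sketch for checking which components actually listen to the shared `tok2vec` in an installed CNN/CPU pipeline; `listening_components` is the same attribute the `Language` and `init_nlp` hunks rely on, and the exact listener list depends on the model version, so treat the printed output as illustrative:

```python
# Sketch: inspect the shared tok2vec's listeners in a CNN/CPU pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")
tok2vec = nlp.get_pipe("tok2vec")
print(getattr(tok2vec, "listening_components", []))  # e.g. ['tagger', 'parser', ...]
```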
@@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
 print('Before', ents)
 # The model didn't recognize "fb" as an entity :(

-fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
+# Create a span for the new entity
+fb_ent = Span(doc, 0, 1, label="ORG")
+
+# Option 1: Modify the provided entity spans, leaving the rest unmodified
+doc.set_ents([fb_ent], default="unmodified")
+
+# Option 2: Assign a complete list of ents to doc.ents
 doc.ents = list(doc.ents) + [fb_ent]

-ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
 print('After', ents)
-# [('fb', 0, 2, 'ORG')] 🎉
+# [('fb', 0, 1, 'ORG')] 🎉
 ```

-Keep in mind that you need to create a `Span` with the start and end index of
-the **token**, not the start and end index of the entity in the document. In
-this case, "fb" is token `(0, 1)` – but at the document level, the entity will
-have the start and end indices `(0, 2)`.
+Keep in mind that `Span` is initialized with the start and end **token**
+indices, not the character offsets. To create a span from character offsets, use
+[`Doc.char_span`](/api/doc#char_span):
+
+```python
+fb_ent = doc.char_span(0, 2, label="ORG")
+```

 #### Setting entity annotations from array {#setting-from-array}

@@ -645,9 +654,10 @@ write efficient native code.

 ```python
 # cython: infer_types=True
+from spacy.typedefs cimport attr_t
 from spacy.tokens.doc cimport Doc

-cpdef set_entity(Doc doc, int start, int end, int ent_type):
+cpdef set_entity(Doc doc, int start, int end, attr_t ent_type):
     for i in range(start, end):
         doc.c[i].ent_type = ent_type
     doc.c[start].ent_iob = 3
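The `attr_t` switch above matters because entity types are stored as 64-bit hash IDs rather than plain ints. A hedged sketch of how such an ID would be obtained from Python before calling into Cython code like `set_entity`; the label is arbitrary:

```python
# Sketch: entity type IDs are StringStore hashes (uint64, matching the
# attr_t typedef imported above).
import spacy

nlp = spacy.blank("en")
doc = nlp("fb is hiring a new VP of global policy")
org_id = doc.vocab.strings.add("ORG")
print(org_id)
```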
@@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."]
 In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a
 (potentially very large) iterable of texts as a stream. Because we're only
 accessing the named entities in `doc.ents` (set by the `ner` component), we'll
-disable all other statistical components (the `tagger` and `parser`) during
-processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and
-access the named entity predictions:
+disable all other components during processing. `nlp.pipe` yields `Doc` objects,
+so we can iterate over them and access the named entity predictions:

 > #### ✏️ Things to try
 >
@@ -73,7 +72,7 @@ texts = [
 ]

 nlp = spacy.load("en_core_web_sm")
-for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
+for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
     # Do something with the doc here
     print([(ent.text, ent.label_) for ent in doc.ents])
 ```
|
||||||
|
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
|
### Multiprocessing {#multiprocessing}
|
||||||
|
|
||||||
|
spaCy includes built-in support for multiprocessing with
|
||||||
|
[`nlp.pipe`](/api/language#pipe) using the `n_process` option:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Multiprocessing with 4 processes
|
||||||
|
docs = nlp.pipe(texts, n_process=4)
|
||||||
|
|
||||||
|
# With as many processes as CPUs (use with caution!)
|
||||||
|
docs = nlp.pipe(texts, n_process=-1)
|
||||||
|
```
|
||||||
|
|
||||||
|
Depending on your platform, starting many processes with multiprocessing can add
|
||||||
|
a lot of overhead. In particular, the default start method `spawn` used in
|
||||||
|
macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models
|
||||||
|
because the model data is copied in memory for each new process. See the
|
||||||
|
[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
|
||||||
|
for further details.
|
||||||
|
|
||||||
|
For shorter tasks and in particular with `spawn`, it can be faster to use a
|
||||||
|
smaller number of processes with a larger batch size. The optimal `batch_size`
|
||||||
|
setting will depend on the pipeline components, the length of your documents,
|
||||||
|
the number of processes and how much memory is available.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Default batch size is `nlp.batch_size` (typically 1000)
|
||||||
|
docs = nlp.pipe(texts, n_process=2, batch_size=2000)
|
||||||
|
```
|
||||||
|
|
||||||
|
<Infobox title="Multiprocessing on GPU" variant="warning">
|
||||||
|
|
||||||
|
Multiprocessing is not generally recommended on GPU because RAM is too limited.
|
||||||
|
If you want to try it out, be aware that it is only possible using `spawn` due
|
||||||
|
to limitations in CUDA.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
<Infobox title="Multiprocessing with transformer models" variant="warning">
|
||||||
|
|
||||||
|
In Linux, transformer models may hang or deadlock with multiprocessing due to an
|
||||||
|
[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One
|
||||||
|
suggested workaround is to use `spawn` instead of `fork` and another is to limit
|
||||||
|
the number of threads before loading any models using
|
||||||
|
`torch.set_num_threads(1)`.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
## Pipelines and built-in components {#pipelines}
|
## Pipelines and built-in components {#pipelines}
|
||||||
|
|
||||||
spaCy makes it very easy to create your own pipelines consisting of reusable
|
spaCy makes it very easy to create your own pipelines consisting of reusable
|
||||||
|
@@ -144,10 +191,12 @@ nlp = spacy.load("en_core_web_sm")
 ```

 ... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
-pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize
-`spacy.lang.en.English`, and create each pipeline component and add it to the
-processing pipeline. It'll then load in the model data from the data directory
-and return the modified `Language` class for you to use as the `nlp` object.
+pipeline
+`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy
+will then initialize `spacy.lang.en.English`, and create each pipeline component
+and add it to the processing pipeline. It'll then load in the model data from
+the data directory and return the modified `Language` class for you to use as
+the `nlp` object.

 <Infobox title="Changed in v3.0" variant="warning">

@@ -171,7 +220,7 @@ the binary data:
 ```python
 ### spacy.load under the hood
 lang = "en"
-pipeline = ["tok2vec", "tagger", "parser", "ner"]
+pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"

 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
@@ -186,7 +235,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the
 components can access it to assign annotations to the `Doc` object, and
 subsequently to the `Token` and `Span` which are only views of the `Doc`, and
 don't own any data themselves. All components return the modified document,
-which is then processed by the component next in the pipeline.
+which is then processed by the next component in the pipeline.

 ```python
 ### The pipeline under the hood
@@ -201,9 +250,9 @@ list of human-readable component names.

 ```python
 print(nlp.pipeline)
-# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>), ('attribute_ruler', <spacy.pipeline.AttributeRuler>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer>)]
 print(nlp.pipe_names)
-# ['tok2vec', 'tagger', 'parser', 'ner']
+# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
 ```

 ### Built-in pipeline components {#built-in}
@@ -300,7 +349,7 @@ blocks.
 ```python
 ### Disable for block
 # 1. Use as a context manager
-with nlp.select_pipes(disable=["tagger", "parser"]):
+with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]):
     doc = nlp("I won't be tagged and parsed")
 doc = nlp("I will be tagged and parsed")

@@ -324,7 +373,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
 argument if you only want to disable components during processing:

 ```python
-for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
+for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]):
     # Do something with the doc here
 ```

@@ -1497,24 +1546,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.

 | Name | Customizes |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `user_hooks` | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
+| `user_hooks` | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents) |
 | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
 | `user_span_hooks` | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root) |

 ```python
 ### Add custom similarity hooks
+from spacy.language import Language
+
+
 class SimilarityModel:
-    def __init__(self, model):
-        self._model = model
+    def __init__(self, name: str, index: int):
+        self.name = name
+        self.index = index

     def __call__(self, doc):
         doc.user_hooks["similarity"] = self.similarity
         doc.user_span_hooks["similarity"] = self.similarity
         doc.user_token_hooks["similarity"] = self.similarity
+        return doc

     def similarity(self, obj1, obj2):
-        y = self._model([obj1.vector, obj2.vector])
-        return float(y[0])
+        return obj1.vector[self.index] + obj2.vector[self.index]
+
+
+@Language.factory("similarity_component", default_config={"index": 0})
+def create_similarity_component(nlp, name, index: int):
+    return SimilarityModel(name, index)
 ```

 ## Developing plugins and wrappers {#plugins}
@@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md'
 When serializing the pipeline, keep in mind that this will only save out the
 **binary data for the individual components** to allow spaCy to restore them –
 not the entire objects. This is a good thing, because it makes serialization
-safe. But it also means that you have to take care of storing the language name
-and pipeline component names as well, and restoring them separately before you
-can load in the data.
+safe. But it also means that you have to take care of storing the config, which
+contains the pipeline configuration and all the relevant settings.

 > #### Saving the meta and config
 >
@@ -33,24 +32,21 @@ can load in the data.

 ```python
 ### Serialize
+config = nlp.config
 bytes_data = nlp.to_bytes()
-lang = nlp.config["nlp"]["lang"]  # "en"
-pipeline = nlp.config["nlp"]["pipeline"]  # ["tagger", "parser", "ner"]
 ```

 ```python
 ### Deserialize
-nlp = spacy.blank(lang)
-for pipe_name in pipeline:
-    nlp.add_pipe(pipe_name)
+lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
+nlp = lang_cls.from_config(config)
 nlp.from_bytes(bytes_data)
 ```

 This is also how spaCy does it under the hood when loading a pipeline: it loads
 the `config.cfg` containing the language and pipeline information, initializes
-the language class, creates and adds the pipeline components based on the
-defined [factories](/usage/processing-pipeline#custom-components-factories) and
-_then_ loads in the binary data. You can read more about this process
+the language class, creates and adds the pipeline components based on the config
+and _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).

 ## Serializing Doc objects efficiently {#docs new="2.2"}

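The same round trip works via the file system, where the config is written out automatically. A brief sketch using the standard disk APIs (the path is arbitrary):

```python
# Sketch: to_disk() saves config.cfg plus the binary component data, so
# spacy.load() can restore the pipeline without any manual config handling.
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.to_disk("/tmp/my_pipeline")
nlp_restored = spacy.load("/tmp/my_pipeline")
```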