Merge branch 'develop' into feature/prepare

Ines Montani 2020-09-29 20:53:05 +02:00
commit d3c63b7965
39 changed files with 189 additions and 134 deletions

View File

@@ -14,7 +14,7 @@ pathy
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools

View File

@@ -51,7 +51,7 @@ install_requires =
 tqdm>=4.38.0,<5.0.0
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools

View File

@@ -114,6 +114,6 @@ def project_document(
         content = f"{before}{content}{after}"
     else:
         msg.warn("Replacing existing file")
-    with output_file.open("w") as f:
+    with output_file.open("w", encoding="utf8") as f:
         f.write(content)
     msg.good("Saved project documentation", output_file)

View File

@@ -270,6 +270,7 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

View File

@@ -36,6 +36,11 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Apply some simple data augmentation, where we replace tokens with variations.
+# This is especially useful for punctuation and case replacement, to help
+# generalize beyond corpora that don't have smart-quotes, or only have smart
+# quotes, etc.
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

View File

@@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
 from spacy.training.converters import json_to_docs
-from spacy.training.augment import make_orth_variants_example
+from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
@@ -496,9 +496,8 @@ def test_make_orth_variants(doc):
     output_file = tmpdir / "roundtrip.spacy"
     DocBin(docs=[doc]).to_disk(output_file)
     # due to randomness, test only that this runs with no errors for now
-    reader = Corpus(output_file)
-    train_example = next(reader(nlp))
-    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+    reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
+    train_examples = list(reader(nlp))
 
 
 @pytest.mark.skip("Outdated")

View File

@@ -1,6 +1,7 @@
 from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
+from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
 from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401

View File

@@ -1,30 +1,50 @@
+from typing import Callable
 import random
 import itertools
+import copy
+from functools import partial
+from ..util import registry
 
 
-def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
-    raw_text = example.text
-    orig_dict = example.to_dict()
-    variant_text, variant_token_annot = make_orth_variants(
-        nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
-    )
-    doc = nlp.make_doc(variant_text)
-    orig_dict["token_annotation"] = variant_token_annot
-    return example.from_dict(doc, orig_dict)
-
-
-def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
-    if random.random() >= orth_variant_level:
-        return raw_text, orig_token_dict
-    if not orig_token_dict:
-        return raw_text, orig_token_dict
-    raw = raw_text
-    token_dict = orig_token_dict
-    lower = False
-    if random.random() >= 0.5:
-        lower = True
-        if raw is not None:
-            raw = raw.lower()
+@registry.augmenters("spacy.dont_augment.v1")
+def create_null_augmenter():
+    return dont_augment
+
+
+@registry.augmenters("spacy.orth_variants.v1")
+def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+    """Create a data augmentation callback that uses orth-variant replacement.
+    The callback can be added to a corpus or other data iterator during training.
+    """
+    return partial(orth_variants_augmenter, level=level, lower=lower)
+
+
+def dont_augment(nlp, example):
+    yield example
+
+
+def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
+    if random.random() >= level:
+        yield example
+    else:
+        raw_text = example.text
+        orig_dict = example.to_dict()
+        if not orig_dict["token_annotation"]:
+            yield example
+        else:
+            variant_text, variant_token_annot = make_orth_variants(
+                nlp,
+                raw_text,
+                orig_dict["token_annotation"],
+                lower=raw_text is not None and random.random() < lower,
+            )
+            doc = nlp.make_doc(variant_text)
+            orig_dict["token_annotation"] = variant_token_annot
+            yield example.from_dict(doc, orig_dict)
+
+
+def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
+    orig_token_dict = copy.deepcopy(token_dict)
     orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
@@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
         # something went wrong, abort
         # (add a warning message?)
         if not match_found:
-            return raw_text, orig_token_dict
+            return raw, orig_token_dict
         # add following whitespace
         while raw_idx < len(raw) and raw[raw_idx].isspace():
             variant_raw += raw[raw_idx]
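The registered `spacy.orth_variants.v1` factory is easiest to see in context. Below is a minimal usage sketch (not part of the commit) that mirrors the updated test: it builds the augmenter callback and hands it to a `Corpus`. The `./train.spacy` path and the blank English pipeline are illustrative assumptions.

```python
import spacy
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

nlp = spacy.blank("en")
# "./train.spacy" stands in for a DocBin file created elsewhere.
augmenter = create_orth_variants_augmenter(level=0.1, lower=0.5)
corpus = Corpus("./train.spacy", augmenter=augmenter)
# Each stored Doc becomes an Example; with probability `level` the augmenter
# swaps in orth variants (and lowercases with probability `lower`).
train_examples = list(corpus(nlp))
```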

View File

@@ -1,9 +1,11 @@
 import warnings
 from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
+from typing import Optional
 from pathlib import Path
 import srsly
 
 from .. import util
+from .augment import dont_augment
 from .example import Example
 from ..errors import Warnings
 from ..tokens import DocBin, Doc
@@ -18,9 +20,19 @@ FILE_TYPE = ".spacy"
 
 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0
+    path: Path,
+    gold_preproc: bool,
+    max_length: int = 0,
+    limit: int = 0,
+    augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
-    return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
+    return Corpus(
+        path,
+        gold_preproc=gold_preproc,
+        max_length=max_length,
+        limit=limit,
+        augmenter=augmenter,
+    )
 
 
 @util.registry.readers("spacy.JsonlReader.v1")
@@ -70,6 +82,8 @@ class Corpus:
         0, which indicates no limit.
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.
+    augmenter (Callable[Example, Iterable[Example]]): Optional data augmentation
+        function, to extrapolate additional examples from your annotations.
 
     DOCS: https://nightly.spacy.io/api/corpus
     """
@@ -81,11 +95,13 @@ class Corpus:
         limit: int = 0,
         gold_preproc: bool = False,
         max_length: int = 0,
+        augmenter: Optional[Callable] = None,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc
         self.max_length = max_length
         self.limit = limit
+        self.augmenter = augmenter if augmenter is not None else dont_augment
 
     def __call__(self, nlp: "Language") -> Iterator[Example]:
         """Yield examples from the data.
@@ -100,7 +116,9 @@ class Corpus:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
         else:
             examples = self.make_examples(nlp, ref_docs)
-        yield from examples
+        for real_eg in examples:
+            for augmented_eg in self.augmenter(nlp, real_eg):
+                yield augmented_eg
 
     def _make_example(
         self, nlp: "Language", reference: Doc, gold_preproc: bool
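The augmentation hook in `Corpus.__call__` above only assumes that `self.augmenter` is a callable taking `(nlp, example)` and yielding zero or more `Example` objects, with `dont_augment` as the pass-through default. As a hedged illustration of that contract (the function name and filtering rule are made up, not from this commit), a custom callback can be passed straight to `Corpus`:

```python
from spacy.training import Corpus

def skip_short_docs(nlp, example):
    # Toy augmenter: drop gold docs with fewer than 3 tokens and pass
    # everything else through unchanged. An augmenter may yield 0..n examples.
    if len(example.reference) >= 3:
        yield example

# "./train.spacy" is a placeholder path to serialized training docs.
corpus = Corpus("./train.spacy", augmenter=skip_short_docs)
```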

View File

@@ -83,6 +83,7 @@ class registry(thinc.registry):
     callbacks = catalogue.create("spacy", "callbacks")
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
     readers = catalogue.create("spacy", "readers", entry_points=True)
+    augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
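With the new `augmenters` registry in place, project code can expose its own factory and reference it from a training config the same way `spacy.orth_variants.v1` is registered in `augment.py` above. A small sketch under assumed names (the registry entry and function below are hypothetical):

```python
from spacy.util import registry

@registry.augmenters("my_project.noop_augmenter.v1")  # hypothetical name
def create_noop_augmenter():
    def augment(nlp, example):
        # Replace this pass-through with real augmentation logic.
        yield example
    return augment
```

In a config this would then be usable as `augmenter = {"@augmenters": "my_project.noop_augmenter.v1"}` under `[corpora.train]`.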

View File

@@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
-are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
-definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
-pretrained static vectors can also be incorporated into the concatenated
-representation.
+a feed-forward subnetwork to build mixed representations. The features used are
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
+depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
+static vectors can also be incorporated into the concatenated representation.
 
 | Name | Description |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -170,7 +169,7 @@ representation.
 > nC = 8
 > ```
 
-Construct an embedded representations based on character embeddings, using a
+Construct an embedded representation based on character embeddings, using a
 feed-forward network. A fixed number of UTF-8 byte characters are used for each
 word, taken from the beginning and end of the word equally. Padding is used in
 the center for words that are too short.
@@ -392,7 +391,7 @@ a single token vector given zero or more wordpiece vectors.
 > ```
 
 Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
-**not** allow multiple components to share the transformer weights, and does
+**not** allow multiple components to share the transformer weights and does
 **not** allow the transformer to set annotations into the [`Doc`](/api/doc)
 object, but it's a **simpler solution** if you only need the transformer within
 one component.
@@ -437,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
 helpful for background information. The neural network state prediction model
 consists of either two or three subnetworks:
 
-- **tok2vec**: Map each token into a vector representations. This subnetwork is
+- **tok2vec**: Map each token into a vector representation. This subnetwork is
   run once for each batch.
 - **lower**: Construct a feature-specific vector for each `(token, feature)`
   pair. This is also run once for each batch. Constructing the state
@@ -575,14 +574,14 @@ architecture is usually less accurate than the ensemble, but runs faster.
 > nO = null
 > ```
 
-An ngram "bag-of-words" model. This architecture should run much faster than the
-others, but may not be as accurate, especially if texts are short.
+An n-gram "bag-of-words" model. This architecture should run much faster than
+the others, but may not be as accurate, especially if texts are short.
 
 | Name | Description |
 | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
-| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
+| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
+| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
 | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
@@ -596,7 +595,7 @@ into the "real world". This requires 3 main components:
   synonyms and prior probabilities.
 - A candidate generation step to produce a set of likely identifiers, given a
   certain textual mention.
-- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
+- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
   most plausible ID from the set of candidates.
 
 ### spacy.EntityLinker.v1 {#EntityLinker}

View File

@@ -71,7 +71,7 @@ pattern_dicts = [
 
 ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
 
-Apply the attribute ruler to a Doc, setting token attributes for tokens matched
+Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
 by the provided patterns.
 
 | Name | Description |
@@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name | Description |
 | ---------- | -------------------------------------------------------------- |
 | `vocab` | The shared [`Vocab`](/api/vocab). |
-| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
+| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
 | `attrs` | The attributes to set. You usually don't want to exclude this. |
 | `indices` | The token indices. You usually don't want to exclude this. |

View File

@@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent]
 Find all trained pipeline packages installed in the current environment and
 check whether they are compatible with the currently installed version of spaCy.
 Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
-all installed packages are can be used with the new version. It will show a list
+all installed packages can be used with the new version. It will show a list
 of packages and their installed versions. If any package is out of date, the
 latest compatible versions and command for updating are shown.
@@ -408,7 +408,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
 
 ### debug data {#debug-data tag="command"}
 
-Analyze, debug, and validate your training and development data. Get useful
+Analyze, debug and validate your training and development data. Get useful
 stats, and find problems like invalid entity annotations, cyclic dependencies,
 low data labels and more.

View File

@@ -74,6 +74,7 @@ train/test skew.
 | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
 | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
 
 ## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@@ -274,7 +274,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used
 as input format for specifying a [training corpus](/api/corpus) and for spaCy's
 CLI [`train`](/api/cli#train) command. The built-in
 [`convert`](/api/cli#convert) command helps you convert spaCy's previous
-[JSON format](#json-input) to the new binary format format. It also supports
+[JSON format](#json-input) to the new binary format. It also supports
 conversion of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
@@ -338,7 +338,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 
 <Accordion title="Sample JSON data" spaced>
 
-Here's an example of dependencies, part-of-speech tags and names entities, taken
+Here's an example of dependencies, part-of-speech tags and named entities, taken
 from the English Wall Street Journal portion of the Penn Treebank:
 
 ```json

View File

@@ -21,8 +21,7 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights such that the scores assigned to the set of optimal
-actions is increased, while scores assigned to other actions are decreased. Note
+current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
 that more than one action may be optimal for a given state.
 
 ## Config and implementation {#config}

View File

@@ -503,8 +503,7 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
-dictionary mapping attribute names to values as the `"_"` key.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
 
 > #### Example
 >

View File

@@ -94,7 +94,7 @@ providing custom registered functions.
 
 ## EntityLinker.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)

View File

@@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
 
 | Setting | Description |
 | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] |
+| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
@@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and
 
 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entityrecognizer#call) and

View File

@@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
 | Name | Description |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
-| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
+| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
 | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@@ -33,8 +33,8 @@ both documents.
 | Name | Description |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
-| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ |
+| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
+| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ |
 | _keyword-only_ | |
 | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
@@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input).
 | Name | Description |
 | -------------- | ------------------------------------------------------------------------- |
-| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
-| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
+| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
+| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
 | **RETURNS** | The newly constructed object. ~~Example~~ |
 
 ## Example.text {#text tag="property"}

View File

@@ -46,10 +46,11 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
 ## Language.from_config {#from_config tag="classmethod" new="3"}
 
 Create a `Language` object from a loaded config. Will set up the tokenizer and
-language data, add pipeline components based on the pipeline and components
-define in the config and validate the results. If no config is provided, the
-default config of the given language is used. This is also how spaCy loads a
-model under the hood based on its [`config.cfg`](/api/data-formats#config).
+language data, add pipeline components based on the pipeline and add pipeline
+components based on the definitions specified in the config. If no config is
+provided, the default config of the given language is used. This is also how
+spaCy loads a model under the hood based on its
+[`config.cfg`](/api/data-formats#config).
 
 > #### Example
 >
@@ -107,7 +108,7 @@ decorator. For more details and examples, see the
 | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
+| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
 
 ## Language.factory {#factory tag="classmethod"}
@@ -154,7 +155,7 @@ examples, see the
 | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
 | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
-| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
+| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
 
 ## Language.\_\_call\_\_ {#call tag="method"}
@@ -609,7 +610,7 @@ does nothing.
 
 ## Language.enable_pipe {#enable_pipe tag="method" new="3"}
 
-Enable a previously disable component (e.g. via
+Enable a previously disabled component (e.g. via
 [`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
 the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
 already enabled, this method does nothing.
@@ -636,7 +637,7 @@ pipeline will be restored to the initial state at the end of the block.
 Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
 you can use to undo your changes. You can specify either `disable` (as a list or
 string), or `enable`. In the latter case, all components not in the `enable`
-list, will be disabled. Under the hood, this method calls into
+list will be disabled. Under the hood, this method calls into
 [`disable_pipe`](/api/language#disable_pipe) and
 [`enable_pipe`](/api/language#enable_pipe).
@@ -669,7 +670,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
 | -------------- | ------------------------------------------------------------------------------------------------------ |
 | _keyword-only_ | |
 | `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
-| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
+| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
 | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
 
 ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@@ -881,10 +882,10 @@ Loads state from a directory, including all data that was saved with the
 
 <Infobox variant="warning" title="Important note">
 
-Keep in mind that this method **only loads serialized state** and doesn't set up
-the `nlp` object. This means that it requires the correct language class to be
-initialized and all pipeline components to be added to the pipeline. If you want
-to load a serialized pipeline from a directory, you should use
+Keep in mind that this method **only loads the serialized state** and doesn't
+set up the `nlp` object. This means that it requires the correct language class
+to be initialized and all pipeline components to be added to the pipeline. If
+you want to load a serialized pipeline from a directory, you should use
 [`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
 
 </Infobox>

View File

@@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). For examples of the lookups
-data formats used by the lookup and rule-based lemmatizers, see
+data format used by the lookup and rule-based lemmatizers, see
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
 
 > #### Example

View File

@@ -61,7 +61,7 @@ matched:
 | `!` | Negate the pattern, by requiring it to match exactly 0 times. |
 | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
 | `+` | Require the pattern to match 1 or more times. |
-| `*` | Allow the pattern to match zero or more times. |
+| `*` | Allow the pattern to match 0 or more times. |
 
 Token patterns can also map to a **dictionary of properties** instead of a
 single value to indicate whether the expected value is a member of a list or how

View File

@@ -12,7 +12,7 @@ container storing a single morphological analysis.
 
 ## Morphology.\_\_init\_\_ {#init tag="method"}
 
-Create a Morphology object.
+Create a `Morphology` object.
 
 > #### Example
 >
@@ -101,7 +101,7 @@ representation.
 | Name | Description |
 | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
-| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
+| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
 
 ## Attributes {#attributes}

View File

@@ -200,7 +200,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 
 ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example

View File

@@ -8,7 +8,7 @@ api_string_name: sentencizer
 api_trainable: false
 ---
 
-A simple pipeline component, to allow custom sentence boundary detection logic
+A simple pipeline component to allow custom sentence boundary detection logic
 that doesn't require the dependency parse. By default, sentence segmentation is
 performed by the [`DependencyParser`](/api/dependencyparser), so the
 `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
@@ -130,7 +130,7 @@ Score a batch of examples.
 
 ## Sentencizer.to_disk {#to_disk tag="method"}
 
-Save the sentencizer settings (punctuation characters) a directory. Will create
+Save the sentencizer settings (punctuation characters) to a directory. Will create
 a file `sentencizer.json`. This also happens automatically when you save an
 `nlp` object with a sentencizer added to its pipeline.

View File

@@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object.
 
 ## Span.\_\_init\_\_ {#init tag="method"}
 
-Create a Span object from the slice `doc[start : end]`.
+Create a `Span` object from the slice `doc[start : end]`.
 
 > #### Example
 >
@@ -187,7 +187,7 @@ the character indices don't map to a valid span.
 | Name | Description |
 | ------------------------------------ | ----------------------------------------------------------------------------------------- |
 | `start` | The index of the first character of the span. ~~int~~ |
-| `end` | The index of the last character after the span. ~int~~ |
+| `end` | The index of the last character after the span. ~~int~~ |
 | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
 | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |

View File

@@ -157,7 +157,7 @@ This method was previously called `begin_training`.
 
 ## TextCategorizer.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -174,7 +174,7 @@ modifying them.
 
 ## TextCategorizer.set_annotations {#set_annotations tag="method"}
 
-Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
+Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
 
 > #### Example
 >
@@ -217,7 +217,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 
 ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example
@@ -290,7 +290,7 @@ Create an optimizer for the pipeline component.
 
 ## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model to use the given parameter values.
 
 > #### Example
 >

View File

@@ -150,7 +150,7 @@ by [`Language.initialize`](/api/language#initialize).
 
 ## Tok2Vec.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -223,7 +223,7 @@ Create an optimizer for the pipeline component.
 
 ## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values. At the end of the
+Modify the pipe's model to use the given parameter values. At the end of the
 context, the original parameters are restored.
 
 > #### Example

View File

@@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children.
 
 ## Token.lefts {#lefts tag="property" model="parser"}
 
-The leftward immediate children of the word, in the syntactic dependency parse.
+The leftward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.rights {#rights tag="property" model="parser"}
 
-The rightward immediate children of the word, in the syntactic dependency parse.
+The rightward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.n_lefts {#n_lefts tag="property" model="parser"}
 
-The number of leftward immediate children of the word, in the syntactic
+The number of leftward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -291,7 +291,7 @@ dependency parse.
 
 ## Token.n_rights {#n_rights tag="property" model="parser"}
 
-The number of rightward immediate children of the word, in the syntactic
+The number of rightward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -422,8 +422,8 @@ The L2 norm of the token's vector representation.
 | `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
 | `lower` | Lowercase form of the token. ~~int~~ |
 | `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
-| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
 | `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
 | `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
 | `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
@@ -451,7 +451,7 @@ The L2 norm of the token's vector representation.
 | `tag` | Fine-grained part-of-speech. ~~int~~ |
 | `tag_` | Fine-grained part-of-speech. ~~str~~ |
 | `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
-| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ |
+| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
 | `dep` | Syntactic dependency relation. ~~int~~ |
 | `dep_` | Syntactic dependency relation. ~~str~~ |
 | `lang` | Language of the parent document's vocabulary. ~~int~~ |

View File

@@ -1,6 +1,6 @@
 ---
 title: Tokenizer
-teaser: Segment text into words, punctuations marks etc.
+teaser: Segment text into words, punctuations marks, etc.
 tag: class
 source: spacy/tokenizer.pyx
 ---
@@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx
 Segment text, and create `Doc` objects with the discovered segment boundaries.
 For a deeper understanding, see the docs on
 [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
-The tokenizer is typically created automatically when the a
+The tokenizer is typically created automatically when a
 [`Language`](/api/language) subclass is initialized and it reads its settings
 like punctuation and special case rules from the
 [`Language.Defaults`](/api/language#defaults) provided by the language subclass.
 
 ## Tokenizer.\_\_init\_\_ {#init tag="method"}
 
-Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
+Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
 of how to construct a custom tokenizer with different tokenization rules, see
 the
 [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
@@ -87,7 +87,7 @@ Tokenize a stream of texts.
 | ------------ | ------------------------------------------------------------------------------------ |
 | `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
 | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
-| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
+| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
## Tokenizer.find_infix {#find_infix tag="method"} ## Tokenizer.find_infix {#find_infix tag="method"}

View File

@ -198,7 +198,7 @@ browser. Will run a simple web server.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | | `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | | `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
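
A minimal sketch of serving a visualization on a non-default port and host, assuming the `en_core_web_sm` package is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence about Google.")
# Starts a blocking web server at http://localhost:5001
displacy.serve(doc, style="ent", port=5001, host="localhost")
```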
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ | | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ | | **RETURNS** | The rendered HTML markup. ~~str~~ |
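
With `manual=True`, pre-computed data can be rendered without a `Doc`. A sketch with hand-written entity offsets:

```python
from spacy import displacy

ex = {
    "text": "But Google is starting from behind.",
    "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    "title": None,
}
html = displacy.render(ex, style="ent", manual=True, page=True, minify=True)
```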
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
| Name | Description | | Name | Description |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ | | `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
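
Combining a few of these settings might look like the following (the `doc` is assumed to come from a loaded pipeline):

```python
options = {
    "fine_grained": True,    # use Token.tag_ instead of Token.pos_
    "add_lemma": True,       # extra row with lemmas below the token texts
    "collapse_punct": False,
    "compact": True,
}
html = displacy.render(doc, style="dep", options=options)
```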
@ -623,7 +623,7 @@ sequences in the batch.
Encode labelled spans into per-token tags, using the Encode labelled spans into per-token tags, using the
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
Out). Returns a list of strings, describing the tags. Each tag string will be of Out). Returns a list of strings, describing the tags. Each tag string will be in
the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of
`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets `"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets
don't align with the tokenization in the `Doc` object. The training algorithm don't align with the tokenization in the `Doc` object. The training algorithm
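
A rough sketch, assuming the helper is exposed as `offsets_to_biluo_tags` (older builds provide the same behavior as `biluo_tags_from_offsets`):

```python
from spacy.lang.en import English
from spacy.training import offsets_to_biluo_tags  # biluo_tags_from_offsets in older versions

nlp = English()
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]
tags = offsets_to_biluo_tags(doc, entities)
# ['O', 'O', 'U-LOC', 'O']
```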
@ -747,7 +747,7 @@ decorator.
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
Check whether a `Language` subclass is already loaded. `Language` subclasses are Check whether a `Language` subclass is already loaded. `Language` subclasses are
loaded lazily, to avoid expensive setup code associated with the language data. loaded lazily to avoid expensive setup code associated with the language data.
> #### Example > #### Example
> >
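
For instance (the second assertion assumes German data hasn't already been loaded in the session):

```python
from spacy import util

lang_cls = util.get_lang_class("en")
assert util.lang_class_is_loaded("en") is True
assert util.lang_class_is_loaded("de") is False  # assumes "de" hasn't been loaded yet
```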
@ -935,7 +935,7 @@ Compile a sequence of prefix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
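
Something along these lines rebuilds the default prefix rules into a regex and plugs it back into the tokenizer:

```python
from spacy.lang.en import English
from spacy.util import compile_prefix_regex

nlp = English()
prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search
```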
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -952,7 +952,7 @@ Compile a sequence of suffix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"} ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -969,7 +969,7 @@ Compile a sequence of infix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
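
The infix case is analogous, except that the compiled regex is used via `finditer` rather than `search`:

```python
from spacy.lang.en import English
from spacy.util import compile_infix_regex

nlp = English()
infix_regex = compile_infix_regex(nlp.Defaults.infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer
```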
### util.minibatch {#util.minibatch tag="function" new="2"} ### util.minibatch {#util.minibatch tag="function" new="2"}

View File

@ -185,7 +185,7 @@ by [`Language.initialize`](/api/language#initialize).
## Transformer.predict {#predict tag="method"} ## Transformer.predict {#predict tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them. modifying them.
> #### Example > #### Example
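
A minimal sketch, assuming `spacy-transformers` and a transformer pipeline such as `en_core_web_trf` are installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
transformer = nlp.get_pipe("transformer")
docs = [nlp.make_doc("Some text"), nlp.make_doc("Other text")]
scores = transformer.predict(docs)   # the docs themselves are left unmodified
```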
@ -202,7 +202,7 @@ modifying them.
## Transformer.set_annotations {#set_annotations tag="method"} ## Transformer.set_annotations {#set_annotations tag="method"}
Assign the extracted features to the Doc objects. By default, the Assign the extracted features to the `Doc` objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the [`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` [`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
callback is then called, if provided. callback is then called, if provided.
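
The typical pattern pairs `predict` with `set_annotations`, roughly (again assuming an installed transformer pipeline):

```python
import spacy

nlp = spacy.load("en_core_web_trf")
transformer = nlp.get_pipe("transformer")
docs = [nlp.make_doc("Some text"), nlp.make_doc("Other text")]
scores = transformer.predict(docs)
transformer.set_annotations(docs, scores)  # writes to Doc._.trf_data by default
```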
@ -271,7 +271,7 @@ Create an optimizer for the pipeline component.
## Transformer.use_params {#use_params tag="method, contextmanager"} ## Transformer.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the Modify the pipe's model to use the given parameter values. At the end of the
context, the original parameters are restored. context, the original parameters are restored.
> #### Example > #### Example
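
A sketch of the usual pattern, assuming an `nlp` pipeline with a transformer component and an `optimizer` produced during training:

```python
transformer = nlp.get_pipe("transformer")
with transformer.use_params(optimizer.averages):
    transformer.to_disk("/best_model")
# the original parameter values are restored when the context exits
```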
@ -387,8 +387,8 @@ by this class. Instances of this class are typically assigned to the
| Name | Description | | Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | | `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ | | `width` | The width of the last hidden layer. ~~int~~ |
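
In a loaded transformer pipeline (`en_core_web_trf` is used here only as an example), these fields can be inspected via the default `Doc._.trf_data` attribute; the exact keys available in `tokens` depend on the underlying tokenizer:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying a U.K. startup.")
trf_data = doc._.trf_data
print(trf_data.tokens["input_ids"])   # wordpiece IDs (field names vary by tokenizer)
print(trf_data.tensors[-1].shape)     # final hidden state
print(trf_data.align[0].dataXd)       # wordpiece indices aligned to the first token
print(trf_data.width)
```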
@ -408,7 +408,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
| Name | Description | | Name | Description |
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | | `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
@ -438,10 +438,10 @@ Split a `TransformerData` object that represents a batch into a list with one
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a list of [`Span`](/api/span) objects for each doc, to be processed by return a list of [`Span`](/api/span) objects for each doc to be processed by
the transformer. This is used to manage long documents, by cutting them into the transformer. This is used to manage long documents by cutting them into
smaller sequences before running the transformer. The spans are allowed to smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not relevant. overlap, and you can also omit sections of the `Doc` if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]` Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You block of the config to customize the sequences processed by the transformer. You
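
A sketch of a custom span getter that hands the transformer one sentence at a time (registered under a made-up name):

```python
import spacy

@spacy.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    def get_sent_spans(docs):
        # one list of sentence spans per Doc in the batch
        return [list(doc.sents) for doc in docs]
    return get_sent_spans
```

The registered name can then be referenced from the `[components.transformer.model.get_spans]` block of the config.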

View File

@ -290,7 +290,7 @@ If a table is full, it can be resized using
## Vectors.n_keys {#n_keys tag="property"} ## Vectors.n_keys {#n_keys tag="property"}
Get the number of keys in the table. Note that this is the number of _all_ keys, Get the number of keys in the table. Note that this is the number of _all_ keys,
not just unique vectors. If several keys are mapped are mapped to the same not just unique vectors. If several keys are mapped to the same
vectors, they will be counted individually. vectors, they will be counted individually.
> #### Example > #### Example
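
For instance, a freshly created table has rows but no keys mapped to them yet:

```python
from spacy.vectors import Vectors

vectors = Vectors(shape=(10, 300))
assert len(vectors) == 10     # number of rows in the table
assert vectors.n_keys == 0    # no keys mapped yet
```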
@ -307,10 +307,10 @@ vectors, they will be counted individually.
## Vectors.most_similar {#most_similar tag="method"} ## Vectors.most_similar {#most_similar tag="method"}
For each of the given vectors, find the `n` most similar entries to it, by For each of the given vectors, find the `n` most similar entries to it by
cosine. Queries are by vector. Results are returned as a cosine. Queries are by vector. Results are returned as a
`(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
performed in chunks, to avoid consuming too much memory. You can set the performed in chunks to avoid consuming too much memory. You can set the
`batch_size` to control the size/space trade-off during the calculations. `batch_size` to control the size/space trade-off during the calculations.
> #### Example > #### Example
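
A rough sketch, assuming `nlp.vocab.vectors` holds 300-dimensional vectors:

```python
import numpy

queries = numpy.asarray([numpy.random.uniform(-1, 1, (300,))], dtype="f")
keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=10, batch_size=1024)
```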

View File

@ -29,7 +29,7 @@ Create the vocabulary.
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ | | `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"} ## Vocab.\_\_len\_\_ {#len tag="method"}
@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words
would then be remapped to the closest remaining vector so "feline" would have would then be remapped to the closest remaining vector so "feline" would have
the same vector as "cat", and "reclined" would have the same vector as "sat". the same vector as "cat", and "reclined" would have the same vector as "sat".
The similarities are judged by cosine. The original vectors may be large, so the The similarities are judged by cosine. The original vectors may be large, so the
cosines are calculated in minibatches, to reduce memory usage. cosines are calculated in minibatches to reduce memory usage.
> #### Example > #### Example
> >
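
A sketch, assuming a pipeline with a vectors table (e.g. a `*_md` package) is loaded as `nlp`:

```python
remapped = nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) <= 10000
# remapped maps each removed word to its (surviving word, similarity) pair
```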
@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage.
Retrieve a vector for a word in the vocabulary. Words can be looked up by string Retrieve a vector for a word in the vocabulary. Words can be looked up by string
or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`).
> #### Example > #### Example
> >
@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
| Name | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | | `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | | `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
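
For example, assuming a pipeline with vectors is loaded as `nlp` (otherwise a `ValueError` is raised):

```python
apple_vector = nlp.vocab.get_vector("apple")
# FastText-style subword approximation from character n-grams (spaCy v2.1+)
subword_vector = nlp.vocab.get_vector("apple", minn=1, maxn=5)
```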
## Vocab.set_vector {#set_vector tag="method" new="2"} ## Vocab.set_vector {#set_vector tag="method" new="2"}
Set a vector for a word in the vocabulary. Words can be referenced by by string Set a vector for a word in the vocabulary. Words can be referenced by string
or hash value. or hash value.
> #### Example > #### Example
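
For instance, a sketch that assigns a random 300-dimensional vector (purely illustrative) to a word in an existing `nlp` pipeline:

```python
import numpy

nlp.vocab.set_vector("cat", numpy.random.uniform(-1, 1, (300,)))
```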

View File

@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_.
If you have a list of terms with no context around them, a transformer model If you have a list of terms with no context around them, a transformer model
like BERT can't really help you. BERT is designed to understand language **in like BERT can't really help you. BERT is designed to understand language **in
context**, which isn't what you have. A word vectors table will be a much better context**, which isn't what you have. A word vectors table will be a much better
fit for your task. However, if you do have words in context (whole sentences or fit for your task. However, if you do have words in context (whole sentences or
paragraphs of running text), word vectors will only provide a very rough paragraphs of running text), word vectors will only provide a very rough
approximation of what the text is about. approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a Word vectors are also very computationally efficient, as they map a word to a
@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]):
``` ```
You can also customize how the [`Transformer`](/api/transformer) component sets You can also customize how the [`Transformer`](/api/transformer) component sets
annotations onto the [`Doc`](/api/doc), by specifying a custom annotations onto the [`Doc`](/api/doc) by specifying a custom
`set_extra_annotations` function. This callback will be called with the raw `set_extra_annotations` function. This callback will be called with the raw
input and output data for the whole batch, along with the batch of `Doc` input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need. The annotation setter is objects, allowing you to implement whatever you need. The annotation setter is
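
A sketch of such a callback, writing the per-`Doc` data to a made-up custom attribute instead of the default one (assumes a transformer pipeline such as `en_core_web_trf` is installed):

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("custom_trf_output", default=None)

def custom_annotation_setter(docs, trf_data):
    # trf_data is the FullTransformerBatch for the whole batch
    doc_data = list(trf_data.doc_data)
    for doc, data in zip(docs, doc_data):
        doc._.custom_trf_output = data

nlp = spacy.load("en_core_web_trf")
nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter
doc = nlp("This is a text")
print(doc._.custom_trf_output)
```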
@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a subnetwork** within one of your components, and add additional layers to build a
network for a temporary task, that forces the model to learn something about network for a temporary task that forces the model to learn something about
sentence structure and word cooccurrence statistics. Pretraining produces a sentence structure and word cooccurrence statistics. Pretraining produces a
**binary weights file** that can be loaded back in at the start of training. The **binary weights file** that can be loaded back in at the start of training. The
weights file specifies an initial set of weights. Training then proceeds as weights file specifies an initial set of weights. Training then proceeds as

View File

@ -6,6 +6,7 @@ menu:
- ['Introduction', 'basics'] - ['Introduction', 'basics']
- ['Quickstart', 'quickstart'] - ['Quickstart', 'quickstart']
- ['Config System', 'config'] - ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] -->
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
- ['Parallel Training', 'parallel-training'] - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
@ -505,6 +506,16 @@ still look good.
</Accordion> </Accordion>
<!--
## Data Utilities {#data-utilities}
* spacy convert
* The [corpora] block
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Custom Functions {#custom-functions} ## Custom Functions {#custom-functions}
Registered functions in the training config files can refer to built-in Registered functions in the training config files can refer to built-in
@ -689,7 +700,7 @@ from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1") @spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path): def custom_logger(log_path):
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]: def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
with Path(log_path).open("w") as file_: with Path(log_path).open("w", encoding="utf8") as file_:
file_.write("step\\t") file_.write("step\\t")
file_.write("score\\t") file_.write("score\\t")
for pipe in nlp.pipe_names: for pipe in nlp.pipe_names:

View File

@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
data_str = f"export const DATA = {data}" data_str = f"export const DATA = {data}"
result = compiler.get_output() result = compiler.get_output()
if output is not None: if output is not None:
with output.open("w") as f: with output.open("w", encoding="utf8") as f:
f.write(f"{header}\n{result}\n{data_str}") f.write(f"{header}\n{result}\n{data_str}")
print(f"Updated {output.parts[-1]}") print(f"Updated {output.parts[-1]}")
else: else: