Merge pull request #5933 from svlandeg/feature/more-v3-docs [ci skip]

Ines Montani 2020-08-19 11:29:02 +02:00 committed by GitHub
commit 2285e59765
10 changed files with 186 additions and 123 deletions

View File

@@ -216,7 +216,7 @@ list of available editor integrations.

#### Disabling formatting

There are a few cases where auto-formatting doesn't improve readability, for
example, in some of the language data files like the `tag_map.py`, or in
the tests that construct `Doc` objects from lists of words and other labels.
Wrapping a block in `# fmt: off` and `# fmt: on` lets you disable formatting
for that particular code. Here's an example:
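
The example itself is cut off in this diff view. For illustration, a hand-aligned mapping like the following (a made-up snippet, not taken from the actual language data) is the kind of block you'd protect from Black:

```python
# fmt: off
TAG_MAP = {
    "NN":  {"pos": "NOUN"},
    "NNS": {"pos": "NOUN", "number": "plur"},
    "VBZ": {"pos": "VERB", "person": "three"},
}
# fmt: on
```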

View File

@@ -235,7 +235,7 @@ def train_while_improving(
with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
where info is a dict, and is_best_checkpoint is in [True, False, None] --
None indicating that the iteration was not evaluated as a checkpoint.
The evaluation is conducted by calling the evaluate callback.

Positional arguments:
nlp: The spaCy pipeline to evaluate.

View File

@@ -546,7 +546,7 @@ network has an internal CNN Tok2Vec layer and uses attention.

<!-- TODO: model return type -->

| Name | Description |
| -------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
| `width` | Output dimension of the feature encoding step. ~~int~~ |

@@ -555,7 +555,7 @@ network has an internal CNN Tok2Vec layer and uses attention.

| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `dropout` | The dropout rate. ~~float~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.TextCatCNN.v1 {#TextCatCNN}

@@ -586,10 +586,10 @@ architecture is usually less accurate than the ensemble, but runs faster.

<!-- TODO: model return type -->

| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model~~ |

### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -611,11 +611,11 @@ others, but may not be as accurate, especially if texts are short.

<!-- TODO: model return type -->

| Name | Description |
| ------------------- | ----------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model~~ |
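
For context, selecting one of these architectures for a textcat component looks roughly like this in a training config (the parameter values here are illustrative, not prescriptive):

```ini
[components.textcat]
factory = "textcat"

[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
```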
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

View File

@@ -17,7 +17,7 @@ customize the data loading during training, you can register your own
or evaluation data. It takes the same arguments as the `Corpus` class and
returns a callable that yields [`Example`](/api/example) objects. You can
replace it with your own registered function in the
[`@readers` registry](/api/top-level#registry) to customize the data loading and
streaming.

> #### Example config
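>
> The config body is truncated in this diff; the default reader block would look along these lines (the `path` value is an assumption):
>
> ```ini
> [training.train_corpus]
> @readers = "spacy.Corpus.v1"
> path = ${paths:train}
> ```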

View File

@@ -162,7 +162,7 @@ run [`spacy pretrain`](/api/cli#pretrain).

| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
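
Assembled purely from the defaults listed in the table above, a `[pretraining]` block could look like this (a sketch, not a complete section):

```ini
[pretraining]
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${system:seed}
use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
tok2vec_model = "components.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4
```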

View File

@@ -169,7 +169,7 @@ $ python setup.py build_ext --inplace # compile spaCy
Compared to regular install via pip, the
[`requirements.txt`](https://github.com/explosion/spaCy/tree/master/requirements.txt)
additionally installs developer dependencies such as Cython. See the
[quickstart widget](#quickstart) to get the right commands for your platform and
Python version.
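
For reference, the usual from-source sequence is along these lines (consult the quickstart for the exact commands for your platform):

```bash
git clone https://github.com/explosion/spaCy
cd spaCy
pip install -r requirements.txt        # includes developer dependencies like Cython
python setup.py build_ext --inplace    # compile spaCy
```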

View File

@@ -551,9 +551,9 @@ setup(

```python
)
```

After installing the package, the custom colors will be used when visualizing
text with `displacy`. Whenever the label `SNEK` is assigned, it will be
displayed in `#3dff74`.

import DisplaCyEntSnekHtml from 'images/displacy-ent-snek.html'
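
Without packaging an entry point, the same colors can also be passed directly to displacy via the `options` argument (a minimal sketch; the `SNEK` label and color come from the example above, the sentence is ours):

```python
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("I saw a snek")
# Manually set an entity with the custom label for demonstration
doc.ents = [Span(doc, 3, 4, label="SNEK")]
html = displacy.render(doc, style="ent", options={"colors": {"SNEK": "#3dff74"}})
```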

View File

@@ -144,7 +144,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/default_config.cfg
Under the hood, the config is parsed into a dictionary. It's divided into
sections and subsections, indicated by the square brackets and dot notation. For
example, `[training]` is a section and `[training.batch_size]` a subsection.
Subsections can define values, just like a dictionary, or use the `@` syntax to
refer to [registered functions](#config-functions). This allows the config to
not just define static settings, but also construct objects like architectures,
@@ -156,7 +156,7 @@ sections of a config file are:

| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
| `training` | Settings and controls for the training and evaluation process. |
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
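
Putting the section layout together, a heavily trimmed skeleton could look like this (a hypothetical sketch, not a runnable config):

```ini
[paths]
train = "corpus/train.spacy"

[system]
seed = 0

[nlp]
lang = "en"
pipeline = ["textcat"]

[components.textcat]
factory = "textcat"

[training]
seed = ${system:seed}
```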
@@ -514,11 +514,11 @@ language class and `nlp` object at different points of the lifecycle:

| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |

The `@spacy.registry.callbacks` decorator lets you register your custom function
in the `callbacks` [registry](/api/top-level#registry) under a given name. You
can then reference the function in a config block using the `@callbacks` key. If
a block contains a key starting with an `@`, it's interpreted as a reference to
a function. Because you've registered the function, spaCy knows how to create it
when you reference `"customize_language_data"` in your config. Here's an example
of a callback that runs before the `nlp` object is created and adds a few custom
tokenization rules to the defaults:
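
The example itself is truncated in this diff view. A sketch of what such a callback could look like, assuming `before_creation` receives the language class and must return it (the suffix rule is illustrative):

```ini
[nlp.before_creation]
@callbacks = "customize_language_data"
```

```python
### functions.py
import spacy

@spacy.registry.callbacks("customize_language_data")
def create_callback():
    def customize_language_data(lang_cls):
        # Add a custom suffix rule to the defaults before the nlp
        # object is created from the language class
        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + [r"-+$"]
        return lang_cls

    return customize_language_data
```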
@@ -593,9 +593,9 @@ spaCy's configs are powered by our machine learning library Thinc's
using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
function provides type hints, the values that are passed in will be checked
against the expected types. For example, `debug: bool` in the example above will
ensure that the value received as the argument `debug` is a boolean. If the
value can't be coerced into a boolean, spaCy will raise an error.
`debug: pydantic.StrictBool` will force the value to be a boolean and raise an
error if it's not, for instance if your config defines `1` instead of `true`.

</Infobox>
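
As an illustration of the strict type hints described above, a registered function could enforce them like this (the function and registry entry names are hypothetical):

```python
from typing import List

import pydantic
import spacy

@spacy.registry.misc("my_component_setting.v1")  # hypothetical name
def make_setting(debug: pydantic.StrictBool, words: List[str]) -> dict:
    # With StrictBool, the config must use true/false: a value like 1 or 0
    # raises a validation error instead of being silently coerced
    return {"debug": debug, "words": words}
```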
@@ -642,7 +642,9 @@ settings in the block will be passed to the function as keyword arguments. Keep
in mind that the config shouldn't have any hidden defaults and all arguments on
the functions need to be represented in the config. If your function defines
**default argument values**, spaCy is able to auto-fill your config when you run
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
given parameter is always explicitly set in the config, avoid setting a default
value for it.

```ini
### config.cfg (excerpt)
```
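
The excerpt's body is truncated in this diff; judging by the `factor = 1.005` context line in the next hunk, it likely configures a custom schedule. A sketch of a registered schedule function that would fit, with default argument values that `init fill-config` could auto-fill (names and values are our assumptions):

```python
import spacy

@spacy.registry.schedules("my_custom_schedule.v1")
def my_custom_schedule(start: int = 1, factor: float = 1.001):
    # Yield an infinite compounding sequence: start, start * factor, ...
    while True:
        yield start
        start = start * factor
```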
@@ -654,7 +656,68 @@ factor = 1.005
#### Example: Custom data reading and batching {#custom-code-readers-batchers}

Some use cases require streaming in data or manipulating datasets on the fly,
rather than generating all data beforehand and storing it to file. Instead of
using the built-in reader `"spacy.Corpus.v1"`, which uses static file paths, you
can create and register a custom function that generates
[`Example`](/api/example) objects. The resulting generator can be infinite. When
using this dataset for training, stopping criteria such as a maximum number of
steps, or stopping when the loss does not decrease further, can be used.

In this example we assume a custom function `read_custom_data()` which loads or
generates texts with relevant textcat annotations. Then, small lexical
variations of the input text are created before generating the final `Example`
objects.
We can also customize the batching strategy by registering a new "batcher" which
turns a stream of items into a stream of batches. spaCy has several useful
built-in batching strategies with customizable sizes<!-- TODO: link -->, but
it's also easy to implement your own. For instance, the following function takes
the stream of generated `Example` objects, and removes those which have the
exact same underlying raw text, to avoid duplicates within each batch. Note that
in a more realistic implementation, you'd also want to check whether the
annotations are exactly the same.
> ```ini
> [training.train_corpus]
> @readers = "corpus_variants.v1"
>
> [training.batcher]
> @batchers = "filtering_batch.v1"
> size = 150
> ```
```python
### functions.py
from typing import Callable, Iterable, List
import random

import spacy
from spacy.gold import Example
from spacy.language import Language


@spacy.registry.readers("corpus_variants.v1")
def stream_data() -> Callable[[Language], Iterable[Example]]:
    def generate_stream(nlp: Language) -> Iterable[Example]:
        for text, cats in read_custom_data():  # assumed to be defined elsewhere
            # Uppercase one random character to create a lexical variant
            random_index = random.randint(0, len(text) - 1)
            variant = text[:random_index] + text[random_index].upper() + text[random_index + 1:]
            doc = nlp.make_doc(variant)
            example = Example.from_dict(doc, {"cats": cats})
            yield example

    return generate_stream


@spacy.registry.batchers("filtering_batch.v1")
def filter_batch(size: int) -> Callable[[Iterable[Example]], Iterable[List[Example]]]:
    def create_filtered_batches(examples: Iterable[Example]) -> Iterable[List[Example]]:
        batch = []
        for eg in examples:
            # Skip examples whose raw text already occurs in the current batch
            if eg.text not in [x.text for x in batch]:
                batch.append(eg)
            if len(batch) == size:
                yield batch
                batch = []

    return create_filtered_batches
```
### Wrapping PyTorch and TensorFlow {#custom-frameworks}

View File

@@ -60,7 +60,7 @@
"clear": "rm -rf .cache",
"test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
"python:install": "pip install setup/requirements.txt",
"python:setup": "cd setup && sh setup.sh"
},
"devDependencies": {
"@sindresorhus/slugify": "^0.8.0",

View File

@@ -2,7 +2,7 @@
# With additional functionality: in/not in, replace, pprint, round, + for lists,
# rendering empty dicts
# This script is mostly used to generate the JavaScript function for the
# training quickstart widget.
import contextlib
import json
import re