Mirror of https://github.com/explosion/spaCy.git

Commit 9c23386735: Merge branch 'develop' into nightly.spacy.io
@@ -119,9 +119,6 @@ class Warnings:
    W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
            "need to match on a stream of documents, you can use nlp.pipe and "
            "call the {matcher} on each Doc object.")
    W106 = ("Both HEAD and SENT_START are included as attributes in "
            "doc.from_array(). The parse trees based on the HEAD attribute "
            "will override the values in SENT_START.")
    W107 = ("The property Doc.{prop} is deprecated. Use "
            "Doc.has_annotation(\"{attr}\") instead.")
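For context, the replacements these warnings point users to look roughly like this (a minimal sketch, not part of the diff; the blank English pipeline, the `GREETING` pattern and the example texts are placeholder assumptions):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("GREETING", [[{"LOWER": "hello"}]])

texts = ["Hello there.", "General Kenobi."]
# W105: instead of matcher.pipe(docs), call the matcher on each Doc from nlp.pipe
for doc in nlp.pipe(texts):
    matches = matcher(doc)
    # W107: query annotation state directly instead of the deprecated Doc.is_* properties
    print(matches, doc.has_annotation("SENT_START"))
```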
@@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab):
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)

    # HEAD overrides SENT_START with warning
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    with pytest.warns(UserWarning):
        new_doc.from_array(attrs, arr)
    new_doc.from_array(attrs, arr)

    # no warning using default attrs
    attrs = doc._get_array_attrs()
@@ -817,8 +817,6 @@ cdef class Doc:
        if array.dtype != numpy.uint64:
            warnings.warn(Warnings.W028.format(type=array.dtype))

        if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
            warnings.warn(Warnings.W106)
        cdef int i, col
        cdef int32_t abs_head_index
        cdef attr_id_t attr_id
@@ -161,14 +161,14 @@ validation error with more details.
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
```

| Name                   | Description                                                                                                                          |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~             |
| `output_file`          | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~   |
| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                      |
| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                          |
| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                             |
| **CREATES**            | Complete and auto-filled config file for training.                                                                                    |
### init vocab {#init-vocab new="3" tag="command"}
@@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using

<Accordion title="What’s the difference between word vectors and language models?" id="vectors-vs-language-models">

The key difference between [word vectors](#word-vectors) and contextual language
models such as [transformers](#transformers) is that word vectors model
**lexical types**, rather than _tokens_. If you have a list of terms with no
context around them, a transformer model like BERT can't really help you. BERT
is designed to understand language **in context**, which isn't what you have. A
word vectors table will be a much better fit for your task. However, if you do
have words in context — whole sentences or paragraphs of running text — word
vectors will only provide a very rough approximation of what the text is about.
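A small sketch of the "lexical types, not tokens" point (assuming a pipeline that ships a static vectors table, such as `en_core_web_md`, is installed; the sentences are placeholders): the same word gets the same static vector regardless of its context.

```python
import numpy
import spacy

nlp = spacy.load("en_core_web_md")  # any pipeline with a word vectors table
doc1 = nlp("I sat on the bank of the river.")
doc2 = nlp("I deposited money at the bank.")
# Static vectors are looked up per lexical type, so "bank" is identical in both.
print(numpy.allclose(doc1[4].vector, doc2[5].vector))  # True
```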
[Transformers](#transformers) are large and powerful neural networks that give
you better accuracy, but are harder to deploy in production, as they require a
GPU to run effectively. [Word vectors](#word-vectors) are a slightly older
technique that can give your models a smaller improvement in accuracy, and can
also provide some additional capabilities.
Word vectors are also very computationally efficient, as they map a word to a
vector with a single indexing operation. Word vectors are therefore useful as a
@@ -478,7 +484,32 @@ training.

## Static vectors {#static-vectors}

<!-- TODO: write -->
If your pipeline includes a **word vectors table**, you'll be able to use the
`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span),
[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able
to access the vectors using the `.vector` attribute, or you can look up one or
more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with
word vectors can also **use the vectors as features** for the statistical
models, which can **improve the accuracy** of your components.
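A minimal sketch of these APIs (assuming a pipeline with vectors such as `en_core_web_md`; the example sentences are placeholders):

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

print(doc1.similarity(doc2))          # Doc-to-Doc similarity
print(doc1[2:4].similarity(doc1[5]))  # Span-to-Token similarity
print(doc1[3].vector.shape)           # per-token vector via the .vector attribute
print(nlp.vocab["fries"].vector[:5])  # look up a vector directly on the Vocab
```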
Word vectors in spaCy are "static" in the sense that they are not learned
parameters of the statistical models, and spaCy itself does not feature any
algorithms for learning word vector tables. You can train a word vectors table
using tools such as [Gensim](https://radimrehurek.com/gensim/),
[FastText](https://fasttext.cc/) or
[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing
pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you
convert vectors for use with spaCy and will give you a directory you can load or
refer to in your [training configs](/usage/training#config).
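As an alternative to converting a pretrained table on the command line, vectors can also be added to a vocabulary programmatically. A minimal sketch (the words, the random vectors and the output path are placeholder assumptions, not a real embedding table):

```python
import numpy
import spacy

nlp = spacy.blank("en")
vector_size = 50
for word in ["dog", "cat", "banana"]:
    # Vocab.set_vector adds or overwrites the vector for a single lexical type
    vector = numpy.random.uniform(-1, 1, (vector_size,)).astype("float32")
    nlp.vocab.set_vector(word, vector)
nlp.to_disk("./pipeline_with_vectors")  # a directory you can point your config to
```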
<Infobox title="Word vectors and similarity" emoji="📖">

For more details on loading word vectors into spaCy, using them for similarity
and improving word vector coverage by truncating and pruning the vectors, see
the usage guide on
[word vectors and similarity](/usage/linguistic-features#vectors-similarity).

</Infobox>

### Using word vectors in your models {#word-vectors-models}
@@ -579,33 +610,141 @@ def MyCustomVectors(

## Pretraining {#pretraining}

<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your
models with **information from raw text**. Without pretraining, the models for
your components will usually be initialized randomly. The idea behind
pretraining is simple: random probably isn't optimal, so if we have some text to
learn from, we can probably find a way to get the model off to a better start.

<!--
- explain general concept and idea (short!)
- present it as a separate lightweight mechanism for pretraining the tok2vec
  layer
- advantages (could also be pros/cons table)
- explain how it generates a separate file (!) and how it depends on the same
  vectors
-->
Pretraining uses the same [`config.cfg`](/usage/training#config) file as the
regular training, which helps keep the settings and hyperparameters consistent.
The additional `[pretraining]` section has several configuration subsections
that are familiar from the training block: the `[pretraining.batcher]`,
`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
expect the same types of objects, although for pretraining your corpus does not
need to have any annotations, so you will often use a different reader, such as
the [`JsonlReader`](/api/toplevel#jsonlreader).
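One way to produce such an annotation-free corpus from Python is to write a JSONL file with [`srsly`](https://github.com/explosion/srsly), spaCy's serialization helper (a sketch; the file name and texts are placeholders):

```python
import srsly

texts = [
    {"text": "Can I ask where you work now and what you do, and if you enjoy it?"},
    {"text": "They may just pull out of the Seattle market completely."},
]
# Writes one JSON object per line, the format expected by the JsonlReader
srsly.write_jsonl("raw_text.jsonl", texts)
```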
> #### Raw text format
>
> The raw text can be provided in spaCy's
> [binary `.spacy` format](/api/data-formats#training) consisting of serialized
> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per
> entry. This allows the data to be read in line by line, while also allowing
> you to include newlines in the texts.
>
> ```json
> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
> ```
>
> You can also use your own custom corpus loader instead.
You can add a `[pretraining]` block to your config by setting the
`--pretraining` flag on [`init config`](/api/cli#init-config) or
[`init fill-config`](/api/cli#init-fill-config):

```cli
$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining
```
You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config
and pass in optional config overrides, like the path to the raw text file:

```cli
$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg
$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
```
### How pretraining works {#pretraining-details}

The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually
be worth trying if you're **not using a transformer** model and you have
**relatively little training data** (for instance, fewer than 5,000 sentences).
A good rule of thumb is that pretraining will generally give you a similar
accuracy improvement to using word vectors in your model. If word vectors have
given you a 10% error reduction, pretraining with spaCy might give you another
10%, for a 20% error reduction in total.
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a
network for a temporary task that forces the model to learn something about
sentence structure and word cooccurrence statistics. Pretraining produces a
**binary weights file** that can be loaded back in at the start of training. The
weights file specifies an initial set of weights. Training then proceeds as
normal.
You can only pretrain one subnetwork from your pipeline at a time, and the
subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be
a "tok2vec" layer). The most common workflow is to use the
[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for
several components of your pipeline, and apply pretraining to its whole model.
#### Configuring the pretraining {#pretraining-configure}

The [`spacy pretrain`](/api/cli#pretrain) command is configured using the
`[pretraining]` section of your [config file](/usage/training#config). The
`component` and `layer` settings tell spaCy how to **find the subnetwork** to
pretrain. The `layer` setting should be either the empty string (to use the
whole model), or a
[node reference](https://thinc.ai/docs/usage-models#model-state). Most of
spaCy's built-in model architectures have a reference named `"tok2vec"` that
will refer to the right layer.
```ini
### config.cfg
# 1. Use the whole model of the "tok2vec" component
[pretraining]
component = "tok2vec"
layer = ""

# 2. Pretrain the "tok2vec" node of the "textcat" component
[pretraining]
component = "textcat"
layer = "tok2vec"
```
#### Pretraining objectives {#pretraining-objectives}

Two pretraining objectives are available, both of which are variants of the
cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced
for BERT. The objective can be defined and configured via the
`[pretraining.objective]` config block.
> ```ini
> ### Characters objective
> [pretraining.objective]
> type = "characters"
> n_characters = 4
> ```
>
> ```ini
> ### Vectors objective
> [pretraining.objective]
> type = "vectors"
> loss = "cosine"
> ```
- **Characters:** The `"characters"` objective asks the model to predict some
  number of leading and trailing UTF-8 bytes for the words. For instance, with
  `n_characters = 2`, the model will try to predict the first two and last two
  characters of the word, as illustrated in the sketch after this list.

- **Vectors:** The `"vectors"` objective asks the model to predict the word's
  vector, from a static embeddings table. This requires a word vectors model to
  be trained and loaded. The vectors objective can optimize either a cosine or
  an L2 loss. We've generally found cosine loss to perform better.
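As a rough illustration of the characters objective's target (a sketch for intuition only, not spaCy code; the real objective is computed inside the pretraining loop and operates on UTF-8 bytes):

```python
def char_targets(word: str, n_characters: int = 2):
    # The model is asked to reproduce the first and last n characters of each word.
    return word[:n_characters], word[-n_characters:]

print(char_targets("cooccurrence"))  # ('co', 'ce')
print(char_targets("statistics"))    # ('st', 'cs')
```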
These pretraining objectives use a trick that we term **language modelling with
approximate outputs (LMAO)**. The motivation for the trick is that predicting an
exact word ID introduces a lot of incidental complexity. You need a large output
layer, and even then, the vocabulary is too large, which motivates tokenization
schemes that do not align to actual word boundaries. At the end of training, the
output layer will be thrown away regardless: we just want a task that forces the
network to model something about word cooccurrence statistics. Predicting
leading and trailing characters does that more than adequately, as the exact
word sequence could be recovered with high accuracy if the initial and trailing
characters are predicted accurately. With the vectors objective, the pretraining
uses the embedding space learned by an algorithm such as
[GloVe](https://nlp.stanford.edu/projects/glove/) or
[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
focus on the contextual modelling we actually care about.