diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..81e3616be 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,9 +119,6 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") - W106 = ("Both HEAD and SENT_START are included as attributes in " - "doc.from_array(). The parse trees based on the HEAD attribute " - "will override the values in SENT_START.") W107 = ("The property Doc.{prop} is deprecated. Use " "Doc.has_annotation(\"{attr}\") instead.") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..c979931b1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - # HEAD overrides SENT_START with warning + # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.warns(UserWarning): - new_doc.from_array(attrs, arr) + new_doc.from_array(attrs, arr) # no warning using default attrs attrs = doc._get_array_attrs() diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..2d9de278b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -817,8 +817,6 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: - warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 4a3541d63..b485ff18e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -161,14 +161,14 @@ validation error with more details. 
$ python -m spacy init fill-config [base_path] [output_file] [--diff] ``` -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | -| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | -| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | -| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | Complete and auto-filled config file for training. | +| Name | Description | +| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ | +| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ | +| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ | +| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | Complete and auto-filled config file for training. 
| ### init vocab {#init-vocab new="3" tag="command"} diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 8dd104ead..4adcd927c 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using -The key difference between [word vectors](#word-vectors) and contextual language -models such as [transformers](#transformers) is that word vectors model -**lexical types**, rather than _tokens_. If you have a list of terms with no -context around them, a transformer model like BERT can't really help you. BERT -is designed to understand language **in context**, which isn't what you have. A -word vectors table will be a much better fit for your task. However, if you do -have words in context — whole sentences or paragraphs of running text — word -vectors will only provide a very rough approximation of what the text is about. +[Transformers](#transformers) are large and powerful neural networks that give +you better accuracy, but are harder to deploy in production, as they require a +GPU to run effectively. [Word vectors](#word-vectors) are a slightly older +technique that can give your models a smaller improvement in accuracy, and can +also provide some additional capabilities. + +The key difference between word vectors and contextual language models such as +transformers is that word vectors model **lexical types**, rather than _tokens_. +If you have a list of terms with no context around them, a transformer model +like BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough +approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a vector with a single indexing operation. Word vectors are therefore useful as a @@ -478,7 +484,32 @@ training. ## Static vectors {#static-vectors} - +If your pipeline includes a **word vectors table**, you'll be able to use the +`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span), +[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able +to access the vectors using the `.vector` attribute, or you can look up one or +more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with +word vectors can also **use the vectors as features** for the statistical +models, which can **improve the accuracy** of your components. + +Word vectors in spaCy are "static" in the sense that they are not learned +parameters of the statistical models, and spaCy itself does not feature any +algorithms for learning word vector tables. You can train a word vectors table +using tools such as [Gensim](https://radimrehurek.com/gensim/), +[FastText](https://fasttext.cc/) or +[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing +pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you +convert vectors for use with spaCy and will give you a directory you can load or +refer to in your [training configs](/usage/training#config). + + + +For more details on loading word vectors into spaCy, using them for similarity +and improving word vector coverage by truncating and pruning the vectors, see +the usage guide on +[word vectors and similarity](/usage/linguistic-features#vectors-similarity). + + ### Using word vectors in your models {#word-vectors-models} @@ -579,33 +610,141 @@ def MyCustomVectors( ## Pretraining {#pretraining} - - +The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your +models with **information from raw text**. 
Without pretraining, the models for +your components will usually be initialized randomly. The idea behind +pretraining is simple: random probably isn't optimal, so if we have some text to +learn from, we can probably find a way to get the model off to a better start. - +Pretraining uses the same [`config.cfg`](/usage/training#config) file as the +regular training, which helps keep the settings and hyperparameters consistent. +The additional `[pretraining]` section has several configuration subsections +that are familiar from the training block: the `[pretraining.batcher]`, +`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +expect the same types of objects, although for pretraining your corpus does not +need to have any annotations, so you will often use a different reader, such as +the [`JsonlReader`](/api/toplevel#jsonlreader). > #### Raw text format > -> The raw text can be provided as JSONL (newline-delimited JSON) with a key -> `"text"` per entry. This allows the data to be read in line by line, while -> also allowing you to include newlines in the texts. +> The raw text can be provided in spaCy's +> [binary `.spacy` format](/api/data-formats#training) consisting of serialized +> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per +> entry. This allows the data to be read in line by line, while also allowing +> you to include newlines in the texts. > > ```json > {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} > {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} > ``` +> +> You can also use your own custom corpus loader instead. 
+ +You can add a `[pretraining]` block to your config by setting the +`--pretraining` flag on [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config): ```cli $ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining ``` +You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config +and pass in optional config overrides, like the path to the raw text file: + ```cli -$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` + +### How pretraining works {#pretraining-details} + +The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually +be worth trying if you're **not using a transformer** model and you have +**relatively little training data** (for instance, fewer than 5,000 sentences). +A good rule of thumb is that pretraining will generally give you a similar +accuracy improvement to using word vectors in your model. If word vectors have +given you a 10% error reduction, pretraining with spaCy might give you another +10%, for a 20% error reduction in total. + +The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific +subnetwork** within one of your components, and add additional layers to build a +network for a temporary task, that forces the model to learn something about +sentence structure and word cooccurrence statistics. Pretraining produces a +**binary weights file** that can be loaded back in at the start of training. The +weights file specifies an initial set of weights. Training then proceeds as +normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the +subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be +a "tok2vec" layer). 
The most common workflow is to use the +[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for +several components of your pipeline, and apply pretraining to its whole model. + +#### Configuring the pretraining {#pretraining-configure} + +The [`spacy pretrain`](/api/cli#pretrain) command is configured using the +`[pretraining]` section of your [config file](/usage/training#config). The +`component` and `layer` settings tell spaCy how to **find the subnetwork** to +pretrain. The `layer` setting should be either the empty string (to use the +whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of +spaCy's built-in model architectures have a reference named `"tok2vec"` that +will refer to the right layer. + +```ini +### config.cfg +# 1. Use the whole model of the "tok2vec" component +[pretraining] +component = "tok2vec" +layer = "" + +# 2. Pretrain the "tok2vec" node of the "textcat" component +[pretraining] +component = "textcat" +layer = "tok2vec" +``` + +#### Pretraining objectives {#pretraining-objectives} + +Two pretraining objectives are available, both of which are variants of the +cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced +for BERT. The objective can be defined and configured via the +`[pretraining.objective]` config block. + +> ```ini +> ### Characters objective +> [pretraining.objective] +> type = "characters" +> n_characters = 4 +> ``` +> +> ```ini +> ### Vectors objective +> [pretraining.objective] +> type = "vectors" +> loss = "cosine" +> ``` + +- **Characters:** The `"characters"` objective asks the model to predict some + number of leading and trailing UTF-8 bytes for the words. For instance, + setting `n_characters = 2`, the model will try to predict the first two and + last two characters of the word. + +- **Vectors:** The `"vectors"` objective asks the model to predict the word's + vector, from a static embeddings table.
This requires a word vectors model to + be trained and loaded. The vectors objective can optimize either a cosine or + an L2 loss. We've generally found cosine loss to perform better. + +These pretraining objectives use a trick that we term **language modelling with +approximate outputs (LMAO)**. The motivation for the trick is that predicting an +exact word ID introduces a lot of incidental complexity. You need a large output +layer, and even then, the vocabulary is too large, which motivates tokenization +schemes that do not align to actual word boundaries. At the end of training, the +output layer will be thrown away regardless: we just want a task that forces the +network to model something about word cooccurrence statistics. Predicting +leading and trailing characters does that more than adequately, as the exact +word sequence could be recovered with high accuracy if the initial and trailing +characters are predicted accurately. With the vectors objective, the pretraining +uses the embedding space learned by an algorithm such as +[GloVe](https://nlp.stanford.edu/projects/glove/) or +[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to +focus on the contextual modelling we actually care about.