From ba6cf9821f0ba4174fe91a840688785fbaa5ed98 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:28:28 +0200 Subject: [PATCH 1/8] Replace docs analytics [ci skip] --- website/gatsby-config.js | 9 --------- website/meta/site.json | 1 - website/package.json | 1 - 3 files changed, 11 deletions(-) diff --git a/website/gatsby-config.js b/website/gatsby-config.js index 2a5f957f4..144b8e93e 100644 --- a/website/gatsby-config.js +++ b/website/gatsby-config.js @@ -131,15 +131,6 @@ module.exports = { icon: `src/images/icon.png`, }, }, - { - resolve: `gatsby-plugin-google-analytics`, - options: { - trackingId: site.analytics, - head: false, - anonymize: true, - respectDNT: true, - }, - }, { resolve: `gatsby-plugin-plausible`, options: { diff --git a/website/meta/site.json b/website/meta/site.json index 4d12a4c46..31f2f2f68 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -14,7 +14,6 @@ "github": "explosion" }, "theme": "#09a3d5", - "analytics": "UA-58931649-1", "newsletter": { "user": "spacy.us12", "id": "83b0498b1e7fa3c91ce68c3f1", diff --git a/website/package.json b/website/package.json index a59bc9bdc..8d8ba6408 100644 --- a/website/package.json +++ b/website/package.json @@ -20,7 +20,6 @@ "gatsby-image": "^2.0.29", "gatsby-mdx": "^0.3.6", "gatsby-plugin-catch-links": "^2.0.11", - "gatsby-plugin-google-analytics": "^2.0.14", "gatsby-plugin-manifest": "^2.0.17", "gatsby-plugin-offline": "^2.0.24", "gatsby-plugin-plausible": "0.0.6", From 33d9c649771cf03122ccb9fe7544e8c14ed788fa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Sep 2020 14:44:38 +0200 Subject: [PATCH 2/8] Fix outbound link and update package lock [ci skip] --- website/package-lock.json | 8 -------- website/src/components/link.js | 11 ++--------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/website/package-lock.json b/website/package-lock.json index dded33fb0..63e67ebd2 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -7441,14 +7441,6 @@ "escape-string-regexp": "^1.0.5" } }, - "gatsby-plugin-google-analytics": { - "version": "2.0.14", - "resolved": "https://registry.npmjs.org/gatsby-plugin-google-analytics/-/gatsby-plugin-google-analytics-2.0.14.tgz", - "integrity": "sha512-sFD73d9isJQknnDAAkDidaybHJx6VIaLfy3nO3DwbFaitvZ08RimbynYOkcWAeA0zwwix2RgAvbq/9pAmtTb/A==", - "requires": { - "@babel/runtime": "^7.0.0" - } - }, "gatsby-plugin-manifest": { "version": "2.0.17", "resolved": "https://registry.npmjs.org/gatsby-plugin-manifest/-/gatsby-plugin-manifest-2.0.17.tgz", diff --git a/website/src/components/link.js b/website/src/components/link.js index 4c4aa9492..dc0cfda8e 100644 --- a/website/src/components/link.js +++ b/website/src/components/link.js @@ -1,7 +1,6 @@ import React, { Fragment } from 'react' import PropTypes from 'prop-types' import { Link as GatsbyLink } from 'gatsby' -import { OutboundLink } from 'gatsby-plugin-google-analytics' import classNames from 'classnames' import Icon from './icon' @@ -74,15 +73,9 @@ const Link = ({ const rel = isInternal ? 
null : 'noopener nofollow noreferrer' return ( - + {content} - + ) } From a26f864ed3c227fab1d2a506e27cb4b5b5d831d2 Mon Sep 17 00:00:00 2001 From: Marek Grzenkowicz Date: Tue, 8 Sep 2020 21:13:50 +0200 Subject: [PATCH 3/8] Clarify how to choose pretrained weights files (closes #6027) [ci skip] (#6039) --- website/docs/api/cli.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 779fa7695..b97308aab 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -445,7 +445,8 @@ an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pretrained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pretrained weights files to the -`spacy train` command. +`spacy train` command. You can try to use a few with low `Loss` values reported +in the output. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the From ec751068f328e47ae7fa8ca1745a1dd8ac00529d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 16:42:53 +0200 Subject: [PATCH 4/8] Draft text for static vectors intro --- website/docs/usage/embeddings-transformers.md | 45 +++++++++++++++---- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 8dd104ead..6a239cb1e 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using -The key difference between [word vectors](#word-vectors) and contextual language -models such as [transformers](#transformers) is that word vectors model -**lexical types**, rather than _tokens_. If you have a list of terms with no -context around them, a transformer model like BERT can't really help you. BERT -is designed to understand language **in context**, which isn't what you have. A -word vectors table will be a much better fit for your task. However, if you do -have words in context — whole sentences or paragraphs of running text — word -vectors will only provide a very rough approximation of what the text is about. +[Transformers](#transformers) are large and powerful neural networks that give +you better accuracy, but are harder to deploy in production, as they require a GPU to run +effectively. [Word vectors](#word-vectors) are a slightly older technique that +can give your models a smaller improvement in accuracy, and can also provide +some additional capabilities. + +The key difference between word-vectors and contextual language +models such as transformers is that word vectors model **lexical types**, rather +than _tokens_. If you have a list of terms with no context around them, a transformer +model like BERT can't really help you. BERT is designed to understand language +**in context**, which isn't what you have. A word vectors table will be a much +better fit for your task. However, if you do have words in context — whole sentences +or paragraphs of running text — word vectors will only provide a very rough +approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a vector with a single indexing operation. 
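For example, retrieving a vector and comparing two entries really is just a table lookup. A minimal sketch, assuming a pipeline with static vectors is installed — the `en_core_web_md` package name is only an example, not something this patch prescribes:

```python
import spacy

# Assumes a pipeline with static vectors is installed, e.g.:
# python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

# Retrieving a word vector is a single indexing operation into the table.
vector = nlp.vocab["cheese"].vector
print(vector.shape)  # e.g. (300,)

# Similarity between two lexical types, computed from their vectors.
print(nlp.vocab["cheese"].similarity(nlp.vocab["pizza"]))
```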
Word vectors are therefore useful as a @@ -478,7 +484,28 @@ training. ## Static vectors {#static-vectors} - +If your pipeline includes a word vectors table, you'll be able to use the +`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. +You'll also be able to access the vectors using the `.vector` attribute, or you +can look up one or more vectors directly using the `Vocab` object. Pipelines +with word vectors can also use the vectors as features for the statistical +models, which can improve the accuracy of your components. + +Word vectors in spaCy are "static" in the sense that they are not learned +parameters of the statistical models, and spaCy itself does not feature any +algorithms for learning word vector tables. You can train a word vectors table +using tools such as Gensim, word2vec, FastText or GloVe. There are also many +word vector tables available for download. Once you have a word vectors table +you want to use, you can convert it for use with spaCy using the `spacy init vocab` +command, which will give you a directory you can load or refer to in your training +configs. + +When converting the vectors, there are two ways you can trim them down to make +your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` +option, which will remove entries for rarer words from the table. Alternatively, +you can use the `--prune-vectors` option to remap rarer words to the closest vector +that remains in the table. This allows the vectors table to return meaningful +(albeit imperfect) results for more words than you have rows in the table. ### Using word vectors in your models {#word-vectors-models} From a2c8cda26ffbc6ba0e15b0872b8691ee4f366994 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 17 Sep 2020 17:12:51 +0200 Subject: [PATCH 5/8] Update docs [ci skip] --- website/docs/usage/embeddings-transformers.md | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 6a239cb1e..9f73661c3 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -31,18 +31,18 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using [Transformers](#transformers) are large and powerful neural networks that give -you better accuracy, but are harder to deploy in production, as they require a GPU to run -effectively. [Word vectors](#word-vectors) are a slightly older technique that -can give your models a smaller improvement in accuracy, and can also provide -some additional capabilities. +you better accuracy, but are harder to deploy in production, as they require a +GPU to run effectively. [Word vectors](#word-vectors) are a slightly older +technique that can give your models a smaller improvement in accuracy, and can +also provide some additional capabilities. -The key difference between word-vectors and contextual language -models such as transformers is that word vectors model **lexical types**, rather -than _tokens_. If you have a list of terms with no context around them, a transformer -model like BERT can't really help you. BERT is designed to understand language -**in context**, which isn't what you have. A word vectors table will be a much -better fit for your task. 
However, if you do have words in context — whole sentences -or paragraphs of running text — word vectors will only provide a very rough +The key difference between word-vectors and contextual language models such as +transformers is that word vectors model **lexical types**, rather than _tokens_. +If you have a list of terms with no context around them, a transformer model +like BERT can't really help you. BERT is designed to understand language **in +context**, which isn't what you have. A word vectors table will be a much better +fit for your task. However, if you do have words in context — whole sentences or +paragraphs of running text — word vectors will only provide a very rough approximation of what the text is about. Word vectors are also very computationally efficient, as they map a word to a @@ -484,28 +484,32 @@ training. ## Static vectors {#static-vectors} -If your pipeline includes a word vectors table, you'll be able to use the -`.similarity()` method on the `Doc`, `Span`, `Token` and `Lexeme` objects. -You'll also be able to access the vectors using the `.vector` attribute, or you -can look up one or more vectors directly using the `Vocab` object. Pipelines -with word vectors can also use the vectors as features for the statistical -models, which can improve the accuracy of your components. +If your pipeline includes a **word vectors table**, you'll be able to use the +`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span), +[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able +to access the vectors using the `.vector` attribute, or you can look up one or +more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with +word vectors can also **use the vectors as features** for the statistical +models, which can **improve the accuracy** of your components. Word vectors in spaCy are "static" in the sense that they are not learned parameters of the statistical models, and spaCy itself does not feature any algorithms for learning word vector tables. You can train a word vectors table -using tools such as Gensim, word2vec, FastText or GloVe. There are also many -word vector tables available for download. Once you have a word vectors table -you want to use, you can convert it for use with spaCy using the `spacy init vocab` -command, which will give you a directory you can load or refer to in your training -configs. +using tools such as [Gensim](https://radimrehurek.com/gensim/), +[FastText](https://fasttext.cc/) or +[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing +pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you +convert vectors for use with spaCy and will give you a directory you can load or +refer to in your [training configs](/usage/training#config). -When converting the vectors, there are two ways you can trim them down to make -your package smaller. You can _truncate_ the vectors with the `--truncate-vectors` -option, which will remove entries for rarer words from the table. Alternatively, -you can use the `--prune-vectors` option to remap rarer words to the closest vector -that remains in the table. This allows the vectors table to return meaningful -(albeit imperfect) results for more words than you have rows in the table. 
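To see the effect of truncating or pruning on an installed pipeline, here is a small sketch; the `en_core_web_md` package name is only an example, and any pipeline with a vectors table works:

```python
import spacy

# Any pipeline with a static vectors table; the package name is an example.
nlp = spacy.load("en_core_web_md")
vectors = nlp.vocab.vectors

# Rows stored in the table vs. words mapped onto those rows. After pruning,
# n_keys can be much larger than the number of rows, because rarer words
# share the row of the closest more frequent word.
print(vectors.shape)   # (rows, width)
print(vectors.n_keys)  # number of words with a vector assigned

# Whether an individual word has a row (directly or via pruning) depends on
# the table that shipped with the pipeline:
for word in ("cat", "grizzlier"):
    print(word, nlp.vocab[word].has_vector)
```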
+ + +For more details on loading word vectors into spaCy, using them for similarity +and improving word vector coverage by truncating and pruning the vectors, see +the usage guide on +[word vectors and similarity](/usage/linguistic-features#vectors-similarity). + + ### Using word vectors in your models {#word-vectors-models} From 6efb7688a65faae489de33073c1c40b11ec4f432 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 17 Sep 2020 18:17:03 +0200 Subject: [PATCH 6/8] Draft pretrain usage --- website/docs/usage/embeddings-transformers.md | 86 ++++++++++++++++--- 1 file changed, 76 insertions(+), 10 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 9f73661c3..678237dc2 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -610,17 +610,83 @@ def MyCustomVectors( ## Pretraining {#pretraining} - - +The `spacy pretrain` command lets you initialize your models with information +from raw text. Without pretraining, the models for your components will usually +be initialized randomly. The idea behind pretraining is simple: random probably +isn't optimal, so if we have some text to learn from, we can probably find +a way to get the model off to a better start. The impact of `spacy pretrain` varies, +but it will usually be worth trying if you're not using a transformer model and +you have relatively little training data (for instance, fewer than 5,000 sentence). +A good rule of thumb is that pretraining will generally give you a similar accuracy +improvement to using word vectors in your model. If word vectors have given you +a 10% error reduction, the `spacy pretrain` command might give you another 10%, +for a 20% error reduction in total. - +The `spacy pretrain` command will take a specific subnetwork within one of your +components, and add additional layers to build a network for a temporary task, +that forces the model to learn something about sentence structure and word +cooccurrence statistics. Pretraining produces a binary weights file that can be +loaded back in at the start of training. The weights file specifies an initial +set of weights. Training then proceeds as normal. + +You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork +must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). +The most common workflow is to use the `Tok2Vec` component to create a shared +token-to-vector layer for several components of your pipeline, and apply +pretraining to its whole model. + +The `spacy pretrain` command is configured using the `[pretraining]` section of +your config file. The `pretraining.component` and `pretraining.layer` settings +tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer` +setting should be either the empty string (to use the whole model), or a +[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's +built-in model architectures have a reference named `"tok2vec"` that will refer +to the right layer. + +```ini +# Pretrain nlp.get_pipe("tok2vec").model +[pretraining] +component = "tok2vec" +layer = "" + +[pretraining] +# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") +component = "textcat" +layer = "tok2vec" +``` + +two pretraining objectives are available, both of which are variants of the cloze +task Devlin et al (2018) introduced for BERT. 
* The *characters* objective asks the model to predict some number of leading and
  trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the
  model will try to predict the first two and last two characters of the word.

* The *vectors* objective asks the model to predict the word's vector, from
  a static embeddings table. This requires a word vectors model to be trained
  and loaded. The vectors objective can optimize either a cosine or an L2 loss.
  We've generally found cosine loss to perform better.

These pretraining objectives use a trick that we term _language modelling with
approximate outputs (LMAO)_. The motivation for the trick is that predicting
an exact word ID introduces a lot of incidental complexity. You need a large
output layer, and even then, the vocabulary is too large, which motivates
tokenization schemes that do not align to actual word boundaries. At the end of
training, the output layer will be thrown away regardless: we just want a task
that forces the network to model something about word cooccurrence statistics.
Predicting leading and trailing characters does that more than adequately, as
the exact word sequence could be recovered with high accuracy if the initial
and trailing characters are predicted accurately. With the vectors objective,
the pretraining uses the embedding space learned by an algorithm such as
GloVe or word2vec, allowing the model to focus on the contextual
modelling we actually care about.
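The cosine option simply compares the direction of the predicted vector with the direction of the static target vector. A standalone sketch of that idea — not spaCy's actual pretraining code:

```python
import numpy as np

def cosine_loss(predicted: np.ndarray, target: np.ndarray) -> float:
    """Illustrative only: 1 - cosine similarity, averaged over the batch."""
    pred_norm = predicted / np.linalg.norm(predicted, axis=1, keepdims=True)
    targ_norm = target / np.linalg.norm(target, axis=1, keepdims=True)
    cosine = (pred_norm * targ_norm).sum(axis=1)
    return float((1.0 - cosine).mean())

# Toy batch: 4 "predicted" context vectors vs. 4 static target vectors.
rng = np.random.default_rng(0)
predicted = rng.normal(size=(4, 300)).astype("float32")
target = rng.normal(size=(4, 300)).astype("float32")
print(cosine_loss(predicted, target))
```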
Without pretraining, the models for +your components will usually be initialized randomly. The idea behind +pretraining is simple: random probably isn't optimal, so if we have some text to +learn from, we can probably find a way to get the model off to a better start. -The `spacy pretrain` command will take a specific subnetwork within one of your -components, and add additional layers to build a network for a temporary task, -that forces the model to learn something about sentence structure and word -cooccurrence statistics. Pretraining produces a binary weights file that can be -loaded back in at the start of training. The weights file specifies an initial -set of weights. Training then proceeds as normal. - -You can only pretrain one subnetwork from your pipeline at a time, and the subnetwork -must be typed `Model[List[Doc], List[Floats2d]]` (i.e., it has to be a "tok2vec" layer). -The most common workflow is to use the `Tok2Vec` component to create a shared -token-to-vector layer for several components of your pipeline, and apply -pretraining to its whole model. - -The `spacy pretrain` command is configured using the `[pretraining]` section of -your config file. The `pretraining.component` and `pretraining.layer` settings -tell spaCy how to find the subnetwork to pretrain. The `pretraining.layer` -setting should be either the empty string (to use the whole model), or a -[node reference](https://thinc.ai/docs/usage-models#model-state). Most of spaCy's -built-in model architectures have a reference named `"tok2vec"` that will refer -to the right layer. - -```ini -# Pretrain nlp.get_pipe("tok2vec").model -[pretraining] -component = "tok2vec" -layer = "" - -[pretraining] -# Pretrain nlp.get_pipe("textcat").model.get_ref("tok2vec") -component = "textcat" -layer = "tok2vec" -``` - -two pretraining objectives are available, both of which are variants of the cloze -task Devlin et al (2018) introduced for BERT. - -* The *characters* objective asks the model to predict some number of leading and - trailing UTF-8 bytes for the words. For instance, setting `n_characters=2`, the - model will try to predict the first two and last two characters of the word. - -* The *vectors* objective asks the model to predict the word's vector, from - a static embeddings table. This requires a word vectors model to be trained - and loaded. The vectors objective can optimize either a cosine or an L2 loss. - We've generally found cosine loss to perform better. - -These pretraining objectives use a trick that we term _language modelling with -approximate outputs (LMAO)_. The motivation for the trick is that predicting -an exact word ID introduces a lot of incidental complexity. You need a large -output layer, and even then, the vocabulary is too large, which motivates -tokenization schemes that do not align to actual word boundaries. At the end of -training, the output layer will be thrown away regardless: we just want a task -that forces the network to model something about word cooccurrence statistics. -Predicting leading and trailing characters does that more than adequately, as -the exact word sequence could be recovered with high accuracy if the initial -and trailing characters are predicted accurately. With the vectors objective, -the pretraining is use the embedding space learned by an algorithm such as -GloVe or word2vec, allowing the model to focus on the contextual -modelling we actual care about. 
- -The `[pretraining]` section has several configuration subsections that are -familiar from the training block: the `[pretraining.batcher]`, -[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and +Pretraining uses the same [`config.cfg`](/usage/training#config) file as the +regular training, which helps keep the settings and hyperparameters consistent. +The additional `[pretraining]` section has several configuration subsections +that are familiar from the training block: the `[pretraining.batcher]`, +`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and expect the same types of objects, although for pretraining your corpus does not -need to have any annotations, so you will often use a different reader, such as -`spacy.training.JsonlReader1`. +need to have any annotations, so you will often use a different reader, such as +the [`JsonlReader`](/api/toplevel#jsonlreader). > #### Raw text format > -> The raw text can be provided as JSONL (newline-delimited JSON) with a key -> `"text"` per entry. This allows the data to be read in line by line, while -> also allowing you to include newlines in the texts. +> The raw text can be provided in spaCy's +> [binary `.spacy` format](/api/data-formats#training) consisting of serialized +> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per +> entry. This allows the data to be read in line by line, while also allowing +> you to include newlines in the texts. > > ```json > {"text": "Can I ask where you work now and what you do, and if you enjoy it?"} > {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."} > ``` +> +> You can also use your own custom corpus loader instead. + +You can add a `[pretraining]` block to your config by setting the +`--pretraining` flag on [`init config`](/api/cli#init-config) or +[`init fill-config`](/api/cli#init-fill-config): ```cli $ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining ``` +You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config +and pass in optional config overrides, like the path to the raw text file: + ```cli -$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg +$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl ``` + +### How pretraining works {#pretraining-details} + +The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually +be worth trying if you're **not using a transformer** model and you have +**relatively little training data** (for instance, fewer than 5,000 sentences). +A good rule of thumb is that pretraining will generally give you a similar +accuracy improvement to using word vectors in your model. If word vectors have +given you a 10% error reduction, pretraining with spaCy might give you another +10%, for a 20% error reduction in total. + +The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific +subnetwork** within one of your components, and add additional layers to build a +network for a temporary task, that forces the model to learn something about +sentence structure and word cooccurrence statistics. Pretraining produces a +**binary weights file** that can be loaded back in at the start of training. The +weights file specifies an initial set of weights. Training then proceeds as +normal. 
You can only pretrain one subnetwork from your pipeline at a time, and the
subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be
a "tok2vec" layer). The most common workflow is to use the
[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for
several components of your pipeline, and apply pretraining to its whole model.

#### Configuring the pretraining {#pretraining-configure}

The [`spacy pretrain`](/api/cli#pretrain) command is configured using the
`[pretraining]` section of your [config file](/usage/training#config). The
`component` and `layer` settings tell spaCy how to **find the subnetwork** to
pretrain. The `layer` setting should be either the empty string (to use the
whole model), or a
[node reference](https://thinc.ai/docs/usage-models#model-state). Most of
spaCy's built-in model architectures have a reference named `"tok2vec"` that
will refer to the right layer.

```ini
### config.cfg
# 1. Use the whole model of the "tok2vec" component
[pretraining]
component = "tok2vec"
layer = ""

# 2. Pretrain the "tok2vec" node of the "textcat" component
[pretraining]
component = "textcat"
layer = "tok2vec"
```

#### Pretraining objectives {#pretraining-objectives}

Two pretraining objectives are available, both of which are variants of the
cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced
for BERT. The objective can be defined and configured via the
`[pretraining.objective]` config block.

> ```ini
> ### Characters objective
> [pretraining.objective]
> type = "characters"
> n_characters = 4
> ```
>
> ```ini
> ### Vectors objective
> [pretraining.objective]
> type = "vectors"
> loss = "cosine"
> ```

- **Characters:** The `"characters"` objective asks the model to predict some
  number of leading and trailing UTF-8 bytes for the words. For instance,
  setting `n_characters = 2`, the model will try to predict the first two and
  last two characters of the word.

- **Vectors:** The `"vectors"` objective asks the model to predict the word's
  vector, from a static embeddings table. This requires a word vectors model to
  be trained and loaded. The vectors objective can optimize either a cosine or
  an L2 loss. We've generally found cosine loss to perform better.

These pretraining objectives use a trick that we term **language modelling with
approximate outputs (LMAO)**. The motivation for the trick is that predicting an
exact word ID introduces a lot of incidental complexity. You need a large output
layer, and even then, the vocabulary is too large, which motivates tokenization
schemes that do not align to actual word boundaries. At the end of training, the
output layer will be thrown away regardless: we just want a task that forces the
network to model something about word cooccurrence statistics. Predicting
leading and trailing characters does that more than adequately, as the exact
word sequence could be recovered with high accuracy if the initial and trailing
characters are predicted accurately. With the vectors objective, the pretraining
uses the embedding space learned by an algorithm such as
[GloVe](https://nlp.stanford.edu/projects/glove/) or
[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
focus on the contextual modelling we actually care about.
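As a concrete illustration of the characters objective, this is roughly what the target for a single word looks like — a standalone sketch using characters rather than raw UTF-8 bytes, not spaCy's internal implementation:

```python
def character_target(word: str, n_characters: int = 4) -> str:
    """Illustrative only: the leading and trailing characters the model is
    asked to predict for one word. For words shorter than n_characters, the
    slices simply overlap."""
    return word[:n_characters] + word[-n_characters:]

for word in ["photosynthesis", "cat"]:
    print(word, "->", character_target(word))
# photosynthesis -> photesis
# cat -> catcat
```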
From a88106e852b08bcbbe607d5bb83929e5a13120f4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 18 Sep 2020 03:01:29 +0200 Subject: [PATCH 8/8] Remove W106: HEAD and SENT_START in doc.from_array (#6086) * Remove W106: HEAD and SENT_START in doc.from_array This warning was hacky and being triggered too often. * Fix test --- spacy/errors.py | 3 --- spacy/tests/doc/test_doc_api.py | 5 ++--- spacy/tokens/doc.pyx | 2 -- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 173aedab9..81e3616be 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -119,9 +119,6 @@ class Warnings: W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you " "need to match on a stream of documents, you can use nlp.pipe and " "call the {matcher} on each Doc object.") - W106 = ("Both HEAD and SENT_START are included as attributes in " - "doc.from_array(). The parse trees based on the HEAD attribute " - "will override the values in SENT_START.") W107 = ("The property Doc.{prop} is deprecated. Use " "Doc.has_annotation(\"{attr}\") instead.") diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..c979931b1 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab): # fmt: on doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - # HEAD overrides SENT_START with warning + # HEAD overrides SENT_START without warning attrs = [SENT_START, HEAD] arr = doc.to_array(attrs) new_doc = Doc(en_vocab, words=words) - with pytest.warns(UserWarning): - new_doc.from_array(attrs, arr) + new_doc.from_array(attrs, arr) # no warning using default attrs attrs = doc._get_array_attrs() diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..2d9de278b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -817,8 +817,6 @@ cdef class Doc: if array.dtype != numpy.uint64: warnings.warn(Warnings.W028.format(type=array.dtype)) - if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs: - warnings.warn(Warnings.W106) cdef int i, col cdef int32_t abs_head_index cdef attr_id_t attr_id
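For reference, a minimal sketch of the behavior the updated test exercises — the words, heads and dependency labels below are made up, and it assumes a spaCy v3 install:

```python
import spacy
from spacy.attrs import HEAD, SENT_START
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["I", "like", "New", "York", "in", "Autumn", "."]
heads = [1, 1, 3, 1, 1, 4, 1]   # absolute head indices (made up)
deps = ["dep"] * len(words)     # placeholder labels

doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps)

# Round-trip HEAD and SENT_START through an array. With this change, no
# W106 warning is raised; the parse derived from HEAD determines the
# sentence boundaries.
attrs = [SENT_START, HEAD]
arr = doc.to_array(attrs)
new_doc = Doc(nlp.vocab, words=words)
new_doc.from_array(attrs, arr)
print([t.is_sent_start for t in new_doc])
```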