diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5aecc2584..1a74d2206 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -852,7 +852,7 @@ cdef class GoldParse:
self.c.sent_start[i] = 0
-def docs_to_json(docs, id=0):
+def docs_to_json(docs, id=0, ner_missing_tag="O"):
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
@@ -870,7 +870,7 @@ def docs_to_json(docs, id=0):
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
- biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+ biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 1d2bd6d63..31bc3c549 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -1,31 +1,29 @@
---
title: Models
-teaser: Downloadable statistical models for spaCy to predict linguistic features
+teaser: Downloadable pretrained models for spaCy
menu:
- ['Quickstart', 'quickstart']
- ['Model Architecture', 'architecture']
- ['Conventions', 'conventions']
---
-spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
-recognition**. The models have been designed and implemented from scratch
-specifically for spaCy, to give you an unmatched balance of speed, size and
-accuracy. A novel bloom embedding strategy with subword features is used to
-support huge vocabularies in tiny tables. Convolutional layers with residual
-connections, layer normalization and maxout non-linearity are used, giving much
-better efficiency than the standard BiLSTM solution. For more details, see the
-notes on the [model architecture](#architecture).
+The models directory includes two types of pretrained models:
-The parser and NER use an imitation learning objective to deliver **accuracy
-in-line with the latest research systems**, even when evaluated from raw text.
-With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
-accurate**, and **even cheaper to run** than the previous generation.
+1. **Core models:** General-purpose pretrained models to predict named entities,
+ part-of-speech tags and syntactic dependencies. They can be used out of the
+ box and fine-tuned on more specific data.
+2. **Starter models:** Transfer learning starter packs with pretrained weights
+ you can initialize your models with to achieve better accuracy. They can
+ include word vectors (which will be used as features during training) or
+ other pretrained representations like BERT. These models don't include
+ components for specific tasks like NER or text classification and are
+ intended to be used as base models when training your own models.
### Quickstart {hidden="true"}
import QuickstartModels from 'widgets/quickstart-models.js'
-
+
@@ -36,10 +34,20 @@ For more details on how to use models with spaCy, see the
## Model architecture {#architecture}
-spaCy's statistical models have been custom-designed to give a high-performance
-mix of speed and accuracy. The current architecture hasn't been published yet,
-but in the meantime we prepared a video that explains how the models work, with
-particular focus on NER.
+spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
+recognition**. The models have been designed and implemented from scratch
+specifically for spaCy, to give you an unmatched balance of speed, size and
+accuracy. A novel bloom embedding strategy with subword features is used to
+support huge vocabularies in tiny tables. Convolutional layers with residual
+connections, layer normalization and maxout non-linearity are used, giving much
+better efficiency than the standard BiLSTM solution.
+
+The parser and NER use an imitation learning objective to deliver **accuracy
+in line with the latest research systems**, even when evaluated from raw text.
+With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
+accurate**, and **even cheaper to run** than the previous generation. The
+current architecture hasn't been published yet, but in the meantime we prepared
+a video that explains how the models work, with particular focus on NER.
diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md
index 73c35950f..9ff55f815 100644
--- a/website/docs/usage/101/_vectors-similarity.md
+++ b/website/docs/usage/101/_vectors-similarity.md
@@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically
nonexistent. If your application will benefit from a **large vocabulary** with
more vectors, you should consider using one of the larger models or loading in a
full vector package, for example,
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), which includes over **1
-million unique vectors**.
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes
+over **1 million unique vectors**.
spaCy is able to compare two objects, and make a prediction of **how similar
they are**. Predicting similarity is useful for building recommendation systems
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index da56f2397..5a3a95a53 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -714,8 +714,8 @@ print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
```
For the best results, you should run this example using the
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model (currently not
-available in the live demo).
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently
+not available in the live demo).
diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md
index aff797b84..0bb79779e 100644
--- a/website/docs/usage/vectors-similarity.md
+++ b/website/docs/usage/vectors-similarity.md
@@ -95,8 +95,9 @@ pruning the vectors will be taken care of automatically if you set the
`--prune-vectors` flag. You can also do it manually in the following steps:
1. Start with a **word vectors model** that covers a huge vocabulary. For
- instance, the [`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model
- provides 300-dimensional GloVe vectors for over 1 million terms of English.
+ instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
+ model provides 300-dimensional GloVe vectors for over 1 million terms of
+ English.
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
lexemes will be sorted by descending probability to determine which vectors
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
@@ -203,7 +204,7 @@ nlp.vocab.vectors.from_glove("/path/to/vectors")
If your instance of `Language` already contains vectors, they will be
overwritten. To create your own GloVe vectors model package like spaCy's
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), you can call
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call
[`nlp.to_disk`](/api/language#to_disk), and then package the model using the
[`package`](/api/cli#package) command.
diff --git a/website/gatsby-node.js b/website/gatsby-node.js
index 4aaf5f45e..fe9f22888 100644
--- a/website/gatsby-node.js
+++ b/website/gatsby-node.js
@@ -33,6 +33,7 @@ exports.createPages = ({ graphql, actions }) => {
code
name
models
+ starters
example
has_examples
}
@@ -210,6 +211,8 @@ exports.createPages = ({ graphql, actions }) => {
const langs = result.data.site.siteMetadata.languages
const modelLangs = langs.filter(({ models }) => models && models.length)
+ const starterLangs = langs.filter(({ starters }) => starters && starters.length)
+
modelLangs.forEach(({ code, name, models, example, has_examples }, i) => {
const slug = `/models/${code}`
const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null
@@ -229,6 +232,28 @@ exports.createPages = ({ graphql, actions }) => {
},
})
})
+
+ starterLangs.forEach(({ code, name, starters }, i) => {
+ const slug = `/models/${code}-starters`
+ const next = i < starterLangs.length - 1 ? starterLangs[i + 1] : null
+ createPage({
+ path: slug,
+ component: DEFAULT_TEMPLATE,
+ context: {
+ id: `${code}-starters`,
+ slug: slug,
+ isIndex: false,
+ title: name,
+ section: 'models',
+ sectionTitle: sections.models.title,
+ theme: sections.models.theme,
+ next: next
+ ? { title: next.name, slug: `/models/${next.code}-starters` }
+ : null,
+ meta: { models: starters, isStarters: true },
+ },
+ })
+ })
})
)
})
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 9b8c56bc6..c22ddad69 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -3,10 +3,8 @@
{
"code": "en",
"name": "English",
- "models": [
- "en_core_web_sm",
- "en_core_web_md",
- "en_core_web_lg",
+ "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
+ "starters": [
"en_vectors_web_lg",
"en_trf_bertbaseuncased_lg",
"en_trf_robertabase_lg",
@@ -19,7 +17,8 @@
{
"code": "de",
"name": "German",
- "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
+ "models": ["de_core_news_sm", "de_core_news_md"],
+ "starters": ["de_trf_bertbasecased_lg"],
"example": "Dies ist ein Satz.",
"has_examples": true
},
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 68d46605f..3fafc52b0 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -41,7 +41,11 @@
"items": [{ "text": "Overview", "url": "/models" }]
},
{
- "label": "Language Models",
+ "label": "Core Models",
+ "items": []
+ },
+ {
+ "label": "Starter Models",
"items": []
}
]
diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js
index 130506264..840dcbf1f 100644
--- a/website/src/templates/docs.js
+++ b/website/src/templates/docs.js
@@ -50,6 +50,17 @@ const Docs = ({ pageContext, children }) => (
id: model,
})),
}))
+ sidebar.items[2].items = languages
+ .filter(({ starters }) => starters && starters.length)
+ .map(lang => ({
+ text: lang.name,
+ url: `/models/${lang.code}-starters`,
+ isActive: id === `${lang.code}-starters`,
+ menu: lang.starters.map(model => ({
+ text: model,
+ id: model,
+ })),
+ }))
}
const sourcePath = source ? github(source) : null
const currentSource = getCurrentSource(slug, isIndex)
@@ -133,6 +144,7 @@ const query = graphql`
code
name
models
+ starters
}
sidebars {
section
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 3ac5e6ebf..845fec65d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -331,7 +331,7 @@ const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
const { id, title, meta } = pageContext
- const { models } = meta
+ const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
useEffect(() => {
@@ -345,9 +345,27 @@ const Models = ({ pageContext, repo, children }) => {
}
}, [initialized, baseUrl])
+ const modelTitle = title
+ const modelTeaser = `Available pretrained statistical models for ${title}`
+
+ const starterTitle = `${title} starters`
+ const starterTeaser = `Available transfer learning starter packs for ${title}`
+
return (
<>
-
+
+ {isStarters && (
+
+
+ Starter packs are pretrained weights you can initialize your models with to
+ achieve better accuracy. They can include word vectors (which will be used
+ as features during training) or other pretrained representations like BERT.
+