From 0750d59e5a3e4f7e021a588523c1e1d24f4538f7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 13:47:21 +0100 Subject: [PATCH 1/2] Allow setting ner_missing_tag on docs_to_json --- spacy/gold.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 5aecc2584..1a74d2206 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -852,7 +852,7 @@ cdef class GoldParse: self.c.sent_start[i] = 0 -def docs_to_json(docs, id=0): +def docs_to_json(docs, id=0, ner_missing_tag="O"): """Convert a list of Doc objects into the JSON-serializable format used by the spacy train command. @@ -870,7 +870,7 @@ def docs_to_json(docs, id=0): json_cat = {"label": cat, "value": val} json_para["cats"].append(json_cat) ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) + biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag) for j, sent in enumerate(doc.sents): json_sent = {"tokens": [], "brackets": []} for token in sent: From 1b838d1313e2c3bd5dad7f4b86004681546c7769 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 14:10:22 +0100 Subject: [PATCH 2/2] Divide models into core and starters [ci skip] --- website/docs/models/index.md | 44 +++++++++++-------- website/docs/usage/101/_vectors-similarity.md | 4 +- website/docs/usage/spacy-101.md | 4 +- website/docs/usage/vectors-similarity.md | 7 +-- website/gatsby-node.js | 25 +++++++++++ website/meta/languages.json | 9 ++-- website/meta/sidebars.json | 6 ++- website/src/templates/docs.js | 12 +++++ website/src/templates/models.js | 23 ++++++++-- website/src/widgets/landing.js | 5 +++ 10 files changed, 105 insertions(+), 34 deletions(-) diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 1d2bd6d63..31bc3c549 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -1,31 +1,29 @@ --- title: Models -teaser: Downloadable statistical models for spaCy to predict linguistic features +teaser: Downloadable pretrained models for spaCy menu: - ['Quickstart', 'quickstart'] - ['Model Architecture', 'architecture'] - ['Conventions', 'conventions'] --- -spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity -recognition**. The models have been designed and implemented from scratch -specifically for spaCy, to give you an unmatched balance of speed, size and -accuracy. A novel bloom embedding strategy with subword features is used to -support huge vocabularies in tiny tables. Convolutional layers with residual -connections, layer normalization and maxout non-linearity are used, giving much -better efficiency than the standard BiLSTM solution. For more details, see the -notes on the [model architecture](#architecture). +The models directory includes two types of pretrained models: -The parser and NER use an imitation learning objective to deliver **accuracy -in-line with the latest research systems**, even when evaluated from raw text. -With these innovations, spaCy v2.0's models are **10× smaller**, **20% more -accurate**, and **even cheaper to run** than the previous generation. +1. **Core models:** General-purpose pretrained models to predict named entities, + part-of-speech tags and syntactic dependencies. Can be used out-of-the-box + and fine-tuned on more specific data. +2. **Starter models:** Transfer learning starter packs with pretrained weights + you can initialize your models with to achieve better accuracy. They can + include word vectors (which will be used as features during training) or + other pretrained representations like BERT. These models don't include + components for specific tasks like NER or text classification and are + intended to be used as base models when training your own models. ### Quickstart {hidden="true"} import QuickstartModels from 'widgets/quickstart-models.js' - + @@ -36,10 +34,20 @@ For more details on how to use models with spaCy, see the ## Model architecture {#architecture} -spaCy's statistical models have been custom-designed to give a high-performance -mix of speed and accuracy. The current architecture hasn't been published yet, -but in the meantime we prepared a video that explains how the models work, with -particular focus on NER. +spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity +recognition**. The models have been designed and implemented from scratch +specifically for spaCy, to give you an unmatched balance of speed, size and +accuracy. A novel bloom embedding strategy with subword features is used to +support huge vocabularies in tiny tables. Convolutional layers with residual +connections, layer normalization and maxout non-linearity are used, giving much +better efficiency than the standard BiLSTM solution. + +The parser and NER use an imitation learning objective to deliver **accuracy +in-line with the latest research systems**, even when evaluated from raw text. +With these innovations, spaCy v2.0's models are **10× smaller**, **20% more +accurate**, and **even cheaper to run** than the previous generation. The +current architecture hasn't been published yet, but in the meantime we prepared +a video that explains how the models work, with particular focus on NER. diff --git a/website/docs/usage/101/_vectors-similarity.md b/website/docs/usage/101/_vectors-similarity.md index 73c35950f..9ff55f815 100644 --- a/website/docs/usage/101/_vectors-similarity.md +++ b/website/docs/usage/101/_vectors-similarity.md @@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically nonexistent. If your application will benefit from a **large vocabulary** with more vectors, you should consider using one of the larger models or loading in a full vector package, for example, -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), which includes over **1 -million unique vectors**. +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes +over **1 million unique vectors**. spaCy is able to compare two objects, and make a prediction of **how similar they are**. Predicting similarity is useful for building recommendation systems diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index da56f2397..5a3a95a53 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -714,8 +714,8 @@ print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector) ``` For the best results, you should run this example using the -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model (currently not -available in the live demo). +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently +not available in the live demo). diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index aff797b84..0bb79779e 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -95,8 +95,9 @@ pruning the vectors will be taken care of automatically if you set the `--prune-vectors` flag. You can also do it manually in the following steps: 1. Start with a **word vectors model** that covers a huge vocabulary. For - instance, the [`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model - provides 300-dimensional GloVe vectors for over 1 million terms of English. + instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) + model provides 300-dimensional GloVe vectors for over 1 million terms of + English. 2. If your vocabulary has values set for the `Lexeme.prob` attribute, the lexemes will be sorted by descending probability to determine which vectors to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`. @@ -203,7 +204,7 @@ nlp.vocab.vectors.from_glove("/path/to/vectors") If your instance of `Language` already contains vectors, they will be overwritten. To create your own GloVe vectors model package like spaCy's -[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), you can call +[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call [`nlp.to_disk`](/api/language#to_disk), and then package the model using the [`package`](/api/cli#package) command. diff --git a/website/gatsby-node.js b/website/gatsby-node.js index 4aaf5f45e..fe9f22888 100644 --- a/website/gatsby-node.js +++ b/website/gatsby-node.js @@ -33,6 +33,7 @@ exports.createPages = ({ graphql, actions }) => { code name models + starters example has_examples } @@ -210,6 +211,8 @@ exports.createPages = ({ graphql, actions }) => { const langs = result.data.site.siteMetadata.languages const modelLangs = langs.filter(({ models }) => models && models.length) + const starterLangs = langs.filter(({ starters }) => starters && starters.length) + modelLangs.forEach(({ code, name, models, example, has_examples }, i) => { const slug = `/models/${code}` const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null @@ -229,6 +232,28 @@ exports.createPages = ({ graphql, actions }) => { }, }) }) + + starterLangs.forEach(({ code, name, starters }, i) => { + const slug = `/models/${code}-starters` + const next = i < starterLangs.length - 1 ? starterLangs[i + 1] : null + createPage({ + path: slug, + component: DEFAULT_TEMPLATE, + context: { + id: `${code}-starters`, + slug: slug, + isIndex: false, + title: name, + section: 'models', + sectionTitle: sections.models.title, + theme: sections.models.theme, + next: next + ? { title: next.name, slug: `/models/${next.code}-starters` } + : null, + meta: { models: starters, isStarters: true }, + }, + }) + }) }) ) }) diff --git a/website/meta/languages.json b/website/meta/languages.json index 9b8c56bc6..c22ddad69 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -3,10 +3,8 @@ { "code": "en", "name": "English", - "models": [ - "en_core_web_sm", - "en_core_web_md", - "en_core_web_lg", + "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"], + "starters": [ "en_vectors_web_lg", "en_trf_bertbaseuncased_lg", "en_trf_robertabase_lg", @@ -19,7 +17,8 @@ { "code": "de", "name": "German", - "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"], + "models": ["de_core_news_sm", "de_core_news_md"], + "starters": ["de_trf_bertbasecased_lg"], "example": "Dies ist ein Satz.", "has_examples": true }, diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 68d46605f..3fafc52b0 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -41,7 +41,11 @@ "items": [{ "text": "Overview", "url": "/models" }] }, { - "label": "Language Models", + "label": "Core Models", + "items": [] + }, + { + "label": "Starter Models", "items": [] } ] diff --git a/website/src/templates/docs.js b/website/src/templates/docs.js index 130506264..840dcbf1f 100644 --- a/website/src/templates/docs.js +++ b/website/src/templates/docs.js @@ -50,6 +50,17 @@ const Docs = ({ pageContext, children }) => ( id: model, })), })) + sidebar.items[2].items = languages + .filter(({ starters }) => starters && starters.length) + .map(lang => ({ + text: lang.name, + url: `/models/${lang.code}-starters`, + isActive: id === `${lang.code}-starters`, + menu: lang.starters.map(model => ({ + text: model, + id: model, + })), + })) } const sourcePath = source ? github(source) : null const currentSource = getCurrentSource(slug, isIndex) @@ -133,6 +144,7 @@ const query = graphql` code name models + starters } sidebars { section diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 3ac5e6ebf..845fec65d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -331,7 +331,7 @@ const Models = ({ pageContext, repo, children }) => { const [initialized, setInitialized] = useState(false) const [compatibility, setCompatibility] = useState({}) const { id, title, meta } = pageContext - const { models } = meta + const { models, isStarters } = meta const baseUrl = `https://raw.githubusercontent.com/${repo}/master` useEffect(() => { @@ -345,9 +345,27 @@ const Models = ({ pageContext, repo, children }) => { } }, [initialized, baseUrl]) + const modelTitle = title + const modelTeaser = `Available pretrained statistical models for ${title}` + + const starterTitle = `${title} starters` + const starterTeaser = `Available transfer learning starter packs for ${title}` + return ( <> - + <Title + title={isStarters ? starterTitle : modelTitle} + teaser={isStarters ? starterTeaser : modelTeaser} + /> + {isStarters && ( + <Section> + <p> + Starter packs are pretrained weights you can initialize your models with to + achieve better accuracy. They can include word vectors (which will be used + as features during training) or other pretrained representations like BERT. + </p> + </Section> + )} <StaticQuery query={query} render={({ site }) => @@ -360,7 +378,6 @@ const Models = ({ pageContext, repo, children }) => { compatibility={compatibility} baseUrl={baseUrl} repo={repo} - hasExamples={meta.hasExamples} licenses={arrayToObj(site.siteMetadata.licenses, 'id')} /> )) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index 2b7dc10c1..2dc5d40dc 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -56,7 +56,11 @@ function getCounts(langs = []) { return { langs: langs.length, modelLangs: langs.filter(({ models }) => models && !!models.length).length, + starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length, models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0), + starters: langs + .map(({ starters }) => (starters ? starters.length : 0)) + .reduce((a, b) => a + b, 0), } } @@ -270,6 +274,7 @@ const landingQuery = graphql` repo languages { models + starters } logosUsers { id