Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-03 22:06:37 +03:00

Commit 554fbb04b0: Merge branch 'master' into spacy.io
@@ -852,7 +852,7 @@ cdef class GoldParse:
                 self.c.sent_start[i] = 0
 
 
-def docs_to_json(docs, id=0):
+def docs_to_json(docs, id=0, ner_missing_tag="O"):
     """Convert a list of Doc objects into the JSON-serializable format used by
     the spacy train command.
 
@@ -870,7 +870,7 @@ def docs_to_json(docs, id=0):
             json_cat = {"label": cat, "value": val}
             json_para["cats"].append(json_cat)
         ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
+        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
         for j, sent in enumerate(doc.sents):
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
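This hunk appears to be from `spacy/gold.pyx`: `docs_to_json` gains a `ner_missing_tag` argument that is passed through to `biluo_tags_from_offsets`. A minimal sketch of the new keyword, assuming a spaCy v2.x install (where both helpers live in `spacy.gold`) and the `en_core_web_sm` package:

```python
import spacy
from spacy.gold import docs_to_json, biluo_tags_from_offsets

nlp = spacy.load("en_core_web_sm")
doc = nlp("Facebook released React in 2014.")

# Tokens outside any entity get the tag passed as ner_missing_tag:
# "O" marks them as explicitly not an entity, "-" as unknown/missing.
json_data = docs_to_json([doc], id=0, ner_missing_tag="O")

offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
print(biluo_tags_from_offsets(doc, offsets, missing="-"))
```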
@@ -1,31 +1,29 @@
 ---
 title: Models
-teaser: Downloadable statistical models for spaCy to predict linguistic features
+teaser: Downloadable pretrained models for spaCy
 menu:
   - ['Quickstart', 'quickstart']
   - ['Model Architecture', 'architecture']
   - ['Conventions', 'conventions']
 ---
 
-spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
-recognition**. The models have been designed and implemented from scratch
-specifically for spaCy, to give you an unmatched balance of speed, size and
-accuracy. A novel bloom embedding strategy with subword features is used to
-support huge vocabularies in tiny tables. Convolutional layers with residual
-connections, layer normalization and maxout non-linearity are used, giving much
-better efficiency than the standard BiLSTM solution. For more details, see the
-notes on the [model architecture](#architecture).
+The models directory includes two types of pretrained models:
 
-The parser and NER use an imitation learning objective to deliver **accuracy
-in-line with the latest research systems**, even when evaluated from raw text.
-With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
-accurate**, and **even cheaper to run** than the previous generation.
+1. **Core models:** General-purpose pretrained models to predict named entities,
+   part-of-speech tags and syntactic dependencies. Can be used out-of-the-box
+   and fine-tuned on more specific data.
+2. **Starter models:** Transfer learning starter packs with pretrained weights
+   you can initialize your models with to achieve better accuracy. They can
+   include word vectors (which will be used as features during training) or
+   other pretrained representations like BERT. These models don't include
+   components for specific tasks like NER or text classification and are
+   intended to be used as base models when training your own models.
 
 ### Quickstart {hidden="true"}
 
 import QuickstartModels from 'widgets/quickstart-models.js'
 
-<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below." />
+<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and test it." />
 
 <Infobox title="📖 Installation and usage">
 
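As a hedged illustration of the two model types this hunk introduces (assuming the `en_core_web_sm` and `en_vectors_web_lg` packages are installed):

```python
import spacy

# Core model: a full pretrained pipeline (tagger, parser, NER), usable as-is
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print([(ent.text, ent.label_) for ent in doc.ents])

# Starter model: no task components, just pretrained representations
# (here: word vectors) to initialize your own training runs with
nlp_vectors = spacy.load("en_vectors_web_lg")
print(nlp_vectors.vocab.vectors.shape)  # over 1 million 300-d vectors
```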
@@ -36,10 +34,20 @@ For more details on how to use models with spaCy, see the
 
 ## Model architecture {#architecture}
 
-spaCy's statistical models have been custom-designed to give a high-performance
-mix of speed and accuracy. The current architecture hasn't been published yet,
-but in the meantime we prepared a video that explains how the models work, with
-particular focus on NER.
+spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
+recognition**. The models have been designed and implemented from scratch
+specifically for spaCy, to give you an unmatched balance of speed, size and
+accuracy. A novel bloom embedding strategy with subword features is used to
+support huge vocabularies in tiny tables. Convolutional layers with residual
+connections, layer normalization and maxout non-linearity are used, giving much
+better efficiency than the standard BiLSTM solution.
+
+The parser and NER use an imitation learning objective to deliver **accuracy
+in-line with the latest research systems**, even when evaluated from raw text.
+With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
+accurate**, and **even cheaper to run** than the previous generation. The
+current architecture hasn't been published yet, but in the meantime we prepared
+a video that explains how the models work, with particular focus on NER.
 
 <YouTube id="sqDHBH9IjRU" />
 
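The "bloom embedding" paragraph in the new text is compact; here is a toy numpy sketch of the idea (an illustration only, not spaCy's actual implementation): each word is hashed into a few rows of a small table and its embedding is the sum of those rows, so distinct words rarely collide on every hash at once even with a tiny table.

```python
import numpy as np

rng = np.random.default_rng(0)
table = rng.normal(size=(1000, 64))  # 1,000 rows serve an unbounded vocabulary

def embed(word, seeds=(0, 1, 2, 3)):
    # hash the word several times; sum the rows it lands on
    rows = [hash((seed, word)) % table.shape[0] for seed in seeds]
    return table[rows].sum(axis=0)

print(embed("aardvark").shape)  # (64,)
```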
@@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically
 nonexistent. If your application will benefit from a **large vocabulary** with
 more vectors, you should consider using one of the larger models or loading in a
 full vector package, for example,
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), which includes over **1
-million unique vectors**.
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes
+over **1 million unique vectors**.
 
 spaCy is able to compare two objects, and make a prediction of **how similar
 they are**. Predicting similarity is useful for building recommendation systems
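A short sketch of the similarity prediction the context lines mention, assuming the `en_vectors_web_lg` starter is installed:

```python
import spacy

nlp = spacy.load("en_vectors_web_lg")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
print(doc1.similarity(doc2))  # cosine similarity of the averaged word vectors
```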
@@ -714,8 +714,8 @@ print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
 ```
 
 For the best results, you should run this example using the
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model (currently not
-available in the live demo).
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently
+not available in the live demo).
 
 <Infobox>
 
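For context, the example this hunk refers to checks per-token vectors along these lines (a sketch; the doc's full code block isn't shown in the diff):

```python
import spacy

nlp = spacy.load("en_vectors_web_lg")
tokens = nlp("apple banana pasta hippopotamus")
for token in tokens:
    # has_vector: a real vector exists; is_oov: token is out of vocabulary
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
```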
@@ -95,8 +95,9 @@ pruning the vectors will be taken care of automatically if you set the
 `--prune-vectors` flag. You can also do it manually in the following steps:
 
 1. Start with a **word vectors model** that covers a huge vocabulary. For
-   instance, the [`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model
-   provides 300-dimensional GloVe vectors for over 1 million terms of English.
+   instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
+   model provides 300-dimensional GloVe vectors for over 1 million terms of
+   English.
 2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
    lexemes will be sorted by descending probability to determine which vectors
    to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
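The manual steps above map onto `Vocab.prune_vectors`; a minimal sketch, again assuming `en_vectors_web_lg` is installed:

```python
import spacy

nlp = spacy.load("en_vectors_web_lg")
n_vectors = 20000  # number of rows to keep in the vector table
removed_words = nlp.vocab.prune_vectors(n_vectors)
# each pruned lexeme is remapped to its nearest remaining vector
print(len(removed_words), "words remapped")
```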
@@ -203,7 +204,7 @@ nlp.vocab.vectors.from_glove("/path/to/vectors")
 
 If your instance of `Language` already contains vectors, they will be
 overwritten. To create your own GloVe vectors model package like spaCy's
-[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), you can call
+[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call
 [`nlp.to_disk`](/api/language#to_disk), and then package the model using the
 [`package`](/api/cli#package) command.
 
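Putting the surrounding prose together (paths here are placeholders; see the `from_glove` docs for the expected file layout):

```python
import spacy

nlp = spacy.blank("en")
# load converted GloVe vectors from a directory of binary vector files
nlp.vocab.vectors.from_glove("/path/to/vectors")
nlp.to_disk("/path/to/en_my_vectors")
# then build an installable package on the command line:
#   python -m spacy package /path/to/en_my_vectors /path/to/output
```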
@@ -33,6 +33,7 @@ exports.createPages = ({ graphql, actions }) => {
                         code
                         name
                         models
+                        starters
                         example
                         has_examples
                     }
@@ -210,6 +211,8 @@ exports.createPages = ({ graphql, actions }) => {
 
             const langs = result.data.site.siteMetadata.languages
             const modelLangs = langs.filter(({ models }) => models && models.length)
+            const starterLangs = langs.filter(({ starters }) => starters && starters.length)
+
             modelLangs.forEach(({ code, name, models, example, has_examples }, i) => {
                 const slug = `/models/${code}`
                 const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null
@@ -229,6 +232,28 @@ exports.createPages = ({ graphql, actions }) => {
                     },
                 })
             })
+
+            starterLangs.forEach(({ code, name, starters }, i) => {
+                const slug = `/models/${code}-starters`
+                const next = i < starterLangs.length - 1 ? starterLangs[i + 1] : null
+                createPage({
+                    path: slug,
+                    component: DEFAULT_TEMPLATE,
+                    context: {
+                        id: `${code}-starters`,
+                        slug: slug,
+                        isIndex: false,
+                        title: name,
+                        section: 'models',
+                        sectionTitle: sections.models.title,
+                        theme: sections.models.theme,
+                        next: next
+                            ? { title: next.name, slug: `/models/${next.code}-starters` }
+                            : null,
+                        meta: { models: starters, isStarters: true },
+                    },
+                })
+            })
         })
     )
 })
@@ -3,10 +3,8 @@
     {
         "code": "en",
         "name": "English",
-        "models": [
-            "en_core_web_sm",
-            "en_core_web_md",
-            "en_core_web_lg",
+        "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
+        "starters": [
             "en_vectors_web_lg",
             "en_trf_bertbaseuncased_lg",
             "en_trf_robertabase_lg",
@@ -19,7 +17,8 @@
     {
         "code": "de",
         "name": "German",
-        "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
+        "models": ["de_core_news_sm", "de_core_news_md"],
+        "starters": ["de_trf_bertbasecased_lg"],
         "example": "Dies ist ein Satz.",
         "has_examples": true
     },
@@ -41,7 +41,11 @@
             "items": [{ "text": "Overview", "url": "/models" }]
         },
         {
-            "label": "Language Models",
+            "label": "Core Models",
+            "items": []
+        },
+        {
+            "label": "Starter Models",
             "items": []
         }
     ]
@@ -50,6 +50,17 @@ const Docs = ({ pageContext, children }) => (
                 id: model,
             })),
         }))
+        sidebar.items[2].items = languages
+            .filter(({ starters }) => starters && starters.length)
+            .map(lang => ({
+                text: lang.name,
+                url: `/models/${lang.code}-starters`,
+                isActive: id === `${lang.code}-starters`,
+                menu: lang.starters.map(model => ({
+                    text: model,
+                    id: model,
+                })),
+            }))
     }
     const sourcePath = source ? github(source) : null
     const currentSource = getCurrentSource(slug, isIndex)
@@ -133,6 +144,7 @@ const query = graphql`
                     code
                     name
                     models
+                    starters
                 }
                 sidebars {
                     section
@@ -331,7 +331,7 @@ const Models = ({ pageContext, repo, children }) => {
     const [initialized, setInitialized] = useState(false)
     const [compatibility, setCompatibility] = useState({})
     const { id, title, meta } = pageContext
-    const { models } = meta
+    const { models, isStarters } = meta
     const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
 
     useEffect(() => {
@@ -345,9 +345,27 @@ const Models = ({ pageContext, repo, children }) => {
         }
     }, [initialized, baseUrl])
 
+    const modelTitle = title
+    const modelTeaser = `Available pretrained statistical models for ${title}`
+
+    const starterTitle = `${title} starters`
+    const starterTeaser = `Available transfer learning starter packs for ${title}`
+
     return (
         <>
-            <Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
+            <Title
+                title={isStarters ? starterTitle : modelTitle}
+                teaser={isStarters ? starterTeaser : modelTeaser}
+            />
+            {isStarters && (
+                <Section>
+                    <p>
+                        Starter packs are pretrained weights you can initialize your models with to
+                        achieve better accuracy. They can include word vectors (which will be used
+                        as features during training) or other pretrained representations like BERT.
+                    </p>
+                </Section>
+            )}
             <StaticQuery
                 query={query}
                 render={({ site }) =>
@@ -360,7 +378,6 @@ const Models = ({ pageContext, repo, children }) => {
                             compatibility={compatibility}
                             baseUrl={baseUrl}
                             repo={repo}
-                            hasExamples={meta.hasExamples}
                             licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
                         />
                     ))
@@ -56,7 +56,11 @@ function getCounts(langs = []) {
     return {
         langs: langs.length,
         modelLangs: langs.filter(({ models }) => models && !!models.length).length,
+        starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
         models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
+        starters: langs
+            .map(({ starters }) => (starters ? starters.length : 0))
+            .reduce((a, b) => a + b, 0),
     }
 }
 
@@ -270,6 +274,7 @@ const landingQuery = graphql`
             repo
             languages {
                 models
+                starters
             }
             logosUsers {
                 id