mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-03 22:06:37 +03:00
Merge branch 'master' into spacy.io
This commit is contained in:
commit
554fbb04b0
|
@ -852,7 +852,7 @@ cdef class GoldParse:
|
|||
self.c.sent_start[i] = 0
|
||||
|
||||
|
||||
def docs_to_json(docs, id=0):
|
||||
def docs_to_json(docs, id=0, ner_missing_tag="O"):
|
||||
"""Convert a list of Doc objects into the JSON-serializable format used by
|
||||
the spacy train command.
|
||||
|
||||
|
@ -870,7 +870,7 @@ def docs_to_json(docs, id=0):
|
|||
json_cat = {"label": cat, "value": val}
|
||||
json_para["cats"].append(json_cat)
|
||||
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
|
||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
|
|
|
@ -1,31 +1,29 @@
|
|||
---
|
||||
title: Models
|
||||
teaser: Downloadable statistical models for spaCy to predict linguistic features
|
||||
teaser: Downloadable pretrained models for spaCy
|
||||
menu:
|
||||
- ['Quickstart', 'quickstart']
|
||||
- ['Model Architecture', 'architecture']
|
||||
- ['Conventions', 'conventions']
|
||||
---
|
||||
|
||||
spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
|
||||
recognition**. The models have been designed and implemented from scratch
|
||||
specifically for spaCy, to give you an unmatched balance of speed, size and
|
||||
accuracy. A novel bloom embedding strategy with subword features is used to
|
||||
support huge vocabularies in tiny tables. Convolutional layers with residual
|
||||
connections, layer normalization and maxout non-linearity are used, giving much
|
||||
better efficiency than the standard BiLSTM solution. For more details, see the
|
||||
notes on the [model architecture](#architecture).
|
||||
The models directory includes two types of pretrained models:
|
||||
|
||||
The parser and NER use an imitation learning objective to deliver **accuracy
|
||||
in line with the latest research systems**, even when evaluated from raw text.
|
||||
With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
|
||||
accurate**, and **even cheaper to run** than the previous generation.
|
||||
1. **Core models:** General-purpose pretrained models to predict named entities,
|
||||
part-of-speech tags and syntactic dependencies. Can be used out-of-the-box
|
||||
and fine-tuned on more specific data.
|
||||
2. **Starter models:** Transfer learning starter packs with pretrained weights
|
||||
you can initialize your models with to achieve better accuracy. They can
|
||||
include word vectors (which will be used as features during training) or
|
||||
other pretrained representations like BERT. These models don't include
|
||||
components for specific tasks like NER or text classification and are
|
||||
intended to be used as base models when training your own models.
|
||||
|
||||
### Quickstart {hidden="true"}
|
||||
|
||||
import QuickstartModels from 'widgets/quickstart-models.js'
|
||||
|
||||
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below." />
|
||||
<QuickstartModels title="Quickstart" id="quickstart" description="Install a default model, get the code to load it from within spaCy and test it." />
|
||||
|
||||
<Infobox title="📖 Installation and usage">
|
||||
|
||||
|
@ -36,10 +34,20 @@ For more details on how to use models with spaCy, see the
|
|||
|
||||
## Model architecture {#architecture}
|
||||
|
||||
spaCy's statistical models have been custom-designed to give a high-performance
|
||||
mix of speed and accuracy. The current architecture hasn't been published yet,
|
||||
but in the meantime we prepared a video that explains how the models work, with
|
||||
particular focus on NER.
|
||||
spaCy v2.0 features new neural models for **tagging**, **parsing** and **entity
|
||||
recognition**. The models have been designed and implemented from scratch
|
||||
specifically for spaCy, to give you an unmatched balance of speed, size and
|
||||
accuracy. A novel bloom embedding strategy with subword features is used to
|
||||
support huge vocabularies in tiny tables. Convolutional layers with residual
|
||||
connections, layer normalization and maxout non-linearity are used, giving much
|
||||
better efficiency than the standard BiLSTM solution.
|
||||
|
||||
The parser and NER use an imitation learning objective to deliver **accuracy
|
||||
in line with the latest research systems**, even when evaluated from raw text.
|
||||
With these innovations, spaCy v2.0's models are **10× smaller**, **20% more
|
||||
accurate**, and **even cheaper to run** than the previous generation. The
|
||||
current architecture hasn't been published yet, but in the meantime we prepared
|
||||
a video that explains how the models work, with particular focus on NER.
|
||||
|
||||
<YouTube id="sqDHBH9IjRU" />
|
||||
|
||||
|
|
|
@ -68,8 +68,8 @@ representation consists of 300 dimensions of `0`, which means it's practically
|
|||
nonexistent. If your application will benefit from a **large vocabulary** with
|
||||
more vectors, you should consider using one of the larger models or loading in a
|
||||
full vector package, for example,
|
||||
[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), which includes over **1
|
||||
million unique vectors**.
|
||||
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), which includes
|
||||
over **1 million unique vectors**.
|
||||
|
||||
spaCy is able to compare two objects, and make a prediction of **how similar
|
||||
they are**. Predicting similarity is useful for building recommendation systems
|
||||
|
|
|
@ -714,8 +714,8 @@ print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)
|
|||
```
|
||||
|
||||
For the best results, you should run this example using the
|
||||
[`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model (currently not
|
||||
available in the live demo).
|
||||
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) model (currently
|
||||
not available in the live demo).
|
||||
|
||||
<Infobox>
|
||||
|
||||
|
|
|
@ -95,8 +95,9 @@ pruning the vectors will be taken care of automatically if you set the
|
|||
`--prune-vectors` flag. You can also do it manually in the following steps:
|
||||
|
||||
1. Start with a **word vectors model** that covers a huge vocabulary. For
|
||||
instance, the [`en_vectors_web_lg`](/models/en#en_vectors_web_lg) model
|
||||
provides 300-dimensional GloVe vectors for over 1 million terms of English.
|
||||
instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg)
|
||||
model provides 300-dimensional GloVe vectors for over 1 million terms of
|
||||
English.
|
||||
2. If your vocabulary has values set for the `Lexeme.prob` attribute, the
|
||||
lexemes will be sorted by descending probability to determine which vectors
|
||||
to prune. Otherwise, lexemes will be sorted by their order in the `Vocab`.
|
||||
|
@ -203,7 +204,7 @@ nlp.vocab.vectors.from_glove("/path/to/vectors")
|
|||
|
||||
If your instance of `Language` already contains vectors, they will be
|
||||
overwritten. To create your own GloVe vectors model package like spaCy's
|
||||
[`en_vectors_web_lg`](/models/en#en_vectors_web_lg), you can call
|
||||
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call
|
||||
[`nlp.to_disk`](/api/language#to_disk), and then package the model using the
|
||||
[`package`](/api/cli#package) command.
|
||||
|
||||
|
|
|
@ -33,6 +33,7 @@ exports.createPages = ({ graphql, actions }) => {
|
|||
code
|
||||
name
|
||||
models
|
||||
starters
|
||||
example
|
||||
has_examples
|
||||
}
|
||||
|
@ -210,6 +211,8 @@ exports.createPages = ({ graphql, actions }) => {
|
|||
|
||||
const langs = result.data.site.siteMetadata.languages
|
||||
const modelLangs = langs.filter(({ models }) => models && models.length)
|
||||
const starterLangs = langs.filter(({ starters }) => starters && starters.length)
|
||||
|
||||
modelLangs.forEach(({ code, name, models, example, has_examples }, i) => {
|
||||
const slug = `/models/${code}`
|
||||
const next = i < modelLangs.length - 1 ? modelLangs[i + 1] : null
|
||||
|
@ -229,6 +232,28 @@ exports.createPages = ({ graphql, actions }) => {
|
|||
},
|
||||
})
|
||||
})
|
||||
|
||||
starterLangs.forEach(({ code, name, starters }, i) => {
|
||||
const slug = `/models/${code}-starters`
|
||||
const next = i < starterLangs.length - 1 ? starterLangs[i + 1] : null
|
||||
createPage({
|
||||
path: slug,
|
||||
component: DEFAULT_TEMPLATE,
|
||||
context: {
|
||||
id: `${code}-starters`,
|
||||
slug: slug,
|
||||
isIndex: false,
|
||||
title: name,
|
||||
section: 'models',
|
||||
sectionTitle: sections.models.title,
|
||||
theme: sections.models.theme,
|
||||
next: next
|
||||
? { title: next.name, slug: `/models/${next.code}-starters` }
|
||||
: null,
|
||||
meta: { models: starters, isStarters: true },
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
)
|
||||
})
|
||||
|
|
|
@ -3,10 +3,8 @@
|
|||
{
|
||||
"code": "en",
|
||||
"name": "English",
|
||||
"models": [
|
||||
"en_core_web_sm",
|
||||
"en_core_web_md",
|
||||
"en_core_web_lg",
|
||||
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
|
||||
"starters": [
|
||||
"en_vectors_web_lg",
|
||||
"en_trf_bertbaseuncased_lg",
|
||||
"en_trf_robertabase_lg",
|
||||
|
@ -19,7 +17,8 @@
|
|||
{
|
||||
"code": "de",
|
||||
"name": "German",
|
||||
"models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
|
||||
"models": ["de_core_news_sm", "de_core_news_md"],
|
||||
"starters": ["de_trf_bertbasecased_lg"],
|
||||
"example": "Dies ist ein Satz.",
|
||||
"has_examples": true
|
||||
},
|
||||
|
|
|
@ -41,7 +41,11 @@
|
|||
"items": [{ "text": "Overview", "url": "/models" }]
|
||||
},
|
||||
{
|
||||
"label": "Language Models",
|
||||
"label": "Core Models",
|
||||
"items": []
|
||||
},
|
||||
{
|
||||
"label": "Starter Models",
|
||||
"items": []
|
||||
}
|
||||
]
|
||||
|
|
|
@ -50,6 +50,17 @@ const Docs = ({ pageContext, children }) => (
|
|||
id: model,
|
||||
})),
|
||||
}))
|
||||
sidebar.items[2].items = languages
|
||||
.filter(({ starters }) => starters && starters.length)
|
||||
.map(lang => ({
|
||||
text: lang.name,
|
||||
url: `/models/${lang.code}-starters`,
|
||||
isActive: id === `${lang.code}-starters`,
|
||||
menu: lang.starters.map(model => ({
|
||||
text: model,
|
||||
id: model,
|
||||
})),
|
||||
}))
|
||||
}
|
||||
const sourcePath = source ? github(source) : null
|
||||
const currentSource = getCurrentSource(slug, isIndex)
|
||||
|
@ -133,6 +144,7 @@ const query = graphql`
|
|||
code
|
||||
name
|
||||
models
|
||||
starters
|
||||
}
|
||||
sidebars {
|
||||
section
|
||||
|
|
|
@ -331,7 +331,7 @@ const Models = ({ pageContext, repo, children }) => {
|
|||
const [initialized, setInitialized] = useState(false)
|
||||
const [compatibility, setCompatibility] = useState({})
|
||||
const { id, title, meta } = pageContext
|
||||
const { models } = meta
|
||||
const { models, isStarters } = meta
|
||||
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
|
||||
|
||||
useEffect(() => {
|
||||
|
@ -345,9 +345,27 @@ const Models = ({ pageContext, repo, children }) => {
|
|||
}
|
||||
}, [initialized, baseUrl])
|
||||
|
||||
const modelTitle = title
|
||||
const modelTeaser = `Available pretrained statistical models for ${title}`
|
||||
|
||||
const starterTitle = `${title} starters`
|
||||
const starterTeaser = `Available transfer learning starter packs for ${title}`
|
||||
|
||||
return (
|
||||
<>
|
||||
<Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
|
||||
<Title
|
||||
title={isStarters ? starterTitle : modelTitle}
|
||||
teaser={isStarters ? starterTeaser : modelTeaser}
|
||||
/>
|
||||
{isStarters && (
|
||||
<Section>
|
||||
<p>
|
||||
Starter packs are pretrained weights you can initialize your models with to
|
||||
achieve better accuracy. They can include word vectors (which will be used
|
||||
as features during training) or other pretrained representations like BERT.
|
||||
</p>
|
||||
</Section>
|
||||
)}
|
||||
<StaticQuery
|
||||
query={query}
|
||||
render={({ site }) =>
|
||||
|
@ -360,7 +378,6 @@ const Models = ({ pageContext, repo, children }) => {
|
|||
compatibility={compatibility}
|
||||
baseUrl={baseUrl}
|
||||
repo={repo}
|
||||
hasExamples={meta.hasExamples}
|
||||
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
|
||||
/>
|
||||
))
|
||||
|
|
|
@ -56,7 +56,11 @@ function getCounts(langs = []) {
|
|||
return {
|
||||
langs: langs.length,
|
||||
modelLangs: langs.filter(({ models }) => models && !!models.length).length,
|
||||
starterLangs: langs.filter(({ starters }) => starters && !!starters.length).length,
|
||||
models: langs.map(({ models }) => (models ? models.length : 0)).reduce((a, b) => a + b, 0),
|
||||
starters: langs
|
||||
.map(({ starters }) => (starters ? starters.length : 0))
|
||||
.reduce((a, b) => a + b, 0),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -270,6 +274,7 @@ const landingQuery = graphql`
|
|||
repo
|
||||
languages {
|
||||
models
|
||||
starters
|
||||
}
|
||||
logosUsers {
|
||||
id
|
||||
|
|
Loading…
Reference in New Issue
Block a user