mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
Merge remote-tracking branch 'upstream/develop' into fix/nr_features
# Conflicts: # spacy/ml/models/parser.py # spacy/tests/serialize/test_serialize_config.py # website/docs/api/architectures.md
This commit is contained in:
commit
35dbc63578
|
@ -20,6 +20,7 @@ pytokenizations
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
importlib_metadata>=0.20; python_version < "3.8"
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
|
typing_extensions>=3.7.4; python_version < "3.8"
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
pytest>=4.6.5
|
pytest>=4.6.5
|
||||||
|
|
|
@ -57,6 +57,7 @@ install_requires =
|
||||||
setuptools
|
setuptools
|
||||||
packaging
|
packaging
|
||||||
importlib_metadata>=0.20; python_version < "3.8"
|
importlib_metadata>=0.20; python_version < "3.8"
|
||||||
|
typing_extensions>=3.7.4; python_version < "3.8"
|
||||||
|
|
||||||
[options.entry_points]
|
[options.entry_points]
|
||||||
console_scripts =
|
console_scripts =
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy-nightly"
|
__title__ = "spacy-nightly"
|
||||||
__version__ = "3.0.0a21"
|
__version__ = "3.0.0a22"
|
||||||
__release__ = True
|
__release__ = True
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
|
|
|
@ -284,7 +284,7 @@ vectors = "{{ word_vectors }}"
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
{% endif %}
|
{% endif -%}
|
||||||
dev_corpus = "corpora.dev"
|
dev_corpus = "corpora.dev"
|
||||||
train_corpus = "corpora.train"
|
train_corpus = "corpora.train"
|
||||||
|
|
||||||
|
|
|
@ -152,7 +152,8 @@ def train(
|
||||||
exclude=frozen_components,
|
exclude=frozen_components,
|
||||||
)
|
)
|
||||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||||
print_row, finalize_logger = train_logger(nlp)
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
|
print_row, finalize_logger = train_logger(nlp)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||||
|
@ -163,7 +164,8 @@ def train(
|
||||||
progress.close()
|
progress.close()
|
||||||
print_row(info)
|
print_row(info)
|
||||||
if is_best_checkpoint and output_path is not None:
|
if is_best_checkpoint and output_path is not None:
|
||||||
update_meta(T_cfg, nlp, info)
|
with nlp.select_pipes(disable=frozen_components):
|
||||||
|
update_meta(T_cfg, nlp, info)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
nlp.to_disk(output_path / "model-best")
|
nlp.to_disk(output_path / "model-best")
|
||||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||||
|
|
|
@ -22,6 +22,11 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
cupy = None
|
cupy = None
|
||||||
|
|
||||||
|
try: # Python 3.8+
|
||||||
|
from typing import Literal
|
||||||
|
except ImportError:
|
||||||
|
from typing_extensions import Literal # noqa: F401
|
||||||
|
|
||||||
from thinc.api import Optimizer # noqa: F401
|
from thinc.api import Optimizer # noqa: F401
|
||||||
|
|
||||||
pickle = pickle
|
pickle = pickle
|
||||||
|
|
|
@ -11,9 +11,11 @@ def console_logger():
|
||||||
def setup_printer(
|
def setup_printer(
|
||||||
nlp: "Language",
|
nlp: "Language",
|
||||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
||||||
|
# we assume here that only components are enabled that should be trained & logged
|
||||||
|
logged_pipes = nlp.pipe_names
|
||||||
score_cols = list(nlp.config["training"]["score_weights"])
|
score_cols = list(nlp.config["training"]["score_weights"])
|
||||||
score_widths = [max(len(col), 6) for col in score_cols]
|
score_widths = [max(len(col), 6) for col in score_cols]
|
||||||
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
|
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
|
||||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||||
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
|
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
|
||||||
table_header = [col.upper() for col in table_header]
|
table_header = [col.upper() for col in table_header]
|
||||||
|
@ -26,7 +28,7 @@ def console_logger():
|
||||||
try:
|
try:
|
||||||
losses = [
|
losses = [
|
||||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||||
for pipe_name in nlp.pipe_names
|
for pipe_name in logged_pipes
|
||||||
]
|
]
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
raise KeyError(
|
raise KeyError(
|
||||||
|
|
|
@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
|
||||||
|
|
||||||
| System | POS | UAS | LAS |
|
| System | POS | UAS | LAS |
|
||||||
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
|
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
|
||||||
| spaCy RoBERTa (2020) | | | |
|
| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 |
|
||||||
| spaCy CNN (2020) | | | |
|
| spaCy CNN (2020) | | | |
|
||||||
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
|
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
|
||||||
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
|
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
|
||||||
|
@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.
|
||||||
|
|
||||||
**Accuracy on the Penn Treebank.** See
|
**Accuracy on the Penn Treebank.** See
|
||||||
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
|
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
|
||||||
results.
|
results. For spaCy's evaluation, see the
|
||||||
|
[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
|
||||||
|
|
||||||
</figcaption>
|
</figcaption>
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import React from 'react'
|
import React, { Fragment } from 'react'
|
||||||
import PropTypes from 'prop-types'
|
import PropTypes from 'prop-types'
|
||||||
import classNames from 'classnames'
|
import classNames from 'classnames'
|
||||||
|
|
||||||
|
@ -14,13 +14,14 @@ export default function Infobox({
|
||||||
className,
|
className,
|
||||||
children,
|
children,
|
||||||
}) {
|
}) {
|
||||||
|
const Wrapper = id ? 'div' : Fragment
|
||||||
const infoboxClassNames = classNames(classes.root, className, {
|
const infoboxClassNames = classNames(classes.root, className, {
|
||||||
[classes.list]: !!list,
|
[classes.list]: !!list,
|
||||||
[classes.warning]: variant === 'warning',
|
[classes.warning]: variant === 'warning',
|
||||||
[classes.danger]: variant === 'danger',
|
[classes.danger]: variant === 'danger',
|
||||||
})
|
})
|
||||||
return (
|
return (
|
||||||
<>
|
<Wrapper>
|
||||||
{id && <a id={id} />}
|
{id && <a id={id} />}
|
||||||
<aside className={infoboxClassNames}>
|
<aside className={infoboxClassNames}>
|
||||||
{title && (
|
{title && (
|
||||||
|
@ -40,7 +41,7 @@ export default function Infobox({
|
||||||
)}
|
)}
|
||||||
{children}
|
{children}
|
||||||
</aside>
|
</aside>
|
||||||
</>
|
</Wrapper>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,6 @@ import Tag from '../components/tag'
|
||||||
import { H2, Label } from '../components/typography'
|
import { H2, Label } from '../components/typography'
|
||||||
import Icon from '../components/icon'
|
import Icon from '../components/icon'
|
||||||
import Link from '../components/link'
|
import Link from '../components/link'
|
||||||
import Grid from '../components/grid'
|
|
||||||
import Infobox from '../components/infobox'
|
import Infobox from '../components/infobox'
|
||||||
import Accordion from '../components/accordion'
|
import Accordion from '../components/accordion'
|
||||||
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
|
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
|
||||||
|
@ -31,10 +30,16 @@ const MODEL_META = {
|
||||||
wiki: 'Wikipedia',
|
wiki: 'Wikipedia',
|
||||||
uas: 'Unlabelled dependencies',
|
uas: 'Unlabelled dependencies',
|
||||||
las: 'Labelled dependencies',
|
las: 'Labelled dependencies',
|
||||||
|
token_acc: 'Tokenization',
|
||||||
|
tok: 'Tokenization',
|
||||||
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
|
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
|
||||||
ents_f: 'Entities (F-score)',
|
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
|
||||||
ents_p: 'Entities (precision)',
|
ents_f: 'Named entities (F-score)',
|
||||||
ents_r: 'Entities (recall)',
|
ents_p: 'Named entities (precision)',
|
||||||
|
ents_r: 'Named entities (recall)',
|
||||||
|
sent_f: 'Sentence segmentation (F-score)',
|
||||||
|
sent_p: 'Sentence segmentation (precision)',
|
||||||
|
sent_r: 'Sentence segmentation (recall)',
|
||||||
cpu: 'words per second on CPU',
|
cpu: 'words per second on CPU',
|
||||||
gpu: 'words per second on GPU',
|
gpu: 'words per second on GPU',
|
||||||
pipeline: 'Active processing pipeline components in order',
|
pipeline: 'Active processing pipeline components in order',
|
||||||
|
@ -83,25 +88,19 @@ function formatVectors(data) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatAccuracy(data) {
|
function formatAccuracy(data) {
|
||||||
if (!data) return null
|
if (!data) return []
|
||||||
const labels = {
|
|
||||||
las: 'LAS',
|
|
||||||
uas: 'UAS',
|
|
||||||
tags_acc: 'TAG',
|
|
||||||
ents_f: 'NER F',
|
|
||||||
ents_p: 'NER P',
|
|
||||||
ents_r: 'NER R',
|
|
||||||
}
|
|
||||||
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
|
|
||||||
const isNer = key => key.startsWith('ents_')
|
|
||||||
return Object.keys(data)
|
return Object.keys(data)
|
||||||
.filter(key => labels[key])
|
.map(label => {
|
||||||
.map(key => ({
|
const value = data[label]
|
||||||
label: labels[key],
|
return isNaN(value)
|
||||||
value: data[key].toFixed(2),
|
? null
|
||||||
help: MODEL_META[key],
|
: {
|
||||||
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
|
label,
|
||||||
}))
|
value: value.toFixed(2),
|
||||||
|
help: MODEL_META[label],
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter(item => item)
|
||||||
}
|
}
|
||||||
|
|
||||||
function formatModelMeta(data) {
|
function formatModelMeta(data) {
|
||||||
|
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
{ label: 'Author', content: author },
|
{ label: 'Author', content: author },
|
||||||
{ label: 'License', content: license },
|
{ label: 'License', content: license },
|
||||||
]
|
]
|
||||||
const accuracy = [
|
|
||||||
{
|
|
||||||
label: 'Syntax Accuracy',
|
|
||||||
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
label: 'NER Accuracy',
|
|
||||||
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
const error = (
|
const error = (
|
||||||
<Infobox title="Unable to load model details from GitHub" variant="danger">
|
<Infobox title="Unable to load model details from GitHub" variant="danger">
|
||||||
|
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
</p>
|
</p>
|
||||||
</Infobox>
|
</Infobox>
|
||||||
)
|
)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<Section id={name}>
|
<Section id={name}>
|
||||||
<H2
|
<H2
|
||||||
|
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
)}
|
)}
|
||||||
</tbody>
|
</tbody>
|
||||||
</Table>
|
</Table>
|
||||||
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
|
|
||||||
{accuracy &&
|
|
||||||
accuracy.map(({ label, items }, i) =>
|
|
||||||
!items ? null : (
|
|
||||||
<Table fixed key={i}>
|
|
||||||
<thead>
|
|
||||||
<Tr>
|
|
||||||
<Th colSpan={2}>{label}</Th>
|
|
||||||
</Tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
{items.map((item, i) => (
|
|
||||||
<Tr key={i}>
|
|
||||||
<Td>
|
|
||||||
<Label>
|
|
||||||
{item.label}{' '}
|
|
||||||
{item.help && <Help>{item.help}</Help>}
|
|
||||||
</Label>
|
|
||||||
</Td>
|
|
||||||
<Td num>{item.value}</Td>
|
|
||||||
</Tr>
|
|
||||||
))}
|
|
||||||
</tbody>
|
|
||||||
</Table>
|
|
||||||
)
|
|
||||||
)}
|
|
||||||
</Grid>
|
|
||||||
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
|
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
|
||||||
{hasInteractiveCode && (
|
{hasInteractiveCode && (
|
||||||
<CodeBlock title="Try out the model" lang="python" executable={true}>
|
<CodeBlock title="Try out the model" lang="python" executable={true}>
|
||||||
|
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
`import spacy`,
|
`import spacy`,
|
||||||
`from spacy.lang.${langId}.examples import sentences `,
|
`from spacy.lang.${langId}.examples import sentences `,
|
||||||
``,
|
``,
|
||||||
`nlp = spacy.load('${name}')`,
|
`nlp = spacy.load("${name}")`,
|
||||||
`doc = nlp(sentences[0])`,
|
`doc = nlp(sentences[0])`,
|
||||||
`print(doc.text)`,
|
`print(doc.text)`,
|
||||||
`for token in doc:`,
|
`for token in doc:`,
|
||||||
|
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
].join('\n')}
|
].join('\n')}
|
||||||
</CodeBlock>
|
</CodeBlock>
|
||||||
)}
|
)}
|
||||||
|
{meta.accuracy && (
|
||||||
|
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
|
||||||
|
<Table>
|
||||||
|
<tbody>
|
||||||
|
{meta.accuracy.map(({ label, value, help }) => (
|
||||||
|
<Tr key={`${name}-${label}`}>
|
||||||
|
<Td nowrap>
|
||||||
|
<InlineCode>{label.toUpperCase()}</InlineCode>
|
||||||
|
</Td>
|
||||||
|
<Td>{help}</Td>
|
||||||
|
<Td num style={{ textAlign: 'right' }}>
|
||||||
|
{value}
|
||||||
|
</Td>
|
||||||
|
</Tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</Table>
|
||||||
|
</Accordion>
|
||||||
|
)}
|
||||||
{labels && (
|
{labels && (
|
||||||
<Accordion id={`${name}-labels`} title="Label Scheme">
|
<Accordion id={`${name}-labels`} title="Label Scheme">
|
||||||
<p>
|
<p>
|
||||||
|
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
const labelNames = labels[pipe] || []
|
const labelNames = labels[pipe] || []
|
||||||
const help = LABEL_SCHEME_META[pipe]
|
const help = LABEL_SCHEME_META[pipe]
|
||||||
return (
|
return (
|
||||||
<Tr key={pipe} evenodd={false} key={pipe}>
|
<Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
|
||||||
<Td style={{ width: '20%' }}>
|
<Td style={{ width: '20%' }}>
|
||||||
<Label>
|
<Label>
|
||||||
{pipe} {help && <Help>{help}</Help>}
|
{pipe} {help && <Help>{help}</Help>}
|
||||||
|
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
||||||
const Models = ({ pageContext, repo, children }) => {
|
const Models = ({ pageContext, repo, children }) => {
|
||||||
const [initialized, setInitialized] = useState(false)
|
const [initialized, setInitialized] = useState(false)
|
||||||
const [compatibility, setCompatibility] = useState({})
|
const [compatibility, setCompatibility] = useState({})
|
||||||
const { id, title, meta } = pageContext
|
const { id, title, meta, hasExamples } = pageContext
|
||||||
const { models, isStarters } = meta
|
const { models, isStarters } = meta
|
||||||
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
|
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
|
||||||
|
|
||||||
|
@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
|
||||||
|
|
||||||
const modelTitle = title
|
const modelTitle = title
|
||||||
const modelTeaser = `Available trained pipelines for ${title}`
|
const modelTeaser = `Available trained pipelines for ${title}`
|
||||||
|
|
||||||
const starterTitle = `${title} starters`
|
const starterTitle = `${title} starters`
|
||||||
const starterTeaser = `Available transfer learning starter packs for ${title}`
|
const starterTeaser = `Available transfer learning starter packs for ${title}`
|
||||||
|
|
||||||
|
@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
|
||||||
baseUrl={baseUrl}
|
baseUrl={baseUrl}
|
||||||
repo={repo}
|
repo={repo}
|
||||||
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
|
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
|
||||||
|
hasExamples={meta.hasExamples}
|
||||||
/>
|
/>
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user