Merge remote-tracking branch 'upstream/develop' into fix/nr_features

# Conflicts:
#	spacy/ml/models/parser.py
#	spacy/tests/serialize/test_serialize_config.py
#	website/docs/api/architectures.md
svlandeg 2020-09-23 17:01:13 +02:00
commit 35dbc63578
10 changed files with 68 additions and 75 deletions

View File

@@ -20,6 +20,7 @@ pytokenizations
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5

View File

@@ -57,6 +57,7 @@ install_requires =
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
[options.entry_points]
console_scripts =

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a21"
__version__ = "3.0.0a22"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -284,7 +284,7 @@ vectors = "{{ word_vectors }}"
{% endif -%}
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
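
Note (an illustrative sketch, not part of the commit): the only change in the hunk above is `{% endif %}` → `{% endif -%}`. In Jinja, a trailing `-` inside a block tag strips the whitespace that follows the tag, so a condition that renders nothing no longer leaves a stray blank line in the generated training config. A minimal sketch, assuming `jinja2` is installed and using a hypothetical mini-template:

```python
from jinja2 import Template

# "b" is only rendered when flag is set; compare the two endif variants.
plain = Template("a\n{% if flag %}\nb\n{% endif %}\nc").render(flag=False)
trimmed = Template("a\n{% if flag %}\nb\n{% endif -%}\nc").render(flag=False)

print(repr(plain))    # 'a\n\nc' -> the skipped block leaves an empty line behind
print(repr(trimmed))  # 'a\nc'   -> '-%}' swallows the newline after the tag
```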

View File

@@ -152,6 +152,7 @@ def train(
exclude=frozen_components,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
+with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
@@ -163,6 +164,7 @@ def train(
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
+with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages):
nlp.to_disk(output_path / "model-best")

View File

@@ -22,6 +22,11 @@ try:
except ImportError:
cupy = None
+try: # Python 3.8+
+from typing import Literal
+except ImportError:
+from typing_extensions import Literal # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle
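
A quick illustration (my sketch, not part of the commit) of why the fallback matters: `typing.Literal` only exists on Python 3.8+, so on 3.7 it has to come from `typing_extensions`, which is what the conditional dependency added earlier in this commit provides. Code elsewhere can then import `Literal` unconditionally and use it to constrain arguments to fixed values:

```python
try:  # Python 3.8+
    from typing import Literal
except ImportError:  # Python 3.7 falls back to the backport
    from typing_extensions import Literal


def set_style(style: Literal["dep", "ent"]) -> str:
    # Hypothetical function for illustration: a static type checker flags any
    # call whose argument is not one of the two literal strings.
    return f"rendering with style={style}"


print(set_style("ent"))
```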

View File

@@ -11,9 +11,11 @@ def console_logger():
def setup_printer(
nlp: "Language",
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+# we assume here that only components are enabled that should be trained & logged
+logged_pipes = nlp.pipe_names
score_cols = list(nlp.config["training"]["score_weights"])
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
@@ -26,7 +28,7 @@ def console_logger():
try:
losses = [
"{0:.2f}".format(float(info["losses"][pipe_name]))
-for pipe_name in nlp.pipe_names
+for pipe_name in logged_pipes
]
except KeyError as e:
raise KeyError(

View File

@@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
| System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020) | | | |
+| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 |
| spaCy CNN (2020) | | | |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
@@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.
**Accuracy on the Penn Treebank.** See
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+results. For spaCy's evaluation, see the
+[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
</figcaption>

View File

@@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
import PropTypes from 'prop-types'
import classNames from 'classnames'
@@ -14,13 +14,14 @@ export default function Infobox({
className,
children,
}) {
+const Wrapper = id ? 'div' : Fragment
const infoboxClassNames = classNames(classes.root, className, {
[classes.list]: !!list,
[classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger',
})
return (
-<>
+<Wrapper>
{id && <a id={id} />}
<aside className={infoboxClassNames}>
{title && (
@@ -40,7 +41,7 @@ export default function Infobox({
)}
{children}
</aside>
-</>
+</Wrapper>
)
}

View File

@@ -12,7 +12,6 @@ import Tag from '../components/tag'
import { H2, Label } from '../components/typography'
import Icon from '../components/icon'
import Link from '../components/link'
-import Grid from '../components/grid'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@@ -31,10 +30,16 @@ const MODEL_META = {
wiki: 'Wikipedia',
uas: 'Unlabelled dependencies',
las: 'Labelled dependencies',
token_acc: 'Tokenization',
+tok: 'Tokenization',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
-ents_f: 'Entities (F-score)',
-ents_p: 'Entities (precision)',
-ents_r: 'Entities (recall)',
+tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+ents_f: 'Named entities (F-score)',
+ents_p: 'Named entities (precision)',
+ents_r: 'Named entities (recall)',
+sent_f: 'Sentence segmentation (F-score)',
+sent_p: 'Sentence segmentation (precision)',
+sent_r: 'Sentence segmentation (recall)',
cpu: 'words per second on CPU',
gpu: 'words per second on GPU',
pipeline: 'Active processing pipeline components in order',
@@ -83,25 +88,19 @@ function formatVectors(data) {
}
function formatAccuracy(data) {
-if (!data) return null
-const labels = {
-las: 'LAS',
-uas: 'UAS',
-tags_acc: 'TAG',
-ents_f: 'NER F',
-ents_p: 'NER P',
-ents_r: 'NER R',
-}
-const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
-const isNer = key => key.startsWith('ents_')
+if (!data) return []
return Object.keys(data)
-.filter(key => labels[key])
-.map(key => ({
-label: labels[key],
-value: data[key].toFixed(2),
-help: MODEL_META[key],
-type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
-}))
+.map(label => {
+const value = data[label]
+return isNaN(value)
+? null
+: {
+label,
+value: value.toFixed(2),
+help: MODEL_META[label],
+}
+})
+.filter(item => item)
}
function formatModelMeta(data) {
@@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{ label: 'Author', content: author },
{ label: 'License', content: license },
]
-const accuracy = [
-{
-label: 'Syntax Accuracy',
-items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
-},
-{
-label: 'NER Accuracy',
-items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
-},
-]
const error = (
<Infobox title="Unable to load model details from GitHub" variant="danger">
@@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
</p>
</Infobox>
)
return (
<Section id={name}>
<H2
@@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)}
</tbody>
</Table>
-<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
-{accuracy &&
-accuracy.map(({ label, items }, i) =>
-!items ? null : (
-<Table fixed key={i}>
-<thead>
-<Tr>
-<Th colSpan={2}>{label}</Th>
-</Tr>
-</thead>
-<tbody>
-{items.map((item, i) => (
-<Tr key={i}>
-<Td>
-<Label>
-{item.label}{' '}
-{item.help && <Help>{item.help}</Help>}
-</Label>
-</Td>
-<Td num>{item.value}</Td>
-</Tr>
-))}
-</tbody>
-</Table>
-)
-)}
-</Grid>
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}>
@@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
`import spacy`,
`from spacy.lang.${langId}.examples import sentences `,
``,
-`nlp = spacy.load('${name}')`,
+`nlp = spacy.load("${name}")`,
`doc = nlp(sentences[0])`,
`print(doc.text)`,
`for token in doc:`,
@@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')}
</CodeBlock>
)}
+{meta.accuracy && (
+<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
+<Table>
+<tbody>
+{meta.accuracy.map(({ label, value, help }) => (
+<Tr key={`${name}-${label}`}>
+<Td nowrap>
+<InlineCode>{label.toUpperCase()}</InlineCode>
+</Td>
+<Td>{help}</Td>
+<Td num style={{ textAlign: 'right' }}>
+{value}
+</Td>
+</Tr>
+))}
+</tbody>
+</Table>
+</Accordion>
+)}
{labels && (
<Accordion id={`${name}-labels`} title="Label Scheme">
<p>
@@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const labelNames = labels[pipe] || []
const help = LABEL_SCHEME_META[pipe]
return (
-<Tr key={pipe} evenodd={false} key={pipe}>
+<Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
<Td style={{ width: '20%' }}>
<Label>
{pipe} {help && <Help>{help}</Help>}
@@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
-const { id, title, meta } = pageContext
+const { id, title, meta, hasExamples } = pageContext
const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
const modelTitle = title
const modelTeaser = `Available trained pipelines for ${title}`
const starterTitle = `${title} starters`
const starterTeaser = `Available transfer learning starter packs for ${title}`
@@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
baseUrl={baseUrl}
repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
+hasExamples={meta.hasExamples}
/>
))
}