Merge remote-tracking branch 'upstream/develop' into fix/nr_features

# Conflicts:
#	spacy/ml/models/parser.py
#	spacy/tests/serialize/test_serialize_config.py
#	website/docs/api/architectures.md
This commit is contained in:
svlandeg 2020-09-23 17:01:13 +02:00
commit 35dbc63578
10 changed files with 68 additions and 75 deletions

View File

@ -20,6 +20,7 @@ pytokenizations
setuptools setuptools
packaging packaging
importlib_metadata>=0.20; python_version < "3.8" importlib_metadata>=0.20; python_version < "3.8"
typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies # Development dependencies
cython>=0.25 cython>=0.25
pytest>=4.6.5 pytest>=4.6.5

View File

@ -57,6 +57,7 @@ install_requires =
setuptools setuptools
packaging packaging
importlib_metadata>=0.20; python_version < "3.8" importlib_metadata>=0.20; python_version < "3.8"
typing_extensions>=3.7.4; python_version < "3.8"
[options.entry_points] [options.entry_points]
console_scripts = console_scripts =

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a21" __version__ = "3.0.0a22"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -284,7 +284,7 @@ vectors = "{{ word_vectors }}"
{% endif -%} {% endif -%}
{% if use_transformer -%} {% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }} accumulate_gradient = {{ transformer["size_factor"] }}
{% endif %} {% endif -%}
dev_corpus = "corpora.dev" dev_corpus = "corpora.dev"
train_corpus = "corpora.train" train_corpus = "corpora.train"

View File

@ -152,7 +152,8 @@ def train(
exclude=frozen_components, exclude=frozen_components,
) )
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}") msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row, finalize_logger = train_logger(nlp) with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try: try:
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
@ -163,7 +164,8 @@ def train(
progress.close() progress.close()
print_row(info) print_row(info)
if is_best_checkpoint and output_path is not None: if is_best_checkpoint and output_path is not None:
update_meta(T_cfg, nlp, info) with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
nlp.to_disk(output_path / "model-best") nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)

View File

@ -22,6 +22,11 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
try: # Python 3.8+
from typing import Literal
except ImportError:
from typing_extensions import Literal # noqa: F401
from thinc.api import Optimizer # noqa: F401 from thinc.api import Optimizer # noqa: F401
pickle = pickle pickle = pickle

View File

@ -11,9 +11,11 @@ def console_logger():
def setup_printer( def setup_printer(
nlp: "Language", nlp: "Language",
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
# we assume here that only components are enabled that should be trained & logged
logged_pipes = nlp.pipe_names
score_cols = list(nlp.config["training"]["score_weights"]) score_cols = list(nlp.config["training"]["score_weights"])
score_widths = [max(len(col), 6) for col in score_cols] score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
loss_widths = [max(len(col), 8) for col in loss_cols] loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"] table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header] table_header = [col.upper() for col in table_header]
@ -26,7 +28,7 @@ def console_logger():
try: try:
losses = [ losses = [
"{0:.2f}".format(float(info["losses"][pipe_name])) "{0:.2f}".format(float(info["losses"][pipe_name]))
for pipe_name in nlp.pipe_names for pipe_name in logged_pipes
] ]
except KeyError as e: except KeyError as e:
raise KeyError( raise KeyError(

View File

@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
| System | POS | UAS | LAS | | System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: | | ------------------------------------------------------------------------------ | ---: | ---: | ---: |
| spaCy RoBERTa (2020) | | | | | spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 |
| spaCy CNN (2020) | | | | | spaCy CNN (2020) | | | |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 | | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 | | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.
**Accuracy on the Penn Treebank.** See **Accuracy on the Penn Treebank.** See
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
results. results. For spaCy's evaluation, see the
[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
</figcaption> </figcaption>

View File

@ -1,4 +1,4 @@
import React from 'react' import React, { Fragment } from 'react'
import PropTypes from 'prop-types' import PropTypes from 'prop-types'
import classNames from 'classnames' import classNames from 'classnames'
@ -14,13 +14,14 @@ export default function Infobox({
className, className,
children, children,
}) { }) {
const Wrapper = id ? 'div' : Fragment
const infoboxClassNames = classNames(classes.root, className, { const infoboxClassNames = classNames(classes.root, className, {
[classes.list]: !!list, [classes.list]: !!list,
[classes.warning]: variant === 'warning', [classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger', [classes.danger]: variant === 'danger',
}) })
return ( return (
<> <Wrapper>
{id && <a id={id} />} {id && <a id={id} />}
<aside className={infoboxClassNames}> <aside className={infoboxClassNames}>
{title && ( {title && (
@ -40,7 +41,7 @@ export default function Infobox({
)} )}
{children} {children}
</aside> </aside>
</> </Wrapper>
) )
} }

View File

@ -12,7 +12,6 @@ import Tag from '../components/tag'
import { H2, Label } from '../components/typography' import { H2, Label } from '../components/typography'
import Icon from '../components/icon' import Icon from '../components/icon'
import Link from '../components/link' import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox' import Infobox from '../components/infobox'
import Accordion from '../components/accordion' import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@ -31,10 +30,16 @@ const MODEL_META = {
wiki: 'Wikipedia', wiki: 'Wikipedia',
uas: 'Unlabelled dependencies', uas: 'Unlabelled dependencies',
las: 'Labelled dependencies', las: 'Labelled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Entities (F-score)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_p: 'Entities (precision)', ents_f: 'Named entities (F-score)',
ents_r: 'Entities (recall)', ents_p: 'Named entities (precision)',
ents_r: 'Named entities (recall)',
sent_f: 'Sentence segmentation (F-score)',
sent_p: 'Sentence segmentation (precision)',
sent_r: 'Sentence segmentation (recall)',
cpu: 'words per second on CPU', cpu: 'words per second on CPU',
gpu: 'words per second on GPU', gpu: 'words per second on GPU',
pipeline: 'Active processing pipeline components in order', pipeline: 'Active processing pipeline components in order',
@ -83,25 +88,19 @@ function formatVectors(data) {
} }
function formatAccuracy(data) { function formatAccuracy(data) {
if (!data) return null if (!data) return []
const labels = {
las: 'LAS',
uas: 'UAS',
tags_acc: 'TAG',
ents_f: 'NER F',
ents_p: 'NER P',
ents_r: 'NER R',
}
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
const isNer = key => key.startsWith('ents_')
return Object.keys(data) return Object.keys(data)
.filter(key => labels[key]) .map(label => {
.map(key => ({ const value = data[label]
label: labels[key], return isNaN(value)
value: data[key].toFixed(2), ? null
help: MODEL_META[key], : {
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, label,
})) value: value.toFixed(2),
help: MODEL_META[label],
}
})
.filter(item => item)
} }
function formatModelMeta(data) { function formatModelMeta(data) {
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{ label: 'Author', content: author }, { label: 'Author', content: author },
{ label: 'License', content: license }, { label: 'License', content: license },
] ]
const accuracy = [
{
label: 'Syntax Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
},
{
label: 'NER Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
},
]
const error = ( const error = (
<Infobox title="Unable to load model details from GitHub" variant="danger"> <Infobox title="Unable to load model details from GitHub" variant="danger">
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
</p> </p>
</Infobox> </Infobox>
) )
return ( return (
<Section id={name}> <Section id={name}>
<H2 <H2
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)} )}
</tbody> </tbody>
</Table> </Table>
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
<Table fixed key={i}>
<thead>
<Tr>
<Th colSpan={2}>{label}</Th>
</Tr>
</thead>
<tbody>
{items.map((item, i) => (
<Tr key={i}>
<Td>
<Label>
{item.label}{' '}
{item.help && <Help>{item.help}</Help>}
</Label>
</Td>
<Td num>{item.value}</Td>
</Tr>
))}
</tbody>
</Table>
)
)}
</Grid>
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && ( {hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}> <CodeBlock title="Try out the model" lang="python" executable={true}>
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
`import spacy`, `import spacy`,
`from spacy.lang.${langId}.examples import sentences `, `from spacy.lang.${langId}.examples import sentences `,
``, ``,
`nlp = spacy.load('${name}')`, `nlp = spacy.load("${name}")`,
`doc = nlp(sentences[0])`, `doc = nlp(sentences[0])`,
`print(doc.text)`, `print(doc.text)`,
`for token in doc:`, `for token in doc:`,
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')} ].join('\n')}
</CodeBlock> </CodeBlock>
)} )}
{meta.accuracy && (
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
<Table>
<tbody>
{meta.accuracy.map(({ label, value, help }) => (
<Tr key={`${name}-${label}`}>
<Td nowrap>
<InlineCode>{label.toUpperCase()}</InlineCode>
</Td>
<Td>{help}</Td>
<Td num style={{ textAlign: 'right' }}>
{value}
</Td>
</Tr>
))}
</tbody>
</Table>
</Accordion>
)}
{labels && ( {labels && (
<Accordion id={`${name}-labels`} title="Label Scheme"> <Accordion id={`${name}-labels`} title="Label Scheme">
<p> <p>
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const labelNames = labels[pipe] || [] const labelNames = labels[pipe] || []
const help = LABEL_SCHEME_META[pipe] const help = LABEL_SCHEME_META[pipe]
return ( return (
<Tr key={pipe} evenodd={false} key={pipe}> <Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
<Td style={{ width: '20%' }}> <Td style={{ width: '20%' }}>
<Label> <Label>
{pipe} {help && <Help>{help}</Help>} {pipe} {help && <Help>{help}</Help>}
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => { const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false) const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({}) const [compatibility, setCompatibility] = useState({})
const { id, title, meta } = pageContext const { id, title, meta, hasExamples } = pageContext
const { models, isStarters } = meta const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master` const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
const modelTitle = title const modelTitle = title
const modelTeaser = `Available trained pipelines for ${title}` const modelTeaser = `Available trained pipelines for ${title}`
const starterTitle = `${title} starters` const starterTitle = `${title} starters`
const starterTeaser = `Available transfer learning starter packs for ${title}` const starterTeaser = `Available transfer learning starter packs for ${title}`
@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
baseUrl={baseUrl} baseUrl={baseUrl}
repo={repo} repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')} licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
/> />
)) ))
} }