mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Merge remote-tracking branch 'upstream/develop' into fix/nr_features
# Conflicts: # spacy/ml/models/parser.py # spacy/tests/serialize/test_serialize_config.py # website/docs/api/architectures.md
This commit is contained in:
commit
35dbc63578
|
@ -20,6 +20,7 @@ pytokenizations
|
|||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
typing_extensions>=3.7.4; python_version < "3.8"
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.6.5
|
||||
|
|
|
@ -57,6 +57,7 @@ install_requires =
|
|||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
typing_extensions>=3.7.4; python_version < "3.8"
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a21"
|
||||
__version__ = "3.0.0a22"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -284,7 +284,7 @@ vectors = "{{ word_vectors }}"
|
|||
{% endif -%}
|
||||
{% if use_transformer -%}
|
||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||
{% endif %}
|
||||
{% endif -%}
|
||||
dev_corpus = "corpora.dev"
|
||||
train_corpus = "corpora.train"
|
||||
|
||||
|
|
|
@ -152,7 +152,8 @@ def train(
|
|||
exclude=frozen_components,
|
||||
)
|
||||
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
print_row, finalize_logger = train_logger(nlp)
|
||||
|
||||
try:
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
|
@ -163,7 +164,8 @@ def train(
|
|||
progress.close()
|
||||
print_row(info)
|
||||
if is_best_checkpoint and output_path is not None:
|
||||
update_meta(T_cfg, nlp, info)
|
||||
with nlp.select_pipes(disable=frozen_components):
|
||||
update_meta(T_cfg, nlp, info)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk(output_path / "model-best")
|
||||
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
|
||||
|
|
|
@ -22,6 +22,11 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try: # Python 3.8+
|
||||
from typing import Literal
|
||||
except ImportError:
|
||||
from typing_extensions import Literal # noqa: F401
|
||||
|
||||
from thinc.api import Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
|
|
|
@ -11,9 +11,11 @@ def console_logger():
|
|||
def setup_printer(
|
||||
nlp: "Language",
|
||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
||||
# we assume here that only components are enabled that should be trained & logged
|
||||
logged_pipes = nlp.pipe_names
|
||||
score_cols = list(nlp.config["training"]["score_weights"])
|
||||
score_widths = [max(len(col), 6) for col in score_cols]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
|
||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
|
||||
table_header = [col.upper() for col in table_header]
|
||||
|
@ -26,7 +28,7 @@ def console_logger():
|
|||
try:
|
||||
losses = [
|
||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||
for pipe_name in nlp.pipe_names
|
||||
for pipe_name in logged_pipes
|
||||
]
|
||||
except KeyError as e:
|
||||
raise KeyError(
|
||||
|
|
|
@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.
|
|||
|
||||
| System | POS | UAS | LAS |
|
||||
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
|
||||
| spaCy RoBERTa (2020) | | | |
|
||||
| spaCy RoBERTa (2020) | 97.8 | 96.6 | 94.7 |
|
||||
| spaCy CNN (2020) | | | |
|
||||
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
|
||||
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
|
||||
|
@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.
|
|||
|
||||
**Accuracy on the Penn Treebank.** See
|
||||
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
|
||||
results.
|
||||
results. For spaCy's evaluation, see the
|
||||
[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).
|
||||
|
||||
</figcaption>
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import React from 'react'
|
||||
import React, { Fragment } from 'react'
|
||||
import PropTypes from 'prop-types'
|
||||
import classNames from 'classnames'
|
||||
|
||||
|
@ -14,13 +14,14 @@ export default function Infobox({
|
|||
className,
|
||||
children,
|
||||
}) {
|
||||
const Wrapper = id ? 'div' : Fragment
|
||||
const infoboxClassNames = classNames(classes.root, className, {
|
||||
[classes.list]: !!list,
|
||||
[classes.warning]: variant === 'warning',
|
||||
[classes.danger]: variant === 'danger',
|
||||
})
|
||||
return (
|
||||
<>
|
||||
<Wrapper>
|
||||
{id && <a id={id} />}
|
||||
<aside className={infoboxClassNames}>
|
||||
{title && (
|
||||
|
@ -40,7 +41,7 @@ export default function Infobox({
|
|||
)}
|
||||
{children}
|
||||
</aside>
|
||||
</>
|
||||
</Wrapper>
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -12,7 +12,6 @@ import Tag from '../components/tag'
|
|||
import { H2, Label } from '../components/typography'
|
||||
import Icon from '../components/icon'
|
||||
import Link from '../components/link'
|
||||
import Grid from '../components/grid'
|
||||
import Infobox from '../components/infobox'
|
||||
import Accordion from '../components/accordion'
|
||||
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
|
||||
|
@ -31,10 +30,16 @@ const MODEL_META = {
|
|||
wiki: 'Wikipedia',
|
||||
uas: 'Unlabelled dependencies',
|
||||
las: 'Labelled dependencies',
|
||||
token_acc: 'Tokenization',
|
||||
tok: 'Tokenization',
|
||||
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
|
||||
ents_f: 'Entities (F-score)',
|
||||
ents_p: 'Entities (precision)',
|
||||
ents_r: 'Entities (recall)',
|
||||
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
|
||||
ents_f: 'Named entities (F-score)',
|
||||
ents_p: 'Named entities (precision)',
|
||||
ents_r: 'Named entities (recall)',
|
||||
sent_f: 'Sentence segmentation (F-score)',
|
||||
sent_p: 'Sentence segmentation (precision)',
|
||||
sent_r: 'Sentence segmentation (recall)',
|
||||
cpu: 'words per second on CPU',
|
||||
gpu: 'words per second on GPU',
|
||||
pipeline: 'Active processing pipeline components in order',
|
||||
|
@ -83,25 +88,19 @@ function formatVectors(data) {
|
|||
}
|
||||
|
||||
function formatAccuracy(data) {
|
||||
if (!data) return null
|
||||
const labels = {
|
||||
las: 'LAS',
|
||||
uas: 'UAS',
|
||||
tags_acc: 'TAG',
|
||||
ents_f: 'NER F',
|
||||
ents_p: 'NER P',
|
||||
ents_r: 'NER R',
|
||||
}
|
||||
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
|
||||
const isNer = key => key.startsWith('ents_')
|
||||
if (!data) return []
|
||||
return Object.keys(data)
|
||||
.filter(key => labels[key])
|
||||
.map(key => ({
|
||||
label: labels[key],
|
||||
value: data[key].toFixed(2),
|
||||
help: MODEL_META[key],
|
||||
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
|
||||
}))
|
||||
.map(label => {
|
||||
const value = data[label]
|
||||
return isNaN(value)
|
||||
? null
|
||||
: {
|
||||
label,
|
||||
value: value.toFixed(2),
|
||||
help: MODEL_META[label],
|
||||
}
|
||||
})
|
||||
.filter(item => item)
|
||||
}
|
||||
|
||||
function formatModelMeta(data) {
|
||||
|
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
{ label: 'Author', content: author },
|
||||
{ label: 'License', content: license },
|
||||
]
|
||||
const accuracy = [
|
||||
{
|
||||
label: 'Syntax Accuracy',
|
||||
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
|
||||
},
|
||||
{
|
||||
label: 'NER Accuracy',
|
||||
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
|
||||
},
|
||||
]
|
||||
|
||||
const error = (
|
||||
<Infobox title="Unable to load model details from GitHub" variant="danger">
|
||||
|
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
</p>
|
||||
</Infobox>
|
||||
)
|
||||
|
||||
return (
|
||||
<Section id={name}>
|
||||
<H2
|
||||
|
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
)}
|
||||
</tbody>
|
||||
</Table>
|
||||
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
|
||||
{accuracy &&
|
||||
accuracy.map(({ label, items }, i) =>
|
||||
!items ? null : (
|
||||
<Table fixed key={i}>
|
||||
<thead>
|
||||
<Tr>
|
||||
<Th colSpan={2}>{label}</Th>
|
||||
</Tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{items.map((item, i) => (
|
||||
<Tr key={i}>
|
||||
<Td>
|
||||
<Label>
|
||||
{item.label}{' '}
|
||||
{item.help && <Help>{item.help}</Help>}
|
||||
</Label>
|
||||
</Td>
|
||||
<Td num>{item.value}</Td>
|
||||
</Tr>
|
||||
))}
|
||||
</tbody>
|
||||
</Table>
|
||||
)
|
||||
)}
|
||||
</Grid>
|
||||
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
|
||||
{hasInteractiveCode && (
|
||||
<CodeBlock title="Try out the model" lang="python" executable={true}>
|
||||
|
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
`import spacy`,
|
||||
`from spacy.lang.${langId}.examples import sentences `,
|
||||
``,
|
||||
`nlp = spacy.load('${name}')`,
|
||||
`nlp = spacy.load("${name}")`,
|
||||
`doc = nlp(sentences[0])`,
|
||||
`print(doc.text)`,
|
||||
`for token in doc:`,
|
||||
|
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
].join('\n')}
|
||||
</CodeBlock>
|
||||
)}
|
||||
{meta.accuracy && (
|
||||
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
|
||||
<Table>
|
||||
<tbody>
|
||||
{meta.accuracy.map(({ label, value, help }) => (
|
||||
<Tr key={`${name}-${label}`}>
|
||||
<Td nowrap>
|
||||
<InlineCode>{label.toUpperCase()}</InlineCode>
|
||||
</Td>
|
||||
<Td>{help}</Td>
|
||||
<Td num style={{ textAlign: 'right' }}>
|
||||
{value}
|
||||
</Td>
|
||||
</Tr>
|
||||
))}
|
||||
</tbody>
|
||||
</Table>
|
||||
</Accordion>
|
||||
)}
|
||||
{labels && (
|
||||
<Accordion id={`${name}-labels`} title="Label Scheme">
|
||||
<p>
|
||||
|
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
const labelNames = labels[pipe] || []
|
||||
const help = LABEL_SCHEME_META[pipe]
|
||||
return (
|
||||
<Tr key={pipe} evenodd={false} key={pipe}>
|
||||
<Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
|
||||
<Td style={{ width: '20%' }}>
|
||||
<Label>
|
||||
{pipe} {help && <Help>{help}</Help>}
|
||||
|
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
|
|||
const Models = ({ pageContext, repo, children }) => {
|
||||
const [initialized, setInitialized] = useState(false)
|
||||
const [compatibility, setCompatibility] = useState({})
|
||||
const { id, title, meta } = pageContext
|
||||
const { id, title, meta, hasExamples } = pageContext
|
||||
const { models, isStarters } = meta
|
||||
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
|
||||
|
||||
|
@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
|
|||
|
||||
const modelTitle = title
|
||||
const modelTeaser = `Available trained pipelines for ${title}`
|
||||
|
||||
const starterTitle = `${title} starters`
|
||||
const starterTeaser = `Available transfer learning starter packs for ${title}`
|
||||
|
||||
|
@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
|
|||
baseUrl={baseUrl}
|
||||
repo={repo}
|
||||
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
|
||||
hasExamples={meta.hasExamples}
|
||||
/>
|
||||
))
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user