Merge remote-tracking branch 'upstream/develop' into fix/nr_features

# Conflicts:
#   spacy/ml/models/parser.py
#   spacy/tests/serialize/test_serialize_config.py
#   website/docs/api/architectures.md
2025-10-29 15:07:54 +03:00 · 2020-09-23 17:01:13 +02:00 · 2020-09-23 17:01:13 +02:00 · 35dbc63578
commit 35dbc63578
parent 25b34bba94 916050bf2f
10 changed files with 68 additions and 75 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -20,6 +20,7 @@ pytokenizations
 setuptools
 packaging
 importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
--- a/setup.cfg
+++ b/setup.cfg
@ -57,6 +57,7 @@ install_requires =
    setuptools
    packaging
    importlib_metadata>=0.20; python_version < "3.8"
+    typing_extensions>=3.7.4; python_version < "3.8"

 [options.entry_points]
 console_scripts =
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a21"
+__version__ = "3.0.0a22"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@ -284,7 +284,7 @@ vectors = "{{ word_vectors }}"
 {% endif -%}
 {% if use_transformer -%}
 accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
 dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"

--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -152,7 +152,8 @@ def train(
        exclude=frozen_components,
    )
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
-    print_row, finalize_logger = train_logger(nlp)
+    with nlp.select_pipes(disable=frozen_components):
+        print_row, finalize_logger = train_logger(nlp)

    try:
        progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
@ -163,7 +164,8 @@ def train(
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
-                    update_meta(T_cfg, nlp, info)
+                    with nlp.select_pipes(disable=frozen_components):
+                        update_meta(T_cfg, nlp, info)
                    with nlp.use_params(optimizer.averages):
                        nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -22,6 +22,11 @@ try:
 except ImportError:
    cupy = None

+try:  # Python 3.8+
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal  # noqa: F401
+
 from thinc.api import Optimizer  # noqa: F401

 pickle = pickle
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -11,9 +11,11 @@ def console_logger():
    def setup_printer(
        nlp: "Language",
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
+        # we assume here that only components are enabled that should be trained & logged
+        logged_pipes = nlp.pipe_names
        score_cols = list(nlp.config["training"]["score_weights"])
        score_widths = [max(len(col), 6) for col in score_cols]
-        loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
+        loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
        loss_widths = [max(len(col), 8) for col in loss_cols]
        table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
        table_header = [col.upper() for col in table_header]
@ -26,7 +28,7 @@ def console_logger():
            try:
                losses = [
                    "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in nlp.pipe_names
+                    for pipe_name in logged_pipes
                ]
            except KeyError as e:
                raise KeyError(
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@ -28,7 +28,7 @@ on training Stanza on this corpus to allow direct comparison.

 | System                                                                         |  POS |  UAS |  LAS |
 | ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                           |      |      |      |
+| spaCy RoBERTa (2020)                                                           | 97.8 | 96.6 | 94.7 |
 | spaCy CNN (2020)                                                               |      |      |      |
 | [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
 | [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)             | 97.3 | 97.2 | 95.7 |
@ -37,7 +37,8 @@ on training Stanza on this corpus to allow direct comparison.

 **Accuracy on the Penn Treebank.** See
 [NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+results. For spaCy's evaluation, see the
+[project template](https://github.com/explosion/projects/tree/v3/benchmarks/parsing_penn_treebank).

 </figcaption>

--- a/website/src/components/infobox.js
+++ b/website/src/components/infobox.js
@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'

@ -14,13 +14,14 @@ export default function Infobox({
    className,
    children,
 }) {
+    const Wrapper = id ? 'div' : Fragment
    const infoboxClassNames = classNames(classes.root, className, {
        [classes.list]: !!list,
        [classes.warning]: variant === 'warning',
        [classes.danger]: variant === 'danger',
    })
    return (
-        <>
+        <Wrapper>
            {id && <a id={id} />}
            <aside className={infoboxClassNames}>
                {title && (
@ -40,7 +41,7 @@ export default function Infobox({
                )}
                {children}
            </aside>
-        </>
+        </Wrapper>
    )
 }

--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@ -12,7 +12,6 @@ import Tag from '../components/tag'
 import { H2, Label } from '../components/typography'
 import Icon from '../components/icon'
 import Link from '../components/link'
-import Grid from '../components/grid'
 import Infobox from '../components/infobox'
 import Accordion from '../components/accordion'
 import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@ -31,10 +30,16 @@ const MODEL_META = {
    wiki: 'Wikipedia',
    uas: 'Unlabelled dependencies',
    las: 'Labelled dependencies',
+    token_acc: 'Tokenization',
+    tok: 'Tokenization',
    tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
-    ents_f: 'Entities (F-score)',
-    ents_p: 'Entities (precision)',
-    ents_r: 'Entities (recall)',
+    tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+    ents_f: 'Named entities (F-score)',
+    ents_p: 'Named entities (precision)',
+    ents_r: 'Named entities (recall)',
+    sent_f: 'Sentence segmentation (F-score)',
+    sent_p: 'Sentence segmentation (precision)',
+    sent_r: 'Sentence segmentation (recall)',
    cpu: 'words per second on CPU',
    gpu: 'words per second on GPU',
    pipeline: 'Active processing pipeline components in order',
@ -83,25 +88,19 @@ function formatVectors(data) {
 }

 function formatAccuracy(data) {
-    if (!data) return null
-    const labels = {
-        las: 'LAS',
-        uas: 'UAS',
-        tags_acc: 'TAG',
-        ents_f: 'NER F',
-        ents_p: 'NER P',
-        ents_r: 'NER R',
-    }
-    const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
-    const isNer = key => key.startsWith('ents_')
+    if (!data) return []
    return Object.keys(data)
-        .filter(key => labels[key])
-        .map(key => ({
-            label: labels[key],
-            value: data[key].toFixed(2),
-            help: MODEL_META[key],
-            type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
-        }))
+        .map(label => {
+            const value = data[label]
+            return isNaN(value)
+                ? null
+                : {
+                      label,
+                      value: value.toFixed(2),
+                      help: MODEL_META[label],
+                  }
+        })
+        .filter(item => item)
 }

 function formatModelMeta(data) {
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
        { label: 'Author', content: author },
        { label: 'License', content: license },
    ]
-    const accuracy = [
-        {
-            label: 'Syntax Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
-        },
-        {
-            label: 'NER Accuracy',
-            items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
-        },
-    ]

    const error = (
        <Infobox title="Unable to load model details from GitHub" variant="danger">
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
            </p>
        </Infobox>
    )
-
    return (
        <Section id={name}>
            <H2
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                    )}
                </tbody>
            </Table>
-            <Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
-                {accuracy &&
-                    accuracy.map(({ label, items }, i) =>
-                        !items ? null : (
-                            <Table fixed key={i}>
-                                <thead>
-                                    <Tr>
-                                        <Th colSpan={2}>{label}</Th>
-                                    </Tr>
-                                </thead>
-                                <tbody>
-                                    {items.map((item, i) => (
-                                        <Tr key={i}>
-                                            <Td>
-                                                <Label>
-                                                    {item.label}{' '}
-                                                    {item.help && <Help>{item.help}</Help>}
-                                                </Label>
-                                            </Td>
-                                            <Td num>{item.value}</Td>
-                                        </Tr>
-                                    ))}
-                                </tbody>
-                            </Table>
-                        )
-                    )}
-            </Grid>
            {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
            {hasInteractiveCode && (
                <CodeBlock title="Try out the model" lang="python" executable={true}>
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                        `import spacy`,
                        `from spacy.lang.${langId}.examples import sentences `,
                        ``,
-                        `nlp = spacy.load('${name}')`,
+                        `nlp = spacy.load("${name}")`,
                        `doc = nlp(sentences[0])`,
                        `print(doc.text)`,
                        `for token in doc:`,
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                    ].join('\n')}
                </CodeBlock>
            )}
+            {meta.accuracy && (
+                <Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
+                    <Table>
+                        <tbody>
+                            {meta.accuracy.map(({ label, value, help }) => (
+                                <Tr key={`${name}-${label}`}>
+                                    <Td nowrap>
+                                        <InlineCode>{label.toUpperCase()}</InlineCode>
+                                    </Td>
+                                    <Td>{help}</Td>
+                                    <Td num style={{ textAlign: 'right' }}>
+                                        {value}
+                                    </Td>
+                                </Tr>
+                            ))}
+                        </tbody>
+                    </Table>
+                </Accordion>
+            )}
            {labels && (
                <Accordion id={`${name}-labels`} title="Label Scheme">
                    <p>
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
                                const labelNames = labels[pipe] || []
                                const help = LABEL_SCHEME_META[pipe]
                                return (
-                                    <Tr key={pipe} evenodd={false} key={pipe}>
+                                    <Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
                                        <Td style={{ width: '20%' }}>
                                            <Label>
                                                {pipe} {help && <Help>{help}</Help>}
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
 const Models = ({ pageContext, repo, children }) => {
    const [initialized, setInitialized] = useState(false)
    const [compatibility, setCompatibility] = useState({})
-    const { id, title, meta } = pageContext
+    const { id, title, meta, hasExamples } = pageContext
    const { models, isStarters } = meta
    const baseUrl = `https://raw.githubusercontent.com/${repo}/master`

@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {

    const modelTitle = title
    const modelTeaser = `Available trained pipelines for ${title}`
-
    const starterTitle = `${title} starters`
    const starterTeaser = `Available transfer learning starter packs for ${title}`

@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
                            baseUrl={baseUrl}
                            repo={repo}
                            licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
+                            hasExamples={meta.hasExamples}
                        />
                    ))
                }