Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-11-05 17:11:24 +01:00
commit bc4dc2da4e
16 changed files with 408 additions and 315 deletions

View File

@ -11,6 +11,23 @@ mixin section(id)
block block
//- Accordion (collapsible sections)
title - [string] Section title.
id - [string] Optional section ID for permalinks.
level - [integer] Headline level for section title.
mixin accordion(title, id, level)
section.o-accordion.o-block
+h(level || 4).o-no-block(id=id)
button.o-accordion__button.o-grid.o-grid--vcenter.o-grid--space.js-accordion(aria-expanded="false")=title
svg.o-accordion__icon(width="20" height="20" viewBox="0 0 10 10" aria-hidden="true" focusable="false")
rect.o-accordion__hide(height="8" width="2" y="1" x="4")
rect(height="2" width="8" y="4" x="1")
.o-accordion__content(hidden="")
block
//- Headlines Helper Mixin //- Headlines Helper Mixin
level - [integer] 1, 2, 3, 4, or 5 level - [integer] 1, 2, 3, 4, or 5

View File

@ -50,7 +50,7 @@ for id in CURRENT_MODELS
+cell +cell
span(data-tpl=id data-tpl-key=field) #[em n/a] span(data-tpl=id data-tpl-key=field) #[em n/a]
+row(data-tpl=id data-tpl-key="compat-wrapper" style="display: none") +row(data-tpl=id data-tpl-key="compat-wrapper" hidden="")
+cell +cell
+label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle] +label Compat #[+help("Latest compatible model version for your spaCy installation").u-color-subtle]
+cell +cell
@ -58,15 +58,15 @@ for id in CURRENT_MODELS
select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat") select.o-field__select.u-text-small(data-tpl=id data-tpl-key="compat")
div(data-tpl=id data-tpl-key="compat-versions")   div(data-tpl=id data-tpl-key="compat-versions")  
section(data-tpl=id data-tpl-key="benchmarks" style="display: none") section(data-tpl=id data-tpl-key="benchmarks" hidden="")
+grid.o-block-small +grid.o-block-small
for keys, label in MODEL_BENCHMARKS for keys, label in MODEL_BENCHMARKS
.u-flex-full.u-padding-small(data-tpl=id data-tpl-key=label.toLowerCase() style="display: none") .u-flex-full.u-padding-small(data-tpl=id data-tpl-key=label.toLowerCase() hidden="")
+table.o-block-small +table.o-block-small
+row("head") +row("head")
+head-cell(colspan="2")=(MODEL_META["benchmark_" + label] || label) +head-cell(colspan="2")=(MODEL_META["benchmark_" + label] || label)
for label, field in keys for label, field in keys
+row(style="display: none") +row(hidden="")
+cell.u-nowrap +cell.u-nowrap
+label=label +label=label
if MODEL_META[field] if MODEL_META[field]

View File

@ -41,6 +41,7 @@ if IS_PAGE
https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7 https://medium.com/dev-channel/es6-modules-in-chrome-canary-m60-ba588dfb8ab7
- ProgressBar = "new ProgressBar('.js-progress');" - ProgressBar = "new ProgressBar('.js-progress');"
- Accordion = "new Accordion('.js-accordion');"
- Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');" - Changelog = "new Changelog('" + SOCIAL.github + "', 'spacy');"
- NavHighlighter = "new NavHighlighter('data-section', 'data-nav');" - NavHighlighter = "new NavHighlighter('data-section', 'data-nav');"
- GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');" - GitHubEmbed = "new GitHubEmbed('" + SOCIAL.github + "', 'data-gh-embed');"
@ -57,6 +58,7 @@ if environment == "deploy"
if IS_PAGE if IS_PAGE
!=NavHighlighter !=NavHighlighter
!=GitHubEmbed !=GitHubEmbed
!=Accordion
if HAS_MODELS if HAS_MODELS
!=ModelLoader !=ModelLoader
if compare_models if compare_models
@ -74,6 +76,8 @@ else
!=NavHighlighter !=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js'; | import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed !=GitHubEmbed
| import Accordion from '/assets/js/accordion.js';
!=Accordion
if HAS_MODELS if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js'; | import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader !=ModelLoader

View File

@ -1,12 +1,17 @@
//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS //- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS
+h(3, "dependency-parsing-english") English dependency labels
p p
| The English dependency labels use the #[+a("http://www.clearnlp.com") ClearNLP] | This section lists the syntactic dependency labels assigned by
| #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style]. | spaCy's #[+a("/models") models]. The individual labels are
| language-specific and depend on the training corpus.
+table(["Label", "Description"]) +accordion("English", "dependency-parsing-english")
p
| The English dependency labels use the
| #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style]
| by #[+a("http://www.clearnlp.com") ClearNLP].
+table(["Label", "Description"])
+dep-row("acomp", "adjectival complement") +dep-row("acomp", "adjectival complement")
+dep-row("advcl", "adverbial clause modifier") +dep-row("advcl", "adverbial clause modifier")
+dep-row("advmod", "adverbial modifier") +dep-row("advmod", "adverbial modifier")
@ -60,14 +65,13 @@ p
+dep-row("root", "root") +dep-row("root", "root")
+dep-row("xcomp", "open clausal complement") +dep-row("xcomp", "open clausal complement")
+h(3, "dependency-parsing-german") German dependency labels +accordion("German", "dependency-parsing-german")
p
p
| The German dependency labels use the | The German dependency labels use the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank] | #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme. | annotation scheme.
+table(["Label", "Description"]) +table(["Label", "Description"])
+dep-row("ac", "adpositional case marker") +dep-row("ac", "adpositional case marker")
+dep-row("adc", "adjective component") +dep-row("adc", "adjective component")
+dep-row("ag", "genitive attribute") +dep-row("ag", "genitive attribute")

View File

@ -1,14 +1,19 @@
//- 💫 DOCS > API > ANNOTATION > POS TAGS //- 💫 DOCS > API > ANNOTATION > POS TAGS
+h(3, "pos-tagging-english") English part-of-speech tag scheme
p p
| This section lists the fine-grained and coarse-grained part-of-speech
| tags assigned by spaCy's #[+a("/models") models]. The individual mapping
| is specific to the training corpus and can be defined in the respective
| language data's #[+a("/usage/adding-languages#tag-map") #[code tag_map.py]].
+accordion("English", "pos-tagging-english")
p
| The English part-of-speech tagger uses the | The English part-of-speech tagger uses the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of
| the Penn Treebank tag set. We also map the tags to the simpler Google | the Penn Treebank tag set. We also map the tags to the simpler Google
| Universal POS tag set. | Universal POS tag set.
+table(["Tag", "POS", "Morphology", "Description"]) +table(["Tag", "POS", "Morphology", "Description"])
+pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini", "left round bracket") +pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini", "left round bracket")
+pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin", "right round bracket") +pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin", "right round bracket")
+pos-row(",", "PUNCT", "PunctType=comm", "punctuation mark, comma") +pos-row(",", "PUNCT", "PunctType=comm", "punctuation mark, comma")
@ -66,15 +71,14 @@ p
+pos-row("WRB", "ADV", "PronType=int|rel", "wh-adverb") +pos-row("WRB", "ADV", "PronType=int|rel", "wh-adverb")
+pos-row("XX", "X", "", "unknown") +pos-row("XX", "X", "", "unknown")
+h(3, "pos-tagging-german") German part-of-speech tag scheme +accordion("German", "pos-tagging-german")
p
p
| The German part-of-speech tagger uses the | The German part-of-speech tagger uses the
| #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank] | #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank]
| annotation scheme. We also map the tags to the simpler Google | annotation scheme. We also map the tags to the simpler Google
| Universal POS tag set. | Universal POS tag set.
+table(["Tag", "POS", "Morphology", "Description"]) +table(["Tag", "POS", "Morphology", "Description"])
+pos-row("$(", "PUNCT", "PunctType=brck", "other sentence-internal punctuation mark") +pos-row("$(", "PUNCT", "PunctType=brck", "other sentence-internal punctuation mark")
+pos-row("$,", "PUNCT", "PunctType=comm", "comma") +pos-row("$,", "PUNCT", "PunctType=comm", "comma")
+pos-row("$.", "PUNCT", "PunctType=peri", "sentence-final punctuation mark") +pos-row("$.", "PUNCT", "PunctType=peri", "sentence-final punctuation mark")

View File

@ -0,0 +1,55 @@
//- 💫 DOCS > API > ANNOTATION > TEXT PROCESSING
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including
| #[strong tokens for significant whitespace]. Any sequence of
| whitespace characters beyond a single space (#[code ' ']) is included
| as a token. The whitespace tokens are useful for much the same reason
| punctuation is — it's often an important delimiter in the text. By
| preserving it in the token output, we are able to maintain a simple
| alignment between the tokens and the original string, and we ensure
| that #[strong no information is lost] during processing.
+h(3, "lemmatization") Lemmatization
+aside("Examples")
| In English, this means:#[br]
| #[strong Adjectives]: happier, happiest → happy#[br]
| #[strong Adverbs]: worse, worst → badly#[br]
| #[strong Nouns]: dogs, children → dog, child#[br]
| #[strong Verbs]: writes, writing, wrote, written → write
p
| A lemma is the uninflected form of a word. The English lemmatization
| data is taken from #[+a("https://wordnet.princeton.edu") WordNet].
| Lookup tables are taken from
| #[+a("http://www.lexiconista.com/datasets/lemmatization/") Lexiconista].
| spaCy also adds a #[strong special case for pronouns]: all pronouns
| are lemmatized to the special token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma", "⚠️")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+h(3, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.

View File

@ -205,10 +205,8 @@
"title": "Annotation Specifications", "title": "Annotation Specifications",
"teaser": "Schemes used for labels, tags and training data.", "teaser": "Schemes used for labels, tags and training data.",
"menu": { "menu": {
"Tokenization": "tokenization", "Text Processing": "text-processing",
"Sentence Boundaries": "sbd",
"POS Tagging": "pos-tagging", "POS Tagging": "pos-tagging",
"Lemmatization": "lemmatization",
"Dependencies": "dependency-parsing", "Dependencies": "dependency-parsing",
"Named Entities": "named-entities", "Named Entities": "named-entities",
"Models & Training": "training" "Models & Training": "training"

View File

@ -2,43 +2,9 @@
include ../_includes/_mixins include ../_includes/_mixins
p This document describes the target annotations spaCy is trained to predict. +section("text-processing")
+h(2, "text-processing") Text Processing
include _annotation/_text-processing
+section("tokenization")
+h(2, "tokenization") Tokenization
p
| Tokenization standards are based on the
| #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] corpus.
| The tokenizer differs from most by including tokens for significant
| whitespace. Any sequence of whitespace characters beyond a single space
| (#[code ' ']) is included as a token.
+aside-code("Example").
from spacy.lang.en import English
nlp = English()
tokens = nlp('Some\nspaces and\ttab characters')
tokens_text = [t.text for t in tokens]
assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
'\t', 'tab', 'characters']
p
| The whitespace tokens are useful for much the same reason punctuation is
| it's often an important delimiter in the text. By preserving it in the
| token output, we are able to maintain a simple alignment between the
| tokens and the original string, and we ensure that no information is
| lost during processing.
+section("sbd")
+h(2, "sentence-boundary") Sentence boundary detection
p
| Sentence boundaries are calculated from the syntactic parse tree, so
| features such as punctuation and capitalisation play an important but
| non-decisive role in determining the sentence boundaries. Usually this
| means that the sentence boundaries will at least coincide with clause
| boundaries, even given poorly punctuated text.
+section("pos-tagging") +section("pos-tagging")
+h(2, "pos-tagging") Part-of-speech Tagging +h(2, "pos-tagging") Part-of-speech Tagging
@ -50,30 +16,6 @@ p This document describes the target annotations spaCy is trained to predict.
include _annotation/_pos-tags include _annotation/_pos-tags
+section("lemmatization")
+h(2, "lemmatization") Lemmatization
p A "lemma" is the uninflected form of a word. In English, this means:
+list
+item #[strong Adjectives]: The form like "happy", not "happier" or "happiest"
+item #[strong Adverbs]: The form like "badly", not "worse" or "worst"
+item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
+item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"
p
| The lemmatization data is taken from
| #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
| special case for pronouns: all pronouns are lemmatized to the special
| token #[code -PRON-].
+infobox("About spaCy's custom pronoun lemma")
| Unlike verbs and common nouns, there's no clear base form of a personal
| pronoun. Should the lemma of "me" be "I", or should we normalize person
| as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
| novel symbol, #[code -PRON-], which is used as the lemma for
| all personal pronouns.
+section("dependency-parsing") +section("dependency-parsing")
+h(2, "dependency-parsing") Syntactic Dependency Parsing +h(2, "dependency-parsing") Syntactic Dependency Parsing

View File

@ -31,6 +31,9 @@ main > *:not(footer) li a,
main aside a main aside a
@extend .u-link @extend .u-link
a:focus
outline: 1px dotted $color-theme
//- Selection //- Selection

View File

@ -74,6 +74,42 @@
border-radius: $border-radius border-radius: $border-radius
box-shadow: $box-shadow box-shadow: $box-shadow
//- Accordion (collapsible sections; toggled by accordion.js via aria-expanded)

.o-accordion
    // space out stacked accordion sections
    &:not(:last-child)
        margin-bottom: 2rem

.o-accordion__content
    margin-top: 3rem

.o-accordion__button
    font: inherit
    border-radius: $border-radius
    width: 100%
    padding: 1.5rem 2rem
    background: $color-subtle-light

    // expanded state: flatten the bottom corners against the open panel
    &[aria-expanded="true"]
        border-bottom: 3px solid $color-subtle
        border-bottom-left-radius: 0
        border-bottom-right-radius: 0

        // hide the vertical bar of the "+" icon so it reads as "−" when open
        .o-accordion__hide
            display: none

    // focus style only while collapsed — the expanded border already
    // signals state
    &:focus:not([aria-expanded="true"])
        background: $color-subtle

.o-accordion__icon
    @include size(2.5rem)
    background: $color-theme
    color: $color-back
    border-radius: 50%
    padding: 0.35rem
    // let clicks fall through to the button underneath
    pointer-events: none
//- Box //- Box
.o-box .o-box

View File

@ -0,0 +1,25 @@
'use strict';
import { $$ } from './util.js';
export default class Accordion {
    /**
     * Simple, collapsible accordion sections.
     * Inspired by: https://inclusive-components.design/collapsible-sections/
     * @param {string} selector - Query selector of button element.
     */
    constructor(selector) {
        // $$ returns a NodeList; spread into an array so forEach is
        // available in all browsers
        [...$$(selector)].forEach(btn =>
            btn.addEventListener('click', this.onClick.bind(this)))
    }

    /**
     * Toggle aria-expanded attribute on button and visibility of section.
     * @param {Object} event - The click event; event.target is the button.
     */
    onClick({ target }) {
        // fix: dropped redundant "|| false" — the strict equality already
        // evaluates to a boolean
        const expanded = target.getAttribute('aria-expanded') === 'true';
        target.setAttribute('aria-expanded', !expanded);
        // The content panel is the element following the button's headline
        // wrapper; hiding it when previously expanded collapses the section.
        target.parentElement.nextElementSibling.hidden = expanded;
    }
}

View File

@ -101,9 +101,9 @@ export class ModelLoader {
showError(modelId) { showError(modelId) {
const tpl = new Templater(modelId); const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading'); tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block'; tpl.get('error').hidden = false;
for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) { for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none'; tpl.get(key).parentElement.parentElement.hidden = true;
} }
} }
@ -114,13 +114,12 @@ export class ModelLoader {
const modelId = `${data.lang}_${data.name}`; const modelId = `${data.lang}_${data.name}`;
const model = `${modelId}-${data.version}`; const model = `${modelId}-${data.version}`;
const tpl = new Templater(modelId); const tpl = new Templater(modelId);
tpl.get('error').style.display = 'none';
this.renderDetails(tpl, data) this.renderDetails(tpl, data)
this.renderBenchmarks(tpl, data.accuracy, data.speed); this.renderBenchmarks(tpl, data.accuracy, data.speed);
this.renderCompat(tpl, modelId); this.renderCompat(tpl, modelId);
tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`); tpl.get('download').setAttribute('href', `${this.repo}/releases/tag/${model}`);
tpl.get('table').removeAttribute('data-loading'); tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'none'; tpl.get('error').hidden = true;
} }
renderDetails(tpl, { version, size, description, notes, author, url, renderDetails(tpl, { version, size, description, notes, author, url,
@ -133,9 +132,9 @@ export class ModelLoader {
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true); if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources)); if (sources) tpl.fill('sources', formats.sources(sources));
if (vectors) tpl.fill('vecs', formats.vectors(vectors)); if (vectors) tpl.fill('vecs', formats.vectors(vectors));
else tpl.get('vecs').parentElement.parentElement.style.display = 'none'; else tpl.get('vecs').parentElement.parentElement.hidden = true;
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true); if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none'; else tpl.get('pipeline').parentElement.parentElement.hidden = true;
} }
renderBenchmarks(tpl, accuracy = {}, speed = {}) { renderBenchmarks(tpl, accuracy = {}, speed = {}) {
@ -143,7 +142,7 @@ export class ModelLoader {
this.renderTable(tpl, 'parser', accuracy, val => val.toFixed(2)); this.renderTable(tpl, 'parser', accuracy, val => val.toFixed(2));
this.renderTable(tpl, 'ner', accuracy, val => val.toFixed(2)); this.renderTable(tpl, 'ner', accuracy, val => val.toFixed(2));
this.renderTable(tpl, 'speed', speed, Math.round); this.renderTable(tpl, 'speed', speed, Math.round);
tpl.get('benchmarks').style.display = 'block'; tpl.get('benchmarks').hidden = false;
} }
renderTable(tpl, id, benchmarks, converter = val => val) { renderTable(tpl, id, benchmarks, converter = val => val) {
@ -151,13 +150,13 @@ export class ModelLoader {
for (let key of Object.keys(this.benchKeys[id])) { for (let key of Object.keys(this.benchKeys[id])) {
if (benchmarks[key]) tpl if (benchmarks[key]) tpl
.fill(key, convertNumber(converter(benchmarks[key]))) .fill(key, convertNumber(converter(benchmarks[key])))
.parentElement.style.display = 'table-row'; .parentElement.hidden = false;
} }
tpl.get(id).style.display = 'block'; tpl.get(id).hidden = false;
} }
renderCompat(tpl, modelId) { renderCompat(tpl, modelId) {
tpl.get('compat-wrapper').style.display = 'table-row'; tpl.get('compat-wrapper').hidden = false;
const header = '<option selected disabled>spaCy version</option>'; const header = '<option selected disabled>spaCy version</option>';
const options = Object.keys(this.compat) const options = Object.keys(this.compat)
.map(v => `<option value="${v}">v${v}</option>`) .map(v => `<option value="${v}">v${v}</option>`)
@ -197,8 +196,8 @@ export class ModelComparer {
this.colors = CHART_COLORS; this.colors = CHART_COLORS;
this.fonts = CHART_FONTS; this.fonts = CHART_FONTS;
this.defaultModels = defaultModels; this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block'; this.tpl.get('result').hidden = false;
this.tpl.get('error').style.display = 'none'; this.tpl.get('error').hidden = true;
this.fetchCompat() this.fetchCompat()
.then(compat => this.init(compat)) .then(compat => this.init(compat))
.catch(this.showError.bind(this)) .catch(this.showError.bind(this))
@ -257,8 +256,8 @@ export class ModelComparer {
showError(err) { showError(err) {
console.error(err || 'Error'); console.error(err || 'Error');
this.tpl.get('result').style.display = 'none'; this.tpl.get('result').hidden = true;
this.tpl.get('error').style.display = 'block'; this.tpl.get('error').hidden = false;
} }
onSelect(ev) { onSelect(ev) {
@ -301,8 +300,8 @@ export class ModelComparer {
this.chart.update(); this.chart.update();
[model1, model2].forEach((model, i) => this.renderTable(metaKeys, i + 1, model)); [model1, model2].forEach((model, i) => this.renderTable(metaKeys, i + 1, model));
this.tpl.get('result').removeAttribute('data-loading'); this.tpl.get('result').removeAttribute('data-loading');
this.tpl.get('error').style.display = 'none'; this.tpl.get('error').hidden = true;
this.tpl.get('result').style.display = 'block'; this.tpl.get('result').hidden = false;
} }
renderTable(metaKeys, i, { lang, name, version, size, description, renderTable(metaKeys, i, { lang, name, version, size, description,

View File

@ -12,6 +12,7 @@ import ProgressBar from './progress.js';
import NavHighlighter from './nav-highlighter.js'; import NavHighlighter from './nav-highlighter.js';
import Changelog from './changelog.js'; import Changelog from './changelog.js';
import GitHubEmbed from './github-embed.js'; import GitHubEmbed from './github-embed.js';
import Accordion from './accordion.js';
import { ModelLoader, ModelComparer } from './models.js'; import { ModelLoader, ModelComparer } from './models.js';
// Assign to window so they are bundled by rollup // Assign to window so they are bundled by rollup
@ -19,5 +20,6 @@ window.ProgressBar = ProgressBar;
window.NavHighlighter = NavHighlighter; window.NavHighlighter = NavHighlighter;
window.Changelog = Changelog; window.Changelog = Changelog;
window.GitHubEmbed = GitHubEmbed; window.GitHubEmbed = GitHubEmbed;
window.Accordion = Accordion;
window.ModelLoader = ModelLoader; window.ModelLoader = ModelLoader;
window.ModelComparer = ModelComparer; window.ModelComparer = ModelComparer;

View File

@ -30,7 +30,7 @@ div(data-tpl=TPL data-tpl-key="error")
| overview of the | overview of the
| #[+a(gh("spacy-models") + "/releases") latest model releases]. | #[+a(gh("spacy-models") + "/releases") latest model releases].
div(data-tpl=TPL data-tpl-key="result" style="display: none") div(data-tpl=TPL data-tpl-key="result" hidden="")
+chart("compare_accuracy", 350) +chart("compare_accuracy", 350)
+aside-code("Download", "text") +aside-code("Download", "text")

View File

@ -181,6 +181,10 @@ p
+annotation-row(["their", "ADJ", "poss", "requests"], style) +annotation-row(["their", "ADJ", "poss", "requests"], style)
+annotation-row(["requests", "NOUN", "dobj", "submit"], style) +annotation-row(["requests", "NOUN", "dobj", "submit"], style)
+h(3, "dep-scheme") Dependency label scheme
include ../../api/_annotation/_dep-labels
+h(3, "displacy") Visualizing dependencies +h(3, "displacy") Visualizing dependencies
p p

View File

@ -2,8 +2,6 @@
include ../_spacy-101/_pos-deps include ../_spacy-101/_pos-deps
//-+aside("Help spaCy's output is wrong!")
+h(3, "rule-based-morphology") Rule-based morphology +h(3, "rule-based-morphology") Rule-based morphology
p p
@ -70,4 +68,6 @@ p
| list-based exception files, acquired from | list-based exception files, acquired from
| #[+a("https://wordnet.princeton.edu/") WordNet]. | #[+a("https://wordnet.princeton.edu/") WordNet].
+h(3, "pos-scheme") Part-of-speech tag scheme
include ../../api/_annotation/_pos-tags include ../../api/_annotation/_pos-tags