Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-07-16 03:02:41 +03:00 · 2017-05-28 11:09:35 -05:00 · 2017-05-28 11:09:35 -05:00 · 89bf635cbe
commit 89bf635cbe
parent 7996d21717 69bda9aed7
20 changed files with 152 additions and 60 deletions
--- a/website/_includes/_mixins-base.jade
+++ b/website/_includes/_mixins-base.jade
@ -93,7 +93,7 @@ mixin permalink(id)
    groups - [object] option groups, uses global variable QUICKSTART
    headline - [string] optional text to be rendered as widget headline

-mixin quickstart(groups, headline, description)
+mixin quickstart(groups, headline, description, hide_results)
    .c-quickstart.o-block-small#qs
        .c-quickstart__content
            if headline
@ -102,18 +102,22 @@ mixin quickstart(groups, headline, description)
                p=description
            for group in groups
                .c-quickstart__group.u-text-small(data-qs-group=group.id)
+                    if group.title
                        .c-quickstart__legend=group.title
                            if group.help
                                |  #[+help(group.help)]
                    .c-quickstart__fields
                        for option in group.options
-                            input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked)
-                            label.c-quickstart__label(for=option.id)=option.title
+                            input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked)
+                            label.c-quickstart__label(for=option.id)!=option.title
                                if option.meta
                                    |  #[span.c-quickstart__label__meta (#{option.meta})]
                                if option.help
                                    |  #[+help(option.help)]

+        if hide_results
+            block
+        else
            pre.c-code-block
                code.c-code-block__content.c-quickstart__code(data-qs-results="")
                    block
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -113,7 +113,7 @@ mixin code(label, language, icon, height)

        if icon
            - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
-            .c-code-block__icon(class=classes[icon] || "")
+            .c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "")
                +icon(icon, 18)

        code.c-code-block__content
--- a/website/_includes/_navigation.jade
+++ b/website/_includes/_navigation.jade
@ -9,10 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
        .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION

    ul.c-nav__menu
-        if ALPHA
-            - var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
+        - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION

-        each url, item in NAVIGATION
+        each url, item in NAV
            li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
                +a(url)=item

--- a/website/assets/css/_components/_code.sass
+++ b/website/assets/css/_components/_code.sass
@ -22,9 +22,12 @@
    display: flex
    justify-content: center
    align-items: center
+
+    &.c-code-block__icon--border
        border-left: 6px solid


+
 //- Code block content

 .c-code-block__content
--- a/website/assets/img/icons.svg
+++ b/website/assets/img/icons.svg
@ -36,5 +36,8 @@
        <symbol id="accept" viewBox="0 0 24 24">
            <path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
        </symbol>
+        <symbol id="markdown" viewBox="0 0 32 32">
+            <path d="M29.692 6h-27.385c-1.272 0-2.308 1.035-2.308 2.308v15.385c0 1.273 1.035 2.308 2.308 2.308h27.385c1.273 0 2.308-1.035 2.308-2.308v-15.385c0-1.272-1.035-2.308-2.308-2.308zM18 21.996l-4 0.004v-6l-3 3.846-3-3.846v6h-4v-12h4l3 4 3-4 4-0.004v12zM23.972 22.996l-4.972-6.996h3v-6h4v6h3l-5.028 6.996z"></path>
+        </symbol>
    </defs>
 </svg>
--- a/website/docs/api/displacy.jade
+++ b/website/docs/api/displacy.jade
@ -4,7 +4,7 @@ include ../../_includes/_mixins

 p
    |  As of v2.0, spaCy comes with a built-in visualization suite. For more
-    |  info and examples, see the usage workflow on
+    |  info and examples, see the usage guide on
    |  #[+a("/docs/usage/visualizers") visualizing spaCy].


--- a/website/docs/api/index.jade
+++ b/website/docs/api/index.jade
@ -2,6 +2,8 @@

 include ../../_includes/_mixins

+under-construction
+
 +h(2, "comparison") Feature comparison

 p
--- a/website/docs/api/matcher.jade
+++ b/website/docs/api/matcher.jade
@ -79,7 +79,7 @@ p Find all token sequences matching the supplied patterns on the #[code Doc].
    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
    |  actions per pattern within the same matcher. For example, you might only
    |  want to merge some entity types, and set custom flags for other matched
-    |  patterns. For more details and examples, see the usage workflow on
+    |  patterns. For more details and examples, see the usage guide on
    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].

 +h(2, "pipe") Matcher.pipe
--- a/website/docs/api/tokenizer.jade
+++ b/website/docs/api/tokenizer.jade
@ -175,7 +175,7 @@ p

 p
    |  Add a special-case tokenization rule. This mechanism is also used to add
-    |  custom tokenizer exceptions to the language data. See the usage workflow
+    |  custom tokenizer exceptions to the language data. See the usage guide
    |  on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
    |  for more details and examples.

--- a/website/docs/usage/_spacy-101/_vocab-stringstore.jade
+++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@ -34,7 +34,7 @@ p

 +infobox
    |  For more details on the language-specific data, see the
-    |  usage workflow on #[+a("/docs/usage/adding-languages") adding languages].
+    |  usage guide on #[+a("/docs/usage/adding-languages") adding languages].

 +h(2, "special-cases") Adding special case tokenization rules

--- a/website/docs/usage/dependency-parse.jade
+++ b/website/docs/usage/dependency-parse.jade
@ -201,7 +201,7 @@ p

 +infobox
    |  For more details and examples, see the
-    |  #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You
+    |  #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy]. You
    |  can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo].

 +h(2, "disabling") Disabling the parser
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@ -248,7 +248,7 @@ p

 p
    |  For more details and examples, see the
-    |  #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy].
+    |  #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy].

 +code("Named Entity example").
    import spacy
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@ -4,7 +4,8 @@ include ../../_includes/_mixins

 p
    |  The following examples and code snippets give you an overview of spaCy's
-    |  functionality and its usage.
+    |  functionality and its usage. If you're new to spaCy, make sure to check
+    |  out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].

 +h(2, "models") Install models and process text

@ -80,13 +81,13 @@ p

 +code.
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
-    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+    ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(u'San Francisco', 0, 13, u'GPE')]

    from spacy.tokens import Span
    doc = nlp(u'Netflix is hiring a new VP of global policy')
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
-    ents = [(e.start_char, e.end_char, e.label_) for ent in doc.ents]
+    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(0, 7, u'ORG')]

 +infobox
@ -95,6 +96,42 @@ p
 +h(2, "displacy") Visualize a dependency parse and named entities in your browser
    +tag-model("dependency parse", "NER")

+aside
+    .u-text-center(style="overflow: auto").
+        <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
+            <text fill="currentColor" text-anchor="middle" y="222.0">
+                <tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
+                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
+            </text>
+            <text fill="currentColor" text-anchor="middle" y="222.0">
+                <tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
+                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
+            </text>
+            <text fill="currentColor" text-anchor="middle" y="222.0">
+                <tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
+                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
+            </text>
+            <text fill="currentColor" text-anchor="middle" y="222.0">
+                <tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
+                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
+            </text>
+            <path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
+            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
+                <textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
+            </text>
+            <path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
+            <path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
+            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
+                <textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
+            </text>
+            <path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
+            <path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
+            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
+                <textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
+            </text>
+            <path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
+        </svg>
+
 +code.
    from spacy import displacy

@ -158,7 +195,7 @@ p
    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
-    matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji
+    matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
    matches = nlp(LOTS_OF_TEXT)

 +infobox
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@ -141,7 +141,7 @@ p
        html = displacy.render(doc, style='ent', page=True,
                               options={'ents': ['EVENT']})

-    |  For more info and examples, see the usage workflow on
+    |  For more info and examples, see the usage guide on
    |  #[+a("/docs/usage/visualizers") visualizing spaCy].

 p
--- a/website/docs/usage/saving-loading.jade
+++ b/website/docs/usage/saving-loading.jade
@ -151,7 +151,7 @@ p

 +infobox("Custom models with pipeline components")
    |  For more details and an example of how to package a sentiment model
-    |  with a custom pipeline component, see the usage workflow on
+    |  with a custom pipeline component, see the usage guide on
    |  #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines].

 +h(3, "models-building") Building the model package
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@ -16,59 +16,67 @@ include ../../_includes/_mixins
 +table(["Name", "Description", "Needs model"])
    +row
        +cell #[strong Tokenization]
-        +cell
+        +cell Segmenting text into words, punctuation marks etc.
        +cell #[+procon("con")]

    +row
-        +cell #[strong Part-of-speech Tagging]
-        +cell
+        +cell #[strong Part-of-speech] (POS) #[strong Tagging]
+        +cell Assigning word types to tokens, like verb or noun.
        +cell #[+procon("pro")]

    +row
        +cell #[strong Dependency Parsing]
        +cell
+            |  Assigning syntactic dependency labels, i.e. the relations between
+            |  individual tokens.
        +cell #[+procon("pro")]

    +row
-        +cell #[strong Sentence Boundary Detection]
-        +cell
+        +cell #[strong Sentence Boundary Detection] (SBD)
+        +cell Finding and segmenting individual sentences.
        +cell #[+procon("pro")]

    +row
        +cell #[strong Named Entity Recognition] (NER)
        +cell
+            |  Labelling named "real-world" objects, like persons, companies or
+            |  locations.
        +cell #[+procon("pro")]

    +row
        +cell #[strong Rule-based Matching]
        +cell
+            |  Finding sequences of tokens based on their texts and linguistic
+            |  annotations, similar to regular expressions.
        +cell #[+procon("con")]

    +row
        +cell #[strong Similarity]
        +cell
+            |  Comparing words, text spans and documents to determine how
+            |  similar they are to each other.
        +cell #[+procon("pro")]

    +row
        +cell #[strong Training]
-        +cell
+        +cell Updating and improving a statistical model's predictions.
        +cell #[+procon("neutral")]

    +row
        +cell #[strong Serialization]
-        +cell
+        +cell Saving objects to files or byte strings.
        +cell #[+procon("neutral")]

 +h(2, "annotations") Linguistic annotations

 p
-    |  spaCy provides a variety of linguistic annotations to give you insights
-    |  into a text's grammatical structure. This includes the word types,
-    |  i.e. the parts of speech, and how the words are related to each other.
-    |  For example, if you're analysing text, it makes a huge difference
-    |  whether a noun is the subject of a sentence, or the object – or whether
-    |  "google" is used as a verb, or refers to the website or company in a
-    |  specific context.
+    |  spaCy provides a variety of linguistic annotations to give you
+    |  #[strong insights into a text&apos;s grammatical structure]. This includes the
+    |  word types, like the parts of speech, and how the words are related to
+    |  each other. For example, if you're analysing text, it makes a huge
+    |  difference whether a noun is the subject of a sentence, or the object –
+    |  or whether "google" is used as a verb, or refers to the website or
+    |  company in a specific context.

 p
    |  Once you've downloaded and installed a #[+a("/docs/usage/models") model],
@ -148,9 +156,9 @@ include _spacy-101/_pipelines
    |  #[strong create your own], see the usage guide on
    |  #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].

-+h(2, "vocab-stringstore") Vocab, lexemes and the string store
+h(2, "vocab") Vocab and lexemes

-include _spacy-101/_vocab-stringstore
+include _spacy-101/_vocab

 +h(2, "serialization") Serialization

@ -223,6 +231,15 @@ include _spacy-101/_training
            |  Segment text, and create #[code Doc] objects with the discovered
            |  segment boundaries.

+    +row
+        +cell #[+api("matcher") #[code Matcher]]
+        +cell
+            |  Match sequences of tokens, based on pattern rules, similar to
+            |  regular expressions.
+
+h(3, "architecture-pipeline") Pipeline components
+
+table(["Name", "Description"])
    +row
        +cell #[+api("tagger") #[code Tagger]]
        +cell Annotate part-of-speech tags on #[code Doc] objects.
@ -237,15 +254,13 @@ include _spacy-101/_training
            |  Annotate named entities, e.g. persons or products, on #[code Doc]
            |  objects.

-    +row
-        +cell #[+api("matcher") #[code Matcher]]
-        +cell
-            |  Match sequences of tokens, based on pattern rules, similar to
-            |  regular expressions.
-
-+h(3, "architecture-other") Other
+h(3, "architecture-other") Other classes

 +table(["Name", "Description"])
+    +row
+        +cell #[+api("binder") #[code Binder]]
+        +cell
+
    +row
        +cell #[+api("goldparse") #[code GoldParse]]
        +cell Collection for training annotations.
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@ -1,7 +1,7 @@
 include ../../_includes/_mixins

 p
-    |  This workflow describes how to train new statistical models for spaCy's
+    |  This guide describes how to train new statistical models for spaCy's
    |  part-of-speech tagger, named entity recognizer and dependency parser.
    |  Once the model is trained, you can then
    |  #[+a("/docs/usage/saving-loading") save and load] it.
@ -61,7 +61,7 @@ p

 p.o-inline-list
    +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example
-    +button("/docs/usage/training-ner", false, "secondary") Usage Workflow
+    +button("/docs/usage/training-ner", false, "secondary") Usage guide

 +h(2, "train-dependency") Training the dependency parser

--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@ -8,6 +8,20 @@ p

 +h(2, "features") New features

+p
+    |  This section contains an overview of the most important
+    |  #[strong new features and improvements]. The #[+a("/docs/api") API docs]
+    |  include additional deprecation notes. New methods and functions that
+    |  were introduced in this version are marked with a #[+tag-new(2)] tag.
+
+p
+    |  To help you make the most of v2.0, we also
+    |  #[strong re-wrote almost all of the usage guides and API docs], and added
+    |  more real-world examples. If you're new to spaCy, or just want to brush
+    |  up on some NLP basics and the details of the library, check out
+    |  the #[+a("/docs/usage/spacy-101") spaCy 101 guide] that explains the most
+    |  important concepts with examples and illustrations.
+
 +h(3, "features-pipelines") Improved processing pipelines

 +aside-code("Example").
@ -97,9 +111,6 @@ p
    |  complex regular expressions. The language data has also been tidied up
    |  and simplified. spaCy now also supports simple lookup-based lemmatization.

-+image
-    include ../../assets/img/docs/language_data.svg
-
 +infobox
    |  #[strong API:] #[+api("language") #[code Language]]
    |  #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang]
@ -126,10 +137,18 @@ p
    |  #[strong API:] #[+api("matcher") #[code Matcher]]
    |  #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]

-+h(3, "features-models") Neural network models for English, German, French and Spanish
+h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER
+
+aside-code("Example", "bash").
+    python -m spacy download en # default English model
+    python -m spacy download de # default German model
+    python -m spacy download fr # default French model
+    python -m spacy download es # default Spanish model
+    python -m spacy download xx_ent_web_md # multi-language NER

 +infobox
    |  #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
+    |  #[+a("/docs/api/language-models") Languages]
    |  #[strong Usage:] #[+a("/docs/usage/models") Models]

 +h(2, "incompat") Backwards incompatibilities
@ -147,6 +166,10 @@ p
        +cell #[code spacy.orth]
        +cell #[code spacy.lang.xx.lex_attrs]

+    +row
+        +cell #[code cli.model]
+        +cell -
+
    +row
        +cell #[code Language.save_to_directory]
        +cell #[+api("language#to_disk") #[code Language.to_disk]]
--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@ -58,6 +58,11 @@ p
    |  The argument #[code options] lets you specify a dictionary of settings
    |  to customise the layout, for example:

+aside("Important note")
+    |  There's currently a known issue with the #[code compact] mode for long
+    |  sentences with arrow spacing. If the spacing is larger than the arc
+    |  itself, it'll cause the arc and its label to flip.
+
 +table(["Name", "Type", "Description", "Default"])
    +row
        +cell #[code compact]
@ -330,11 +335,12 @@ p
    |  It's certainly possible to just have your server return the markup.
    |  But outputting raw, unsanitised HTML is risky and makes your app vulnerable to
    |  #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting]
-    |  (XSS). All your user needs to do is find a way to make spaCy return one
-    |  token #[code &lt;script src="malicious-code.js"&gt;&lt;script&gt;].
-    |  Instead of relying on the server to render and sanitize HTML, you
-    |  can do this on the client in JavaScript. displaCy.js creates
-    |  the markup as DOM nodes and will never insert raw HTML.
+    |  (XSS). All your user needs to do is find a way to make spaCy return text
+    |  like #[code &lt;script src="malicious-code.js"&gt;&lt;/script&gt;], which
+    |  is pretty easy in NER mode. Instead of relying on the server to render
+    |  and sanitise HTML, you can do this on the client in JavaScript.
+    |  displaCy.js creates the markup as DOM nodes and will never insert raw
+    |  HTML.

 p
    |  The #[code parse_deps] function takes a #[code Doc] object and returns