From b85d88fac6d4da0cc28bd756a03f3e67f43597b1 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:36:07 +0200 Subject: [PATCH 1/7] Update quickstart mixin to make it more customisable --- website/_includes/_mixins-base.jade | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 80d63353d..484f29afc 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -93,7 +93,7 @@ mixin permalink(id) groups - [object] option groups, uses global variable QUICKSTART headline - [string] optional text to be rendered as widget headline -mixin quickstart(groups, headline, description) +mixin quickstart(groups, headline, description, hide_results) .c-quickstart.o-block-small#qs .c-quickstart__content if headline @@ -102,21 +102,25 @@ mixin quickstart(groups, headline, description) p=description for group in groups .c-quickstart__group.u-text-small(data-qs-group=group.id) - .c-quickstart__legend=group.title - if group.help - | #[+help(group.help)] + if group.title + .c-quickstart__legend=group.title + if group.help + | #[+help(group.help)] .c-quickstart__fields for option in group.options - input.c-quickstart__input(class="c-quickstart__input--" + (group.multiple ? "check" : "radio") type=group.multiple ? "checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) - label.c-quickstart__label(for=option.id)=option.title + input.c-quickstart__input(class="c-quickstart__input--" + (group.input_style ? group.input_style : group.multiple ? "check" : "radio") type=group.multiple ? 
"checkbox" : "radio" name=group.id id=option.id value=option.id checked=option.checked) + label.c-quickstart__label(for=option.id)!=option.title if option.meta | #[span.c-quickstart__label__meta (#{option.meta})] if option.help | #[+help(option.help)] - pre.c-code-block - code.c-code-block__content.c-quickstart__code(data-qs-results="") - block + if hide_results + block + else + pre.c-code-block + code.c-code-block__content.c-quickstart__code(data-qs-results="") + block .c-quickstart__info.u-text-tiny.o-block.u-text-right | Like this widget? Check out #[+a("https://github.com/ines/quickstart").u-link quickstart.js]! From 189db308d9d8e2af5f5b9207f4856d9e4ca48e77 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:36:21 +0200 Subject: [PATCH 2/7] Only add coloured border to code block if icon has colour --- website/_includes/_mixins.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index fc4d66841..05e64b0fa 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -113,7 +113,7 @@ mixin code(label, language, icon, height) if icon - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} - .c-code-block__icon(class=classes[icon] || "") + .c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "") +icon(icon, 18) code.c-code-block__content From 20ffb561484b025611200886c2afb29d5f2ef3e2 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:36:31 +0200 Subject: [PATCH 3/7] Fix overwriting of navigation in ALPHA mode --- website/_includes/_navigation.jade | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/website/_includes/_navigation.jade b/website/_includes/_navigation.jade index 320882807..f113ca3f4 100644 --- a/website/_includes/_navigation.jade +++ b/website/_includes/_navigation.jade @@ -9,10 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? 
"c-nav--theme" : null) .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION ul.c-nav__menu - if ALPHA - - var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" } + - var NAV = ALPHA ? { "Usage": "/docs/usage", "Reference": "/docs/api" } : NAVIGATION - each url, item in NAVIGATION + each url, item in NAV li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) +a(url)=item From bd79e683f6ab85ca88f8f2929a3e2ab065db6910 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:36:42 +0200 Subject: [PATCH 4/7] Move code block border to own modifier class --- website/assets/css/_components/_code.sass | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index 478f8a9e0..2e1856c0a 100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -22,7 +22,10 @@ display: flex justify-content: center align-items: center - border-left: 6px solid + + &.c-code-block__icon--border + border-left: 6px solid + //- Code block content From 57ea94f0e3194d1e3c07f7cb99af3a2116bfb0ce Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:36:47 +0200 Subject: [PATCH 5/7] Add markdown icon --- website/assets/img/icons.svg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index 3f226af93..104117cc0 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -36,5 +36,8 @@ + + + From f8185b8e11b7347f9f47489e6b9d6ec34fbe5131 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:37:14 +0200 Subject: [PATCH 6/7] Rename vocab-stringsotre to vocab --- .../usage/_spacy-101/{_vocab-stringstore.jade => _vocab.jade} | 0 website/docs/usage/spacy-101.jade | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename website/docs/usage/_spacy-101/{_vocab-stringstore.jade => _vocab.jade} (100%) diff --git 
a/website/docs/usage/_spacy-101/_vocab-stringstore.jade b/website/docs/usage/_spacy-101/_vocab.jade similarity index 100% rename from website/docs/usage/_spacy-101/_vocab-stringstore.jade rename to website/docs/usage/_spacy-101/_vocab.jade diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 6a1f780dc..498749f31 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -148,9 +148,9 @@ include _spacy-101/_pipelines | #[strong create your own], see the usage guide on | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines]. -+h(2, "vocab-stringstore") Vocab, lexemes and the string store ++h(2, "vocab") Vocab and lexemes -include _spacy-101/_vocab-stringstore +include _spacy-101/_vocab +h(2, "serialization") Serialization From 69bda9aed77a6892a2b161b27606de264188f426 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 16:41:01 +0200 Subject: [PATCH 7/7] Update text, examples, typos, wording and formatting --- website/docs/api/displacy.jade | 2 +- website/docs/api/index.jade | 2 + website/docs/api/matcher.jade | 2 +- website/docs/api/tokenizer.jade | 2 +- website/docs/usage/customizing-tokenizer.jade | 2 +- website/docs/usage/dependency-parse.jade | 2 +- website/docs/usage/entity-recognition.jade | 2 +- website/docs/usage/lightning-tour.jade | 45 +++++++++++++-- website/docs/usage/rule-based-matching.jade | 2 +- website/docs/usage/saving-loading.jade | 2 +- website/docs/usage/spacy-101.jade | 57 ++++++++++++------- website/docs/usage/training.jade | 4 +- website/docs/usage/v2.jade | 31 ++++++++-- website/docs/usage/visualizers.jade | 16 ++++-- 14 files changed, 127 insertions(+), 44 deletions(-) diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade index a96d8a397..415fab77d 100644 --- a/website/docs/api/displacy.jade +++ b/website/docs/api/displacy.jade @@ -4,7 +4,7 @@ include ../../_includes/_mixins p | As of v2.0, spaCy comes with a built-in 
visualization suite. For more - | info and examples, see the usage workflow on + | info and examples, see the usage guide on | #[+a("/docs/usage/visualizers") visualizing spaCy]. diff --git a/website/docs/api/index.jade b/website/docs/api/index.jade index 24f3d4458..f92080975 100644 --- a/website/docs/api/index.jade +++ b/website/docs/api/index.jade @@ -2,6 +2,8 @@ include ../../_includes/_mixins ++under-construction + +h(2, "comparison") Feature comparison p diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index e2972fdc0..c837fe434 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -79,7 +79,7 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. | #[+api("matcher#add") #[code add]]. This allows you to define custom | actions per pattern within the same matcher. For example, you might only | want to merge some entity types, and set custom flags for other matched - | patterns. For more details and examples, see the usage workflow on + | patterns. For more details and examples, see the usage guide on | #[+a("/docs/usage/rule-based-matching") rule-based matching]. +h(2, "pipe") Matcher.pipe diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade index 8d933f75b..196f886b7 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/docs/api/tokenizer.jade @@ -175,7 +175,7 @@ p p | Add a special-case tokenization rule. This mechanism is also used to add - | custom tokenizer exceptions to the language data. See the usage workflow + | custom tokenizer exceptions to the language data. See the usage guide | on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages] | for more details and examples. 
diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 86040a4eb..05a16fc24 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -34,7 +34,7 @@ p +infobox | For more details on the language-specific data, see the - | usage workflow on #[+a("/docs/usage/adding-languages") adding languages]. + | usage guide on #[+a("/docs/usage/adding-languages") adding languages]. +h(2, "special-cases") Adding special case tokenization rules diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index dfb37f786..683991d95 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -201,7 +201,7 @@ p +infobox | For more details and examples, see the - | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You + | #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy]. You | can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo]. +h(2, "disabling") Disabling the parser diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 527c14dde..0155cf2e4 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -248,7 +248,7 @@ p p | For more details and examples, see the - | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. + | #[+a("/docs/usage/visualizers") usage guide on visualizing spaCy]. +code("Named Entity example"). import spacy diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 8cf651be0..107e7210f 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -4,7 +4,8 @@ include ../../_includes/_mixins p | The following examples and code snippets give you an overview of spaCy's - | functionality and its usage. 
+ | functionality and its usage. If you're new to spaCy, make sure to check + | out the #[+a("/docs/usage/spacy-101") spaCy 101 guide]. +h(2, "models") Install models and process text @@ -80,13 +81,13 @@ p +code. doc = nlp(u'San Francisco considers banning sidewalk delivery robots') - ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] assert ents == [(u'San Francisco', 0, 13, u'GPE')] from spacy.tokens import Span doc = nlp(u'Netflix is hiring a new VP of global policy') doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] - ents = [(e.start_char, e.end_char, e.label_) for ent in doc.ents] + ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] assert ents == [(0, 7, u'ORG')] +infobox @@ -95,6 +96,42 @@ p +h(2, "displacy") Visualize a dependency parse and named entities in your browser +tag-model("dependency parse", "NER") ++aside + .u-text-center(style="overflow: auto"). + + + This + DT + + + is + VBZ + + + a + DT + + + sentence. + NN + + + + nsubj + + + + + det + + + + + attr + + + + +code. 
from spacy import displacy @@ -158,7 +195,7 @@ p pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']] matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" - matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji + matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji matches = nlp(LOTS_OF TEXT) +infobox diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index 1fd398ad9..9813abd2e 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -141,7 +141,7 @@ p html = displacy.render(doc, style='ent', page=True, options={'ents': ['EVENT']}) - | For more info and examples, see the usage workflow on + | For more info and examples, see the usage guide on | #[+a("/docs/usage/visualizers") visualizing spaCy]. p diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 1ecb7d7ee..827b54748 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -151,7 +151,7 @@ p +infobox("Custom models with pipeline components") | For more details and an example of how to package a sentiment model - | with a custom pipeline component, see the usage workflow on + | with a custom pipeline component, see the usage guide on | #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines]. 
+h(3, "models-building") Building the model package diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 498749f31..092a1d984 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -16,59 +16,67 @@ include ../../_includes/_mixins +table(["Name", "Description", "Needs model"]) +row +cell #[strong Tokenization] - +cell + +cell Segmenting text into words, punctuation marks etc. +cell #[+procon("con")] +row - +cell #[strong Part-of-speech Tagging] - +cell + +cell #[strong Part-of-speech] (POS) #[strong Tagging] + +cell Assigning word types to tokens, like verb or noun. +cell #[+procon("pro")] +row +cell #[strong Dependency Parsing] +cell + | Assigning syntactic dependency labels, i.e. the relations between + | individual tokens. +cell #[+procon("pro")] +row - +cell #[strong Sentence Boundary Detection] - +cell + +cell #[strong Sentence Boundary Detection] (SBD) + +cell Finding and segmenting individual sentences. +cell #[+procon("pro")] +row +cell #[strong Named Entity Recongition] (NER) +cell + | Labelling named "real-world" objects, like persons, companies or + | locations. +cell #[+procon("pro")] +row +cell #[strong Rule-based Matching] +cell + | Finding sequences of tokens based on their texts and linguistic + | annotations, similar to regular expressions. +cell #[+procon("con")] +row +cell #[strong Similarity] +cell + | Comparing words, text spans and documents and how similar they + | are to each other. +cell #[+procon("pro")] +row +cell #[strong Training] - +cell + +cell Updating and improving a statistical model's predictions. +cell #[+procon("neutral")] +row +cell #[strong Serialization] - +cell + +cell Saving objects to files or byte strings. +cell #[+procon("neutral")] +h(2, "annotations") Linguistic annotations p - | spaCy provides a variety of linguistic annotations to give you insights - | into a text's grammatical structure. This includes the word types, - | i.e. 
the parts of speech, and how the words are related to each other. - | For example, if you're analysing text, it makes a huge difference - | whether a noun is the subject of a sentence, or the object – or whether - | "google" is used as a verb, or refers to the website or company in a - | specific context. + | spaCy provides a variety of linguistic annotations to give you + | #[strong insights into a text's grammatical structure]. This includes the + | word types, like the parts of speech, and how the words are related to + | each other. For example, if you're analysing text, it makes a huge + | difference whether a noun is the subject of a sentence, or the object – + | or whether "google" is used as a verb, or refers to the website or + | company in a specific context. p | Once you've downloaded and installed a #[+a("/docs/usage/models") model], @@ -223,6 +231,15 @@ include _spacy-101/_training | Segment text, and create #[code Doc] objects with the discovered | segment boundaries. + +row + +cell #[+api("matcher") #[code Matcher]] + +cell + | Match sequences of tokens, based on pattern rules, similar to + | regular expressions. + ++h(3, "architecture-pipeline") Pipeline components + ++table(["Name", "Description"]) +row +cell #[+api("tagger") #[code Tagger]] +cell Annotate part-of-speech tags on #[code Doc] objects. @@ -237,15 +254,13 @@ include _spacy-101/_training | Annotate named entities, e.g. persons or products, on #[code Doc] | objects. - +row - +cell #[+api("matcher") #[code Matcher]] - +cell - | Match sequences of tokens, based on pattern rules, similar to - | regular expressions. - -+h(3, "architecture-other") Other ++h(3, "architecture-other") Other classes +table(["Name", "Description"]) + +row + +cell #[+api("binder") #[code Binder]] + +cell + +row +cell #[+api("goldparse") #[code GoldParse]] +cell Collection for training annotations. 
diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 6c6c17e17..41bbaff92 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -1,7 +1,7 @@ include ../../_includes/_mixins p - | This workflow describes how to train new statistical models for spaCy's + | This guide describes how to train new statistical models for spaCy's | part-of-speech tagger, named entity recognizer and dependency parser. | Once the model is trained, you can then | #[+a("/docs/usage/saving-loading") save and load] it. @@ -61,7 +61,7 @@ p p.o-inline-list +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example - +button("/docs/usage/training-ner", false, "secondary") Usage Workflow + +button("/docs/usage/training-ner", false, "secondary") Usage guide +h(2, "train-dependency") Training the dependency parser diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 25aae8706..db827c414 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -8,6 +8,20 @@ p +h(2, "features") New features +p + | This section contains an overview of the most important + | #[strong new features and improvements]. The #[+a("/docs/api") API docs] + | include additional deprecation notes. New methods and functions that + | were introduced in this version are marked with a #[+tag-new(2)] tag. + +p + | To help you make the most of v2.0, we also + | #[strong re-wrote almost all of the usage guides and API docs], and added + | more real-world examples. If you're new to spaCy, or just want to brush + | up on some NLP basics and the details of the library, check out + | the #[+a("/docs/usage/spacy-101") spaCy 101 guide] that explains the most + | important concepts with examples and illustrations. + +h(3, "features-pipelines") Improved processing pipelines +aside-code("Example"). @@ -97,9 +111,6 @@ p | complex regular expressions. 
The language data has also been tidied up | and simplified. spaCy now also supports simple lookup-based lemmatization. -+image - include ../../assets/img/docs/language_data.svg - +infobox | #[strong API:] #[+api("language") #[code Language]] | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] @@ -126,10 +137,18 @@ p | #[strong API:] #[+api("matcher") #[code Matcher]] | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] -+h(3, "features-models") Neural network models for English, German, French and Spanish ++h(3, "features-models") Neural network models for English, German, French, Spanish and multi-language NER + ++aside-code("Example", "bash"). + python -m spacy download en # default English model + python -m spacy download de # default German model + python -m spacy download fr # default French model + python -m spacy download es # default Spanish model + python -m spacy download xx_ent_web_md # multi-language NER +infobox | #[strong Details:] #[+src(gh("spacy-models")) spacy-models] + | #[+a("/docs/api/language-models") Languages] | #[strong Usage:] #[+a("/docs/usage/models") Models] +h(2, "incompat") Backwards incompatibilities @@ -147,6 +166,10 @@ p +cell #[code spacy.orth] +cell #[code spacy.lang.xx.lex_attrs] + +row + +cell #[code cli.model] + +cell - + +row +cell #[code Language.save_to_directory] +cell #[+api("language#to_disk") #[code Language.to_disk]] diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 186fc5db3..b26fbc27a 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -58,6 +58,11 @@ p | The argument #[code options] lets you specify a dictionary of settings | to customise the layout, for example: ++aside("Important note") + | There's currently a known issue with the #[code compact] mode for long + | sentences with arrow spacing. If the spacing is larger than the arc + | itself, it'll cause the arc and its label to flip. 
+ +table(["Name", "Type", "Description", "Default"]) +row +cell #[code compact] @@ -330,11 +335,12 @@ p | It's certainly possible to just have your server return the markup. | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting] - | (XSS). All your user needs to do is find a way to make spaCy return one - | token #[code <script src="malicious-code.js"><script>]. - | Instead of relying on the server to render and sanitize HTML, you - | can do this on the client in JavaScript. displaCy.js creates - | the markup as DOM nodes and will never insert raw HTML. + | (XSS). All your user needs to do is find a way to make spaCy return text + | like #[code <script src="malicious-code.js"></script>], which + | is pretty easy in NER mode. Instead of relying on the server to render + | and sanitise HTML, you can do this on the client in JavaScript. + | displaCy.js creates the markup as DOM nodes and will never insert raw + | HTML. p | The #[code parse_deps] function takes a #[code Doc] object and returns