From d68dd1f251a26ba754ac8a0b5b6403758696efff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 May 2016 12:11:57 +0200 Subject: [PATCH 01/51] Add SENT_START attribute, for custom sentence boundary detection --- spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/tokens/doc.pyx | 16 ++++++++++++++++ 5 files changed, 20 insertions(+) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 073de3565..a8ee9cac0 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -83,6 +83,7 @@ cpdef enum attr_id_t: ENT_IOB ENT_TYPE HEAD + SENT_START SPACY PROB diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 49a1e0438..bf2687d22 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -85,6 +85,7 @@ IDS = { "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, "HEAD": HEAD, + "SENT_START": SENT_START, "SPACY": SPACY, "PROB": PROB, "LANG": LANG, diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 1a46f509f..0b713cb21 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -82,6 +82,7 @@ cpdef enum symbol_t: ENT_IOB ENT_TYPE HEAD + SENT_START SPACY PROB diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 662aca777..9f4009579 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -84,6 +84,7 @@ IDS = { "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, "HEAD": HEAD, + "SENT_START": SENT_START, "SPACY": SPACY, "PROB": PROB, diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 014b84746..faddba6ba 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -24,6 +24,7 @@ from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE +from ..attrs cimport SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..syntax.iterators import CHUNKERS from ..util import normalize_slice @@ -52,6 +53,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.dep elif feat_name == HEAD: return token.head + elif feat_name == SENT_START: + return token.sent_start elif feat_name == SPACY: return token.spacy elif feat_name == ENT_IOB: @@ -559,6 +562,7 @@ cdef class Doc: for i in range(self.length): self.c[i] = parsed[i] +<<<<<<< HEAD def from_array(self, attrs, int[:, :] array): """Load attributes from a numpy array. Write to a `Doc` object, from an `(M, N)` array of attributes. @@ -567,6 +571,18 @@ cdef class Doc: array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load. RETURNS (Doc): Itself. """ +======= + def from_array(self, attrs, array): + if SENT_START in attrs and HEAD in attrs: + raise ValueError( + "Conflicting attributes specified in doc.from_array():\n" + "(HEAD, SENT_START)\n" + "The HEAD attribute currently sets sentence boundaries implicitly,\n" + "based on the tree structure. This means the HEAD attribute would " + "potentially override the sentence boundaries set by SENT_START.\n" + "See https://github.com/spacy-io/spaCy/issues/235 for details and " + "workarounds, and to propose solutions.") +>>>>>>> 45ad8684... 
* Add SENT_START attribute cdef int i, col cdef attr_id_t attr_id cdef TokenC* tokens = self.c From 4917cbb4843b1b549350e30f98b49979943f8b45 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 May 2016 12:10:07 +0200 Subject: [PATCH 02/51] Include sent_start test --- spacy/tests/doc/test_token_api.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 2f784e678..d4d8aea8e 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -155,3 +155,15 @@ def test_doc_token_api_head_setter(en_tokenizer): assert doc[3].left_edge.i == 0 assert doc[4].left_edge.i == 0 assert doc[2].left_edge.i == 0 + + +def test_sent_start(en_tokenizer): + doc = en_tokenizer(u'This is a sentence. This is another.') + assert not doc[0].sent_start + assert not doc[5].sent_start + doc[5].sent_start = True + assert doc[5].sent_start + assert not doc[0].sent_start + doc.is_parsed = True + assert len(list(doc.sents)) == 2 + From 01e59e4e6e1afca9545c4f0caa52e2b00af74677 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 May 2016 11:53:20 +0200 Subject: [PATCH 03/51] * Add Token.sent_start property, re Issue #235 --- spacy/tokens/token.pyx | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 7dc970fa1..6039a84ee 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -279,6 +279,18 @@ cdef class Token: def __get__(self): return self.c.r_kids + property sent_start: + def __get__(self): + return self.c.sent_start + + def __set__(self, bint value): + if self.doc.is_parsed: + raise ValueError( + 'Refusing to write to token.sent_start if its document is parsed, ' + 'because this may cause inconsistent state. ' + 'See https://github.com/spacy-io/spaCy/issues/235 for workarounds.') + self.c.sent_start = value + property lefts: def __get__(self): """ From d44b1eafc426e0007a7c65f334872b2f39a2d890 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 May 2017 18:47:11 +0200 Subject: [PATCH 04/51] Fix conflict artefacts --- spacy/tokens/doc.pyx | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index faddba6ba..0e4faafbe 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -562,16 +562,6 @@ cdef class Doc: for i in range(self.length): self.c[i] = parsed[i] -<<<<<<< HEAD - def from_array(self, attrs, int[:, :] array): - """Load attributes from a numpy array. Write to a `Doc` object, from an - `(M, N)` array of attributes. - - attrs (ints): A list of attribute ID ints. - array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load. - RETURNS (Doc): Itself. - """ -======= def from_array(self, attrs, array): if SENT_START in attrs and HEAD in attrs: raise ValueError( @@ -582,7 +572,6 @@ cdef class Doc: "potentially override the sentence boundaries set by SENT_START.\n" "See https://github.com/spacy-io/spaCy/issues/235 for details and " "workarounds, and to propose solutions.") ->>>>>>> 45ad8684... 
* Add SENT_START attribute cdef int i, col cdef attr_id_t attr_id cdef TokenC* tokens = self.c From 05761e1750e3bd31ef19839abd3415e9ebf3a601 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:11:38 +0200 Subject: [PATCH 05/51] Allow size on procon icon --- website/_includes/_mixins-base.jade | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 5a7a535c9..c42994e8f 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -42,10 +42,11 @@ mixin icon(name, size) //- Pro/Con/Neutral icon icon - [string] "pro", "con" or "neutral" (default: "neutral") + size - [integer] icon size (optional) -mixin procon(icon) +mixin procon(icon, size) - colors = { pro: "green", con: "red", neutral: "yellow" } - +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) + +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) //- Headlines Helper Mixin From 7e5163402e7bcbc09507484261c00501dc646de3 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:26 +0200 Subject: [PATCH 06/51] Allow clipping code block to height and add docs --- website/_includes/_mixins.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index f9960b71f..250865884 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -103,9 +103,11 @@ mixin button(url, trusted, ...style) label - [string] aside title (optional or false for no label) language - [string] language for syntax highlighting (default: "python") supports basic relevant languages available for PrismJS + icon - [string] icon to display next to code block, mostly used for old/new + height - [integer] optional height to clip code block to -mixin code(label, language, icon) - pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes) +mixin code(label, language, icon, height) + pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label From 00ede349dc02a4fc73aa06de7e9243fa0ba8a717 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:37 +0200 Subject: [PATCH 07/51] Add table row for linguistic annotations --- website/_includes/_mixins.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 250865884..f815d9c4a 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -352,7 +352,22 @@ mixin pos-row(tag, pos, morph, desc) | #[code=m] +cell.u-text-small=desc + mixin dep-row(label, desc) +row +cell #[code=label] +cell=desc + + +//- Table rows for linguistic annotations + annots [array] - array of cell content + style [array] array of 1 (display as code) or 0 (display as text) + +mixin annotation-row(annots, style) + +row + for cell, i in annots + if style && style[i] + - cell = (typeof(cell) != 'boolean') ? cell : cell ? 
'True' : 'False' + +cell #[code=cell] + else + +cell=cell From 0a8a2d2f6dcc2f10a6b684f42b71d9eeefb9a3b3 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:51 +0200 Subject: [PATCH 08/51] Remove tip infoboxes from annotation docs --- website/docs/api/_annotation/_dep-labels.jade | 5 ----- website/docs/api/_annotation/_named-entities.jade | 5 ----- website/docs/api/_annotation/_pos-tags.jade | 5 ----- 3 files changed, 15 deletions(-) diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade index 9e1e89324..427b2f53a 100644 --- a/website/docs/api/_annotation/_dep-labels.jade +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > DEPENDENCY LABELS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a label. For example, - | #[code spacy.explain("prt")] will return "particle". - +h(3, "dependency-parsing-english") English dependency labels p diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 68b3bd17d..476659d4a 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > NAMED ENTITIES -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of an entity label. For example, - | #[code spacy.explain("LANGUAGE")] will return "any named language". - +table([ "Type", "Description" ]) +row +cell #[code PERSON] diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3ceef777..ea3a225bf 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > POS TAGS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a tag. For example, - | #[code spacy.explain("RB")] will return "adverb". - +h(3, "pos-tagging-english") English part-of-speech tag scheme p From c8bde2161cf199665d2a2e9eab87ecbb2af53a39 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:14:02 +0200 Subject: [PATCH 09/51] Add kwargs to spacy.load --- website/docs/api/spacy.jade | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index da8c97b9c..6ad88c1a8 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -33,6 +33,11 @@ p +cell unicode or #[code Path] +cell Model to load, i.e. shortcut link, package name or path. + +row + +cell #[code **overrides] + +cell - + +cell Override or disable components. 
+ +footrow +cell returns +cell #[code Language] From 6ef09d7ed8957c46ac90afb065f2da06662f03ac Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:31 +0200 Subject: [PATCH 10/51] Change save_to_directory to to_disk --- website/docs/usage/saving-loading.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..b11007683 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -3,11 +3,11 @@ include ../../_includes/_mixins p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the - | #[+api("language#save_to_directory") #[code Language.save_to_directory()]] + | #[+api("language#to_disk") #[code Language.to_disk()]] | method: +code. - nlp.save_to_directory('/home/me/data/en_example_model') + nlp.to_disk('/home/me/data/en_example_model') p | The directory will be created if it doesn't exist, and the whole pipeline From 3aff8834344071974503d7a9b819260161273448 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:39 +0200 Subject: [PATCH 11/51] Add displaCy examples to lightning tour --- website/docs/usage/lightning-tour.jade | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 967d0c61e..24654b853 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -24,6 +24,23 @@ p en_doc = en_nlp(u'Hello, world. Here are two sentences.') de_doc = de_nlp(u'ich bin ein Berliner.') ++h(2, "displacy-dep") Visualize a dependency parse in your browser + ++code. + from spacy import displacy + + doc = nlp(u'This is a sentence.') + displacy.serve(doc, style='dep') + ++h(2, "displacy-ent") Visualize named entities in your browser + ++code. + from spacy import displacy + + doc = nlp(u'When Sebastian Thrun started working on self-driving cars at ' + u'Google in 2007, few people outside of the company took him seriously.') + displacy.serve(doc, style='ent') + +h(2, "multi-threaded") Multi-threaded generator +code. From 786af87ffbc4f6dd98ec149c074c8cbd60fa9a6b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:50 +0200 Subject: [PATCH 12/51] Update IOB docs --- website/docs/api/token.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 9be41081c..744446ec2 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -338,8 +338,10 @@ p The L2 norm of the token's vector representation. +cell #[code ent_iob] +cell int +cell - | IOB code of named entity tag. - | #[code 1="I", 2="O", 3="B"]. #[code 0] means no tag is assigned. + | IOB code of named entity tag. #[code "B"] + | means the token begins an entity, #[code "I"] means it is inside + | an entity, #[code "O"] means it is outside an entity, and + | #[code ""] means no entity tag is set. 
 +row
   +cell #[code ent_iob_]

From a38393e2f624b6c58806acdc18015329e75542d5 Mon Sep 17 00:00:00 2001
From: ines
Date: Tue, 23 May 2017 23:16:17 +0200
Subject: [PATCH 13/51] Update annotation docs

---
 website/docs/api/annotation.jade | 38 +++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade
index bc723b5c6..048e69897 100644
--- a/website/docs/api/annotation.jade
+++ b/website/docs/api/annotation.jade
@@ -14,11 +14,12 @@ p
   | (#[code ' ']) is included as a token.

 +aside-code("Example").
-    from spacy.en import English
-    nlp = English(parser=False)
+    from spacy.lang.en import English
+    nlp = English()
     tokens = nlp('Some\nspaces and\ttab characters')
-    print([t.orth_ for t in tokens])
-    # ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
+    tokens_text = [t.text for t in tokens]
+    assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and',
+                           '\t', 'tab', 'characters']

 p
   | The whitespace tokens are useful for much the same reason punctuation is
@@ -38,6 +39,11 @@ p

 +h(2, "pos-tagging") Part-of-speech Tagging

++aside("Tip: Understanding tags")
+  | You can also use #[code spacy.explain()] to get the description for the
+  | string representation of a tag. For example,
+  | #[code spacy.explain("RB")] will return "adverb".
+
 include _annotation/_pos-tags

 +h(2, "lemmatization") Lemmatization
@@ -50,25 +56,35 @@ p A "lemma" is the uninflected form of a word. In English, this means:
   +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children"
   +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written"

-+aside("About spaCy's custom pronoun lemma")
-  | Unlike verbs and common nouns, there's no clear base form of a personal
-  | pronoun. Should the lemma of "me" be "I", or should we normalize person
-  | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
-  | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for
-  | all personal pronouns.
-
 p
   | The lemmatization data is taken from
   | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a
   | special case for pronouns: all pronouns are lemmatized to the special
   | token #[code -PRON-].

++infobox("About spaCy's custom pronoun lemma")
+  | Unlike verbs and common nouns, there's no clear base form of a personal
+  | pronoun. Should the lemma of "me" be "I", or should we normalize person
+  | as well, giving "it" — or maybe "he"? spaCy's solution is to introduce a
+  | novel symbol, #[code -PRON-], which is used as the lemma for
+  | all personal pronouns.
+
 +h(2, "dependency-parsing") Syntactic Dependency Parsing

++aside("Tip: Understanding labels")
+  | You can also use #[code spacy.explain()] to get the description for the
+  | string representation of a label. For example,
+  | #[code spacy.explain("prt")] will return "particle".
+
 include _annotation/_dep-labels

 +h(2, "named-entities") Named Entity Recognition

++aside("Tip: Understanding entity types")
+  | You can also use #[code spacy.explain()] to get the description for the
+  | string representation of an entity label. For example,
+  | #[code spacy.explain("LANGUAGE")] will return "any named language".
+ include _annotation/_named-entities +h(3, "biluo") BILUO Scheme From 3523715d52a318329f238e0bc6d3f14ebf248533 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:16:31 +0200 Subject: [PATCH 14/51] Add spaCy 101 components --- .../usage/_spacy-101/_named-entities.jade | 38 +++++ website/docs/usage/_spacy-101/_pos-deps.jade | 62 +++++++ .../docs/usage/_spacy-101/_similarity.jade | 44 +++++ .../docs/usage/_spacy-101/_tokenization.jade | 18 +++ .../docs/usage/_spacy-101/_word-vectors.jade | 152 ++++++++++++++++++ 5 files changed, 314 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_named-entities.jade create mode 100644 website/docs/usage/_spacy-101/_pos-deps.jade create mode 100644 website/docs/usage/_spacy-101/_similarity.jade create mode 100644 website/docs/usage/_spacy-101/_tokenization.jade create mode 100644 website/docs/usage/_spacy-101/_word-vectors.jade diff --git a/website/docs/usage/_spacy-101/_named-entities.jade b/website/docs/usage/_spacy-101/_named-entities.jade new file mode 100644 index 000000000..a3c539564 --- /dev/null +++ b/website/docs/usage/_spacy-101/_named-entities.jade @@ -0,0 +1,38 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > NAMED ENTITIES + +p + | A named entity is a "real-world object" that's assigned a name – for + | example, a person, a country, a product or a book title. spaCy can + | #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types] + | of named entities in a document, by asking the model for a + | #[strong prediction]. Because models are statistical and strongly depend + | on the examples they were trained on, this doesn't always work + | #[em perfectly] and might need some tuning later, depending on your use + | case. + +p + | Named entities are available as the #[code ents] property of a #[code Doc]: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for ent in doc.ents: + print(ent.text, ent.start_char, ent.end_char, ent.label_) + ++aside + | #[strong Text]: The original entity text.#[br] + | #[strong Start]: Index of start of entity in the #[code Doc].#[br] + | #[strong End]: Index of end of entity in the #[code Doc].#[br] + | #[strong Label]: Entity label, i.e. type. + ++table(["Text", "Start", "End", "Label", "Description"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Apple", 0, 5, "ORG", "Companies, agencies, institutions."], style) + +annotation-row(["U.K.", 27, 31, "GPE", "Geopolitical entity, i.e. countries, cities, states."], style) + +annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style) + +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its named entities look like: + ++codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160) diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade new file mode 100644 index 000000000..5aa719c23 --- /dev/null +++ b/website/docs/usage/_spacy-101/_pos-deps.jade @@ -0,0 +1,62 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING + +p + | After tokenization, spaCy can also #[strong parse] and #[strong tag] a + | given #[code Doc]. This is where the statistical model comes in, which + | enables spaCy to #[strong make a prediction] of which tag or label most + | likely applies in this context. 
A model consists of binary data and is + | produced by showing a system enough examples for it to make predictions + | that generalise across the language – for example, a word following "the" + | in English is most likely a noun. + +p + | Linguistic annotations are available as + | #[+api("token#attributes") #[code Token] attributes]. Like many NLP + | libraries, spaCy #[strong encodes all strings to integers] to reduce + | memory usage and improve efficiency. So to get the readable string + | representation of an attribute, we need to add an underscore #[code _] + | to its name: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for token in doc: + print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, + token.shape_, token.is_alpha, token.is_stop) + ++aside + | #[strong Text:] The original word text.#[br] + | #[strong Lemma:] The base form of the word.#[br] + | #[strong POS:] The simple part-of-speech tag.#[br] + | #[strong Tag:] ...#[br] + | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br] + | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br] + | #[strong is alpha:] Is the token an alpha character?#[br] + | #[strong is stop:] Is the token part of a stop list, i.e. the most common + | words of the language?#[br] + ++table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"]) + - var style = [0, 0, 1, 1, 1, 1, 1, 1] + +annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style) + +annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style) + +annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style) + +annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style) + +annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style) + +annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style) + +annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style) + +annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style) + +annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style) + +annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style) + +annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style) + ++aside("Tip: Understanding tags and labels") + | Most of the tags and labels look pretty abstract, and they vary between + | languages. #[code spacy.explain()] will show you a short description – + | for example, #[code spacy.explain("VBZ")] returns "verb, 3rd person + | singular present". + +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its dependencies look like: + ++codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460) diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade new file mode 100644 index 000000000..c99bc9658 --- /dev/null +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -0,0 +1,44 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > SIMILARITY + +p + | spaCy is able to compare two objects, and make a prediction of + | #[strong how similar they are]. Predicting similarity is useful for + | building recommendation systems or flagging duplicates. 
For example, you + | can suggest a user content that's similar to what they're currently + | looking at, or label a support ticket as a duplicate, if it's very + | similar to an already existing one. + +p + | Each #[code Doc], #[code Span] and #[code Token] comes with a + | #[+api("token#similarity") #[code .similarity()]] method that lets you + | compare it with another object, and determine the similarity. Of course + | similarity is always subjective – whether "dog" and "cat" are similar + | really depends on how you're looking at it. spaCy's similarity model + | usually assumes a pretty general-purpose definition of similarity. + ++code. + tokens = nlp(u'dog cat banana') + + for token1 in tokens: + for token2 in tokens: + print(token1.similarity(token2)) + ++aside + | #[strong #[+procon("neutral", 16)] similarity:] identical#[br] + | #[strong #[+procon("pro", 16)] similarity:] similar (higher is more similar) #[br] + | #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar) + ++table(["", "dog", "cat", "banana"]) + each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]} + +row + +cell.u-text-label.u-color-theme=label + for cell in cells + +cell #[code=cell.toFixed(2)] + | #[+procon(cell < 0.5 ? "con" : cell != 1 ? "pro" : "neutral")] + +p + | In this case, the model's predictions are pretty on point. A dog is very + | similar to a cat, whereas a banana is not very similar to either of them. + | Identical tokens are obviously 100% similar to each other (just not always + | exactly #[code 1.0], because of vector math and floating point + | imprecisions). diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade new file mode 100644 index 000000000..28fd448b4 --- /dev/null +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -0,0 +1,18 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > TOKENIZATION + +p + | During processing, spaCy first #[strong tokenizes] the text, i.e. + | segments it into words, punctuation and so on. For example, punctuation + | at the end of a sentence should be split off – whereas "U.K." should + | remain one token. This is done by applying rules specific to each + | language. Each #[code Doc] consists of individual tokens, and we can + | simply iterate over them: + ++code. + for token in doc: + print(token.text) + ++table([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).u-text-center + +row + for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] + +cell=cell diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade new file mode 100644 index 000000000..4ed8e4c78 --- /dev/null +++ b/website/docs/usage/_spacy-101/_word-vectors.jade @@ -0,0 +1,152 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > WORD VECTORS + +p + | Similarity is determined by comparing #[strong word vectors] or "word + | embeddings", multi-dimensional meaning representations of a word. Word + | vectors can be generated using an algorithm like + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's + | #[+a("/docs/usage/models") default models] come with + | #[strong 300-dimensional vectors], that look like this: + ++code("banana.vector", false, false, 250). 
+ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, + 3.28450017e-02, -4.19569999e-01, 7.20689967e-02, + -3.74760002e-01, 5.74599989e-02, -1.24009997e-02, + 5.29489994e-01, -5.23800015e-01, -1.97710007e-01, + -3.41470003e-01, 5.33169985e-01, -2.53309999e-02, + 1.73800007e-01, 1.67720005e-01, 8.39839995e-01, + 5.51070012e-02, 1.05470002e-01, 3.78719985e-01, + 2.42750004e-01, 1.47449998e-02, 5.59509993e-01, + 1.25210002e-01, -6.75960004e-01, 3.58420014e-01, + -4.00279984e-02, 9.59490016e-02, -5.06900012e-01, + -8.53179991e-02, 1.79800004e-01, 3.38669986e-01, + 1.32300004e-01, 3.10209990e-01, 2.18779996e-01, + 1.68530002e-01, 1.98740005e-01, -5.73849976e-01, + -1.06490001e-01, 2.66689986e-01, 1.28380001e-01, + -1.28030002e-01, -1.32839993e-01, 1.26570001e-01, + 8.67229998e-01, 9.67210010e-02, 4.83060002e-01, + 2.12709993e-01, -5.49900010e-02, -8.24249983e-02, + 2.24079996e-01, 2.39749998e-01, -6.22599982e-02, + 6.21940017e-01, -5.98999977e-01, 4.32009995e-01, + 2.81430006e-01, 3.38420011e-02, -4.88150001e-01, + -2.13589996e-01, 2.74010003e-01, 2.40950003e-01, + 4.59500015e-01, -1.86049998e-01, -1.04970002e+00, + -9.73049998e-02, -1.89080000e-01, -7.09290028e-01, + 4.01950002e-01, -1.87680006e-01, 5.16870022e-01, + 1.25200003e-01, 8.41499984e-01, 1.20970003e-01, + 8.82389992e-02, -2.91959997e-02, 1.21510006e-03, + 5.68250008e-02, -2.74210006e-01, 2.55640000e-01, + 6.97930008e-02, -2.22580001e-01, -3.60060006e-01, + -2.24020004e-01, -5.36990017e-02, 1.20220006e+00, + 5.45350015e-01, -5.79980016e-01, 1.09049998e-01, + 4.21669990e-01, 2.06619993e-01, 1.29360005e-01, + -4.14570011e-02, -6.67770028e-01, 4.04670000e-01, + -1.52179999e-02, -2.76400000e-01, -1.56110004e-01, + -7.91980028e-02, 4.00369987e-02, -1.29439995e-01, + -2.40900001e-04, -2.67850012e-01, -3.81150007e-01, + -9.72450018e-01, 3.17259997e-01, -4.39509988e-01, + 4.19340014e-01, 1.83530003e-01, -1.52600005e-01, + -1.08080000e-01, -1.03579998e+00, 7.62170032e-02, + 1.65189996e-01, 2.65259994e-04, 1.66160002e-01, + -1.52810007e-01, 1.81229994e-01, 7.02740014e-01, + 5.79559989e-03, 5.16639985e-02, -5.97449988e-02, + -2.75510013e-01, -3.90489995e-01, 6.11319989e-02, + 5.54300010e-01, -8.79969969e-02, -4.16810006e-01, + 3.28260005e-01, -5.25489986e-01, -4.42880005e-01, + 8.21829960e-03, 2.44859993e-01, -2.29819998e-01, + -3.49810004e-01, 2.68940002e-01, 3.91660005e-01, + -4.19039994e-01, 1.61909997e-01, -2.62630010e+00, + 6.41340017e-01, 3.97430003e-01, -1.28680006e-01, + -3.19460005e-01, -2.56330013e-01, -1.22199997e-01, + 3.22750002e-01, -7.99330026e-02, -1.53479993e-01, + 3.15050006e-01, 3.05909991e-01, 2.60120004e-01, + 1.85530007e-01, -2.40429997e-01, 4.28860001e-02, + 4.06219989e-01, -2.42559999e-01, 6.38700008e-01, + 6.99829996e-01, -1.40430003e-01, 2.52090007e-01, + 4.89840001e-01, -6.10670000e-02, -3.67659986e-01, + -5.50890028e-01, -3.82649988e-01, -2.08430007e-01, + 2.28320003e-01, 5.12179971e-01, 2.78679997e-01, + 4.76520002e-01, 4.79510017e-02, -3.40079993e-01, + -3.28729987e-01, -4.19669986e-01, -7.54989982e-02, + -3.89539987e-01, -2.96219997e-02, -3.40700001e-01, + 2.21699998e-01, -6.28560036e-02, -5.19029975e-01, + -3.77739996e-01, -4.34770016e-03, -5.83010018e-01, + -8.75459984e-02, -2.39289999e-01, -2.47109994e-01, + -2.58870006e-01, -2.98940003e-01, 1.37150005e-01, + 2.98919994e-02, 3.65439989e-02, -4.96650010e-01, + -1.81600004e-01, 5.29389977e-01, 2.19919994e-01, + -4.45140004e-01, 3.77979994e-01, -5.70620000e-01, + -4.69460003e-02, 8.18059966e-02, 1.92789994e-02, + 3.32459986e-01, -1.46200001e-01, 
1.71560004e-01, + 3.99809986e-01, 3.62170011e-01, 1.28160000e-01, + 3.16439986e-01, 3.75690013e-01, -7.46899992e-02, + -4.84800003e-02, -3.14009994e-01, -1.92860007e-01, + -3.12940001e-01, -1.75529998e-02, -1.75139993e-01, + -2.75870003e-02, -1.00000000e+00, 1.83870003e-01, + 8.14339995e-01, -1.89129993e-01, 5.09989977e-01, + -9.19600017e-03, -1.92950002e-03, 2.81890005e-01, + 2.72470005e-02, 4.34089988e-01, -5.49669981e-01, + -9.74259973e-02, -2.45399997e-01, -1.72030002e-01, + -8.86500031e-02, -3.02980006e-01, -1.35910004e-01, + -2.77649999e-01, 3.12860007e-03, 2.05559999e-01, + -1.57720000e-01, -5.23079991e-01, -6.47010028e-01, + -3.70139986e-01, 6.93930015e-02, 1.14009999e-01, + 2.75940001e-01, -1.38750002e-01, -2.72680014e-01, + 6.68910027e-01, -5.64539991e-02, 2.40170002e-01, + -2.67300010e-01, 2.98599988e-01, 1.00830004e-01, + 5.55920005e-01, 3.28489989e-01, 7.68579990e-02, + 1.55279994e-01, 2.56359994e-01, -1.07720003e-01, + -1.23590000e-01, 1.18270002e-01, -9.90289971e-02, + -3.43279988e-01, 1.15019999e-01, -3.78080010e-01, + -3.90120000e-02, -3.45930010e-01, -1.94040000e-01, + -3.35799992e-01, -6.23340011e-02, 2.89189994e-01, + 2.80319989e-01, -5.37410021e-01, 6.27939999e-01, + 5.69549985e-02, 6.21469975e-01, -2.52819985e-01, + 4.16700006e-01, -1.01079997e-02, -2.54339993e-01, + 4.00029987e-01, 4.24320012e-01, 2.26720005e-01, + 1.75530002e-01, 2.30489999e-01, 2.83230007e-01, + 1.38820007e-01, 3.12180002e-03, 1.70570001e-01, + 3.66849989e-01, 2.52470002e-03, -6.40089989e-01, + -2.97650009e-01, 7.89430022e-01, 3.31680000e-01, + -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32) + +p + | The #[code .vector] attribute will return an object's vector. + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] will default to an average + | of their token vectors. You can also check if a token has a vector + | assigned, and get the L2 norm, which can be used to normalise + | vectors. + ++code. + tokens = nlp(u'dog cat banana sasquatch') + + for token in tokens: + print(token.text, token.has_vector, token.vector_norm, token.is_oov) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong has vector]: Does the token have a vector representation?#[br] + | #[strong Vector norm]: The L2 norm of the token's vector (the square root + | of the sum of the values squared)#[br] + | #[strong is OOV]: Is the word out-of-vocabulary? + ++table(["Text", "Has vector", "Vector norm", "OOV"]) + - var style = [0, 1, 1, 1] + +annotation-row(["dog", true, 7.033672992262838, false], style) + +annotation-row(["cat", true, 6.68081871208896, false], style) + +annotation-row(["banana", true, 6.700014292148571, false], style) + +annotation-row(["sasquatch", false, 0, true], style) + +p + | The words "dog", "cat" and "banana" are all pretty common in English, so + | they're part of the model's vocabulary, and come with a vector. The word + | "sasquatch" on the other hand is a lot less common and out-of-vocabulary + | – so its vector representation consists of 300 dimensions of #[code 0], + | which means it's practically nonexistent. + +p + | If your application will benefit from a large vocabulary with more + | vectors, you should consider using one of the + | #[+a("/docs/usage/models#available") larger models] instead of the default, + | smaller ones, which usually come with a clipped vocabulary. 
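A minimal sketch of the behaviour described in the word-vectors document added above, where Doc.vector and Span.vector default to an average of the token vectors. This example is illustrative only and not part of the patch series; it assumes a model shipping word vectors is installed, and the model name 'en_core_web_md' is an assumption that may differ between spaCy versions.

    import numpy
    import spacy

    # Assumption: a model with word vectors is installed under this name.
    nlp = spacy.load('en_core_web_md')
    doc = nlp(u'dog cat banana')

    # Doc.vector defaults to the average of the token vectors.
    token_average = numpy.mean([token.vector for token in doc], axis=0)
    assert numpy.allclose(doc.vector, token_average)

    # Span.vector behaves the same way for a slice of the Doc.
    span = doc[0:2]
    assert numpy.allclose(span.vector,
                          numpy.mean([t.vector for t in span], axis=0))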
From a433e5012a901bb47ffc34fadb0af2514171b289 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:16:44 +0200 Subject: [PATCH 15/51] Update adding languages docs --- website/docs/usage/adding-languages.jade | 43 ++++++++---------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d1cb1887c..f77acdf24 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -436,6 +436,8 @@ p +h(3, "morph-rules") Morph rules +//- TODO: write morph rules section + +h(2, "testing") Testing the new language tokenizer p @@ -626,37 +628,20 @@ p | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. | The #[code vectors.bin] file should consist of one word and vector per line. -+h(2, "model-directory") Setting up a model directory - -p - | Once you've collected the word frequencies, Brown clusters and word - | vectors files, you can use the - | #[+a("/docs/usage/cli#model") #[code model] command] to create a data - | directory: - -+code(false, "bash"). - python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] - +aside-code("your_data_directory", "yaml"). β”œβ”€β”€ vocab/ - | β”œβ”€β”€ lexemes.bin # via nlp.vocab.dump(path) - | β”œβ”€β”€ strings.json # via nlp.vocab.strings.dump(file_) - | └── oov_prob # optional - β”œβ”€β”€ pos/ # optional - | β”œβ”€β”€ model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train - β”œβ”€β”€ deps/ # optional - | β”œβ”€β”€ model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train - └── ner/ # optional - β”œβ”€β”€ model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train - -p - | This creates a spaCy data directory with a vocabulary model, ready to be - | loaded. By default, the command expects to be able to find your language - | class using #[code spacy.util.get_lang_class(lang_id)]. - + | β”œβ”€β”€ lexemes.bin + | β”œβ”€β”€ strings.json + | └── oov_prob + β”œβ”€β”€ pos/ + | β”œβ”€β”€ model + | └── config.json + β”œβ”€β”€ deps/ + | β”œβ”€β”€ model + | └── config.json + └── ner/ + β”œβ”€β”€ model + └── config.json +h(2, "train-tagger-parser") Training the tagger and parser From 1c06ef35427e5b495eab09a4d165bcec588bdead Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:25 +0200 Subject: [PATCH 16/51] Update spaCy architecture --- website/docs/usage/spacy-101.jade | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index daace114b..06f88ace2 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -8,3 +8,85 @@ include ../../_includes/_mixins include ../../assets/img/docs/architecture.svg .u-text-right +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic + ++table(["Name", "Description"]) + +row + +cell #[+api("language") #[code Language]] + +cell + | A text-processing pipeline. Usually you'll load this once per + | process as #[code nlp] and pass the instance around your application. + + +row + +cell #[+api("doc") #[code Doc]] + +cell A container for accessing linguistic annotations. + + +row + +cell #[+api("span") #[code Span]] + +cell A slice from a #[code Doc] object. + + +row + +cell #[+api("token") #[code Token]] + +cell + | An individual token β€” i.e. a word, punctuation symbol, whitespace, + | etc. 
+ + +row + +cell #[+api("lexeme") #[code Lexeme]] + +cell + | An entry in the vocabulary. It's a word type with no context, as + | opposed to a word token. It therefore has no part-of-speech tag, + | dependency parse etc. + + +row + +cell #[+api("vocab") #[code Vocab]] + +cell + | A lookup table for the vocabulary that allows you to access + | #[code Lexeme] objects. + + +row + +cell #[code Morphology] + +cell + + +row + +cell #[+api("stringstore") #[code StringStore]] + +cell Map strings to and from integer IDs. + + +row + +row + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell + | Segment text, and create #[code Doc] objects with the discovered + | segment boundaries. + + +row + +cell #[+api("tagger") #[code Tagger]] + +cell Annotate part-of-speech tags on #[code Doc] objects. + + +row + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell Annotate syntactic dependencies on #[code Doc] objects. + + +row + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell + | Annotate named entities, e.g. persons or products, on #[code Doc] + | objects. + + +row + +cell #[+api("matcher") #[code Matcher]] + +cell + | Match sequences of tokens, based on pattern rules, similar to + | regular expressions. + ++h(3, "architecture-other") Other + ++table(["Name", "Description"]) + +row + +cell #[+api("goldparse") #[code GoldParse]] + +cell Collection for training annotations. + + +row + +cell #[+api("goldcorpus") #[code GoldCorpus]] + +cell + | An annotated corpus, using the JSON file format. Manages + | annotations for tagging, dependency parsing and NER. From 61cf2bba5518fa97009631b46f8bc2bca7a9a9c6 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:37 +0200 Subject: [PATCH 17/51] Fix code example --- website/docs/usage/visualizers.jade | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 93a4b5567..fe779add9 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -314,3 +314,4 @@ p 'text': 'But Google is starting from behind.', 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None + } From 43258d6b0a3e0c265c873d6e7e41bb62ca331cf2 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:57 +0200 Subject: [PATCH 18/51] Update NER workflow --- website/docs/usage/entity-recognition.jade | 205 ++++++++++++--------- 1 file changed, 116 insertions(+), 89 deletions(-) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 2c3116b82..bcad07baa 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -9,14 +9,12 @@ p | locations, organizations and products. You can add arbitrary classes to | the entity recognition system, and update the model with new examples. -+aside-code("Example"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'London is a big city in the United Kingdom.') - for ent in doc.ents: - print(ent.label_, ent.text) - # GPE London - # GPE United Kingdom ++h(2, "101") Named Entity Recognition 101 + +tag-model("named entities") + +include _spacy-101/_named-entities + ++h(2, "accessing") Accessing entity annotations p | The standard way to access entity annotations is the @@ -26,56 +24,89 @@ p | #[code ent.label] and #[code ent.label_]. The #[code Span] object acts | as a sequence of tokens, so you can iterate over the entity or index into | it. 
You can also get the text form of the whole entity, as though it were
-  | a single token. See the #[+api("span") API reference] for more details.
+  | a single token.

 p
-  | You can access token entity annotations using the #[code token.ent_iob]
-  | and #[code token.ent_type] attributes. The #[code token.ent_iob]
-  | attribute indicates whether an entity starts, continues or ends on the
-  | tag (In, Begin, Out).
+  | You can also access token entity annotations using the
+  | #[+api("token#attributes") #[code token.ent_iob]] and
+  | #[+api("token#attributes") #[code token.ent_type]] attributes.
+  | #[code token.ent_iob] indicates whether an entity starts, continues or
+  | ends on the tag. If no entity type is set on a token, it will return an
+  | empty string.
+
++aside("IOB Scheme")
+  | #[code I] – Token is inside an entity.#[br]
+  | #[code O] – Token is outside an entity.#[br]
+  | #[code B] – Token is the beginning of an entity.#[br]

 +code("Example").
-  doc = nlp(u'London is a big city in the United Kingdom.')
-  print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_)
-  # (u'London', 2, u'GPE')
-  print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_)
-  # (u'is', 3, u'')
+  doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
+
+  # document level
+  ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+  assert ents == [(u'San Francisco', 0, 13, u'GPE')]
+
+  # token level
+  ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
+  ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
+  assert ent_san == [u'San', u'B', u'GPE']
+  assert ent_francisco == [u'Francisco', u'I', u'GPE']
+
++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"])
+  - var style = [0, 1, 1, 1, 1, 0]
+  +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style)
+  +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style)
+  +annotation-row(["considers", 2, "O", 0, '""', "outside an entity"], style)
+  +annotation-row(["banning", 2, "O", 0, '""', "outside an entity"], style)
+  +annotation-row(["sidewalk", 2, "O", 0, '""', "outside an entity"], style)
+  +annotation-row(["delivery", 2, "O", 0, '""', "outside an entity"], style)
+  +annotation-row(["robots", 2, "O", 0, '""', "outside an entity"], style)

 +h(2, "setting") Setting entity annotations

 p
   | To ensure that the sequence of token annotations remains consistent, you
-  | have to set entity annotations at the document level — you can't write
-  | directly to the #[code token.ent_iob] or #[code token.ent_type]
-  | attributes. The easiest way to set entities is to assign to the
-  | #[code doc.ents] attribute.
+  | have to set entity annotations #[strong at the document level]. However,
+  | you can't write directly to the #[code token.ent_iob] or
+  | #[code token.ent_type] attributes, so the easiest way to set entities is
+  | to assign to the #[+api("doc#ents") #[code doc.ents]] attribute
+  | and create the new entity as a #[+api("span") #[code Span]].

 +code("Example").
-    doc = nlp(u'London is a big city in the United Kingdom.')
-    doc.ents = []
-    assert doc[0].ent_type_ == ''
-    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])]
-    assert doc[0].ent_type_ == 'GPE'
-    doc.ents = []
-    doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)]
+    from spacy.tokens import Span
+
+    doc = nlp(u'Netflix is hiring a new VP of global policy')
+    # the model didn't recognise any entities :(
+
+    ORG = doc.vocab.strings[u'ORG'] # get integer ID of entity label
+    netflix_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity
+    doc.ents = [netflix_ent]
+
+    ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+    assert ents == [(u'Netflix', 0, 7, u'ORG')]

 p
-  | The value you assign should be a sequence, the values of which
-  | can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)]
-  | tuples, where #[code start] and #[code end] are token offsets that
-  | describe the slice of the document that should be annotated.
+  | Keep in mind that you need to create a #[code Span] with the start and
+  | end index of the #[strong token], not the start and end index of the
+  | entity in the document. In this case, "Netflix" is token #[code (0, 1)] –
+  | but at the document level, the entity will have the start and end
+  | indices #[code (0, 7)].
+
++h(3, "setting-from-array") Setting entity annotations from array

 p
-  | You can also assign entity annotations using the #[code doc.from_array()]
-  | method. To do this, you should include both the #[code ENT_TYPE] and the
-  | #[code ENT_IOB] attributes in the array you're importing from.
+  | You can also assign entity annotations using the
+  | #[+api("doc#from_array") #[code doc.from_array()]] method. To do this,
+  | you should include both the #[code ENT_TYPE] and the #[code ENT_IOB]
+  | attributes in the array you're importing from.

-+code("Example").
-  from spacy.attrs import ENT_IOB, ENT_TYPE
++code.
   import numpy
+  from spacy.attrs import ENT_IOB, ENT_TYPE

   doc = nlp.make_doc(u'London is a big city in the United Kingdom.')
   assert list(doc.ents) == []
+
   header = [ENT_IOB, ENT_TYPE]
   attr_array = numpy.zeros((len(doc), len(header)))
   attr_array[0, 0] = 2 # B
@@ -83,12 +114,14 @@ p
   doc.from_array(header, attr_array)
   assert list(doc.ents)[0].text == u'London'

++h(3, "setting-cython") Setting entity annotations in Cython
+
 p
   | Finally, you can always write to the underlying struct, if you compile
-  | a Cython function. This is easy to do, and allows you to write efficient
-  | native code.
+  | a #[+a("http://cython.org/") Cython] function. This is easy to do, and
+  | allows you to write efficient native code.

-+code("Example").
++code.
   # cython: infer_types=True
   from spacy.tokens.doc cimport Doc

@@ -104,67 +137,30 @@ p
   | you'll have responsibility for ensuring that the data is left in a
   | consistent state.

-
-+h(2, "displacy") Visualizing named entities
-
-p
-  | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer]
-  | lets you explore an entity recognition model's behaviour interactively.
-  | If you're training a model, it's very useful to run the visualization
-  | yourself. To help you do that, spaCy v2.0+ comes with a visualization
-  | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to
-  | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to
-  | run the web server, or #[+api("displacy#render") #[code displacy.render]]
-  | to generate the raw markup.
- -p - | For more details and examples, see the - | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. - -+code("Named Entity example"). - import spacy - from spacy import displacy - - text = """But Google is starting from behind. The company made a late push - into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa - software, which runs on its Echo and Dot devices, have clear leads in - consumer adoption.""" - - nlp = spacy.load('custom_ner_model') - doc = nlp(text) - displacy.serve(doc, style='ent') - -+codepen("a73f8b68f9af3157855962b283b364e4", 345) - +h(2, "entity-types") Built-in entity types -include ../api/_annotation/_named-entities ++aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". -+aside("Install") - | The #[+api("load") #[code spacy.load()]] function configures a pipeline that - | includes all of the available annotators for the given ID. In the example - | above, the #[code 'en'] ID tells spaCy to load the default English - | pipeline. If you have installed the data with - | #[code python -m spacy download en], this will include the entity - | recognition model. +include ../api/_annotation/_named-entities +h(2, "updating") Training and updating p | To provide training examples to the entity recogniser, you'll first need - | to create an instance of the #[code GoldParse] class. You can specify - | your annotations in a stand-off format or as token tags. + | to create an instance of the #[+api("goldparse") #[code GoldParse]] class. + | You can specify your annotations in a stand-off format or as token tags. +code. - import spacy import random + import spacy from spacy.gold import GoldParse - from spacy.language import EntityRecognizer + from spacy.pipeline import EntityRecognizer - train_data = [ - ('Who is Chaka Khan?', [(7, 17, 'PERSON')]), - ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) - ] + train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])] nlp = spacy.load('en', entity=False, parser=False) ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC']) @@ -237,3 +233,34 @@ p | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | imitation learning strategy. The transition system is equivalent to the | BILOU tagging scheme. + ++h(2, "displacy") Visualizing named entities + +p + | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] + | lets you explore an entity recognition model's behaviour interactively. + | If you're training a model, it's very useful to run the visualization + | yourself. To help you do that, spaCy v2.0+ comes with a visualization + | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. + +p + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. + ++code("Named Entity example"). + import spacy + from spacy import displacy + + text = """But Google is starting from behind. 
The company made a late push + into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa + software, which runs on its Echo and Dot devices, have clear leads in + consumer adoption.""" + + nlp = spacy.load('custom_ner_model') + doc = nlp(text) + displacy.serve(doc, style='ent') + ++codepen("a73f8b68f9af3157855962b283b364e4", 345) From b6209e24271bcc141c21168e4592a5063e8bc2f2 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:18:08 +0200 Subject: [PATCH 19/51] Update POS tagging workflow --- website/docs/usage/pos-tagging.jade | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..245156b77 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -7,22 +7,12 @@ p | assigned to each token in the document. They're useful in rule-based | processes. They can also be useful features in some statistical models. -p - | To use spaCy's tagger, you need to have a data pack installed that - | includes a tagging model. Tagging models are included in the data - | downloads for English and German. After you load the model, the tagger - | is applied automatically, as part of the default pipeline. You can then - | access the tags using the #[+api("token") #[code Token.tag]] and - | #[+api("token") #[code token.pos]] attributes. For English, the tagger - | also triggers some simple rule-based morphological processing, which - | gives you the lemma as well. ++h(2, "101") Part-of-speech tagging 101 + +tag-model("dependency parse") -+code("Usage"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'They told us to duck.') - for word in doc: - print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_) +include _spacy-101/_pos-deps + ++aside("Help – spaCy's output is wrong!") +h(2, "rule-based-morphology") Rule-based morphology @@ -63,7 +53,8 @@ p +list("numbers") +item - | The tokenizer consults a #[strong mapping table] + | The tokenizer consults a + | #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table] | #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters | to be mapped to multiple tokens. Each token may be assigned a part | of speech and one or more morphological features. @@ -77,8 +68,9 @@ p +item | For words whose POS is not set by a prior process, a - | #[strong mapping table] #[code TAG_MAP] maps the tags to a - | part-of-speech and a set of morphological features. + | #[+a("/docs/usage/adding-languages#tag-map") mapping table] + | #[code TAG_MAP] maps the tags to a part-of-speech and a set of + | morphological features. 
+item | Finally, a #[strong rule-based deterministic lemmatizer] maps the From b6c62baab39e54c78b75104e0f2ec532ad3e69b8 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:18:53 +0200 Subject: [PATCH 20/51] Update What's new in v2 docs --- website/docs/usage/v2.jade | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 8faae9d32..d3941bba0 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -55,7 +55,23 @@ p | #[strong API:] #[+api("spacy#load") #[code spacy.load]] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] -+h(3, "features-language") Improved language data and processing pipelines ++h(3, "features-language") Improved language data and lazy loading + +p + | Language-specfic data now lives in its own submodule, #[code spacy.lang]. + | Languages are lazy-loaded, i.e. only loaded when you import a + | #[code Language] class, or load a model that initialises one. This allows + | languages to contain more custom data, e.g. lemmatizer lookup tables, or + | complex regular expressions. The language data has also been tidied up + | and simplified. It's now also possible to overwrite the functions that + | compute lexical attributes like #[code like_num], and supply + | language-specific syntax iterators, e.g. to determine noun chunks. + ++infobox + | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] + | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] + ++h(3, "features-pipelines") Improved processing pipelines +aside-code("Example"). from spacy.language import Language @@ -64,7 +80,7 @@ p +infobox | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] + | #[strong Usage:] #[+a("/docs/usage/processing-text") Processing text] +h(3, "features-lemmatizer") Simple lookup-based lemmatization @@ -95,7 +111,7 @@ p from spacy.matcher import Matcher from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', on_match=None, + matcher.add('HelloWorld', None, [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], [{LOWER: 'hello'}, {LOWER: 'world'}]) assert len(matcher) == 1 @@ -128,6 +144,18 @@ p +h(2, "incompat") Backwards incompatibilities +table(["Old", "New"]) + +row + +cell + | #[code spacy.en] + | #[code spacy.xx] + +cell + | #[code spacy.lang.en] + | #[code spacy.lang.xx] + + +row + +cell #[code spacy.orth] + +cell #[code spacy.lang.xx.lex_attrs] + +row +cell #[code Language.save_to_directory] +cell #[+api("language#to_disk") #[code Language.to_disk]] From af348025ecbe0229b016e341c1c9dc43625957f4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:19:09 +0200 Subject: [PATCH 21/51] Update word vectors & similarity workflow --- .../docs/usage/word-vectors-similarities.jade | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..00e200f59 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -6,46 +6,40 @@ p | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. 
- -+aside("Tip") - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. - -p - | spaCy makes using word vectors very easy. The - | #[+api("lexeme") #[code Lexeme]], #[+api("token") #[code Token]], - | #[+api("span") #[code Span]] and #[+api("doc") #[code Doc]] classes all - | have a #[code .vector] property, which is a 1-dimensional numpy array of - | 32-bit floats: - -+code. - import numpy - - apples, and_, oranges = nlp(u'apples and oranges') - print(apples.vector.shape) - # (1,) - apples.similarity(oranges) - -p - | By default, #[code Token.vector] returns the vector for its underlying - | lexeme, while #[code Doc.vector] and #[code Span.vector] return an - | average of the vectors of their tokens. You can customize these - | behaviours by modifying the #[code doc.user_hooks], - | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] - | dictionaries. - -+aside-code("Example"). - # TODO - -p - | The default English model installs vectors for one million vocabulary - | entries, using the 300-dimensional vectors trained on the Common Crawl + | family of algorithms. The default + | #[+a("/docs/usage/models#available") English model] installs + | 300-dimensional vectors trained on the Common Crawl | corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] | algorithm. The GloVe common crawl vectors have become a de facto | standard for practical NLP. -+aside-code("Example"). ++aside("Tip: Training a word2vec model") + | If you need to train a word2vec model, we recommend the implementation in + | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + ++h(2, "101") Similarity and word vectors 101 + +tag-model("vectors") + +include _spacy-101/_similarity +include _spacy-101/_word-vectors + + ++h(2, "custom") Customising word vectors + +p + | By default, #[+api("token#vector") #[code Token.vector]] returns the + | vector for its underlying #[+api("lexeme") #[code Lexeme]], while + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] return an average of the + | vectors of their tokens. + +p + | You can customize these + | behaviours by modifying the #[code doc.user_hooks], + | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] + | dictionaries. + ++code("Example"). # TODO p @@ -56,11 +50,14 @@ p | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a | path to a binary file written by #[code vocab.dump_vectors()]. -+aside-code("Example"). ++code("Example"). # TODO p - | You can also load vectors from memory, by writing to the #[code lexeme.vector] - | property. If the vectors you are writing are of different dimensionality + | You can also load vectors from memory by writing to the + | #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors + | you are writing are of different dimensionality | from the ones currently loaded, you should first call | #[code vocab.resize_vectors(new_size)]. 
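One way the "# TODO" placeholders above might eventually be filled in – a minimal sketch of overriding Doc.vector through the user_hooks dictionary described in the preceding paragraph. The pooling function and its name are illustrative assumptions, not part of this patch:

    import numpy
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'apples and oranges')

    def max_pool_vector(doc):
        # element-wise maximum over the token vectors, instead of the
        # average that Doc.vector uses by default
        return numpy.max([token.vector for token in doc], axis=0)

    doc.user_hooks['vector'] = max_pool_vector   # hook in the custom behaviour
    print(doc.vector.shape)                      # now computed by max_pool_vector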
+ ++h(2, "similarity") Similarity From fe24267948c75759f774130bb63c27fc3cf539ee Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:19:20 +0200 Subject: [PATCH 22/51] Update usage docs meta and navigation --- website/docs/usage/_data.json | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 8eca16a8c..45daa8381 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -3,13 +3,13 @@ "Get started": { "Installation": "./", "Models": "models", + "spaCy 101": "spacy-101", "Lightning tour": "lightning-tour", "Visualizers": "visualizers", "Troubleshooting": "troubleshooting", "What's new in v2.0": "v2" }, "Workflows": { - "spaCy 101": "spacy-101", "Loading the pipeline": "language-processing-pipeline", "Processing text": "processing-text", "spaCy's data model": "data-model", @@ -44,13 +44,18 @@ "models": { "title": "Models", - "next": "lightning-tour", + "next": "spacy-101", "quickstart": true }, + "spacy-101": { + "title": "spaCy 101", + "next": "lightning-tour" + }, + "lightning-tour": { "title": "Lightning tour", - "next": "spacy-101" + "next": "visualizers" }, "visualizers": { @@ -66,10 +71,6 @@ "title": "Resources" }, - "spacy-101": { - "title": "spaCy 101" - }, - "language-processing-pipeline": { "title": "Loading a language processing pipeline", "next": "processing-text" @@ -95,7 +96,7 @@ }, "entity-recognition": { - "title": "Entity recognition", + "title": "Named Entity Recognition", "next": "rule-based-matching" }, From 9ed6b48a49c289af307388e304f2a8ff2a25254a Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:34:39 +0200 Subject: [PATCH 23/51] Update dependency parse workflow --- website/docs/usage/dependency-parse.jade | 205 +++++++++++++++-------- 1 file changed, 132 insertions(+), 73 deletions(-) diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index 904522bd4..abfa1f825 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -8,55 +8,80 @@ p | boundary detection, and lets you iterate over base noun phrases, or | "chunks". -+aside-code("Example"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'I like green eggs and ham.') - for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.head.text) - # I I nsubj like - # green eggs eggs dobj like - # ham ham conj eggs - p | You can check whether a #[+api("doc") #[code Doc]] object has been | parsed with the #[code doc.is_parsed] attribute, which returns a boolean | value. If this attribute is #[code False], the default sentence iterator | will raise an exception. -+h(2, "displacy") The displaCy visualizer ++h(2, "noun-chunks") Noun chunks + +tag-model("dependency parse") -p - | The best way to understand spaCy's dependency parser is interactively, - | through the #[+a(DEMOS_URL + "/displacy", true) displaCy visualizer]. If - | you want to know how to write rules that hook into some type of syntactic - | construction, just plug the sentence into the visualizer and see how - | spaCy annotates it. +p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor. + ++code("Example"). 
+ nlp = spacy.load('en') + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for chunk in doc.noun_chunks: + print(chunk.text, chunk.root.text, chunk.root.dep_, + chunk.root.head.text) + ++aside + | #[strong Text:] The original noun chunk text.#[br] + | #[strong Root text:] ...#[br] + | #[strong Root dep:] ...#[br] + | #[strong Root head text:] ...#[br] + ++table(["Text", "root.text", "root.dep_", "root.head.text"]) + - var style = [0, 0, 1, 0] + +annotation-row(["Autonomous cars", "cars", "nsubj", "shift"], style) + +annotation-row(["insurance liability", "liability", "dobj", "shift"], style) + +annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style) +h(2, "navigating") Navigating the parse tree p - | spaCy uses the terms #[em head] and #[em child] to describe the words - | connected by a single arc in the dependency tree. The term #[em dep] is - | used for the arc label, which describes the type of syntactic relation - | that connects the child to the head. As with other attributes, the value - | of #[code token.dep] is an integer. You can get the string value with - | #[code token.dep_]. + | spaCy uses the terms #[strong head] and #[strong child] to describe the words + | #[strong connected by a single arc] in the dependency tree. The term + | #[strong dep] is used for the arc label, which describes the type of + | syntactic relation that connects the child to the head. As with other + | attributes, the value of #[code .dep] is an integer. You can get + | the string value with #[code .dep_]. -+aside-code("Example"). - from spacy.symbols import det - the, dog = nlp(u'the dog') - assert the.dep == det - assert the.dep_ == 'det' ++code("Example"). + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for token in doc: + print(token.text, token.dep_, token.head.text, token.head.pos_, + [child for child in token.children]) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong Dep]: The syntactic relation connecting child to head.#[br] + | #[strong Head text]: The original text of the token head.#[br] + | #[strong Head POS]: The part-of-speech tag of the token head.#[br] + | #[strong Children]: ... + ++table(["Text", "Dep", "Head text", "Head POS", "Children"]) + - var style = [0, 1, 0, 1, 0] + +annotation-row(["Autonomous", "amod", "cars", "NOUN", ""], style) + +annotation-row(["cars", "nsubj", "shift", "VERB", "Autonomous"], style) + +annotation-row(["shift", "ROOT", "shift", "VERB", "cars, liability"], style) + +annotation-row(["insurance", "compound", "liability", "NOUN", ""], style) + +annotation-row(["liability", "dobj", "shift", "VERB", "insurance, toward"], style) + +annotation-row(["toward", "prep", "liability", "NOUN", "manufacturers"], style) + +annotation-row(["manufacturers", "pobj", "toward", "ADP", ""], style) + ++codepen("dcf8d293367ca185b935ed2ca11ebedd", 370) p - | Because the syntactic relations form a tree, every word has exactly one - | head. You can therefore iterate over the arcs in the tree by iterating - | over the words in the sentence. This is usually the best way to match an - | arc of interest β€” from below: + | Because the syntactic relations form a tree, every word has + | #[strong exactly one head]. You can therefore iterate over the arcs in + | the tree by iterating over the words in the sentence. This is usually + | the best way to match an arc of interest β€” from below: +code. 
from spacy.symbols import nsubj, VERB + # Finding a verb with a subject from below β€” good verbs = set() for possible_subject in doc: @@ -82,6 +107,8 @@ p | attribute, which provides a sequence of #[+api("token") #[code Token]] | objects. ++h(3, "navigating-around") Iterating around the local tree + p | A few more convenience attributes are provided for iterating around the | local tree from the token. The #[code .lefts] and #[code .rights] @@ -90,55 +117,89 @@ p | two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], | that give the number of left and right children. -+aside-code("Examples"). - apples = nlp(u'bright red apples on the tree')[2] - print([w.text for w in apples.lefts]) - # ['bright', 'red'] - print([w.text for w in apples.rights]) - # ['on'] - assert apples.n_lefts == 2 - assert apples.n_rights == 1 - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.') - root = [w for w in doc if w.head is w][0] - subject = list(root.lefts)[0] - for descendant in subject.subtree: - assert subject.is_ancestor_of(descendant) - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests.') - holders = doc[4] - span = doc[holders.left_edge.i : holders.right_edge.i + 1] - span.merge() - for word in doc: - print(word.text, word.pos_, word.dep_, word.head.text) - # Credit and mortgage account holders nsubj NOUN submit - # must VERB aux submit - # submit VERB ROOT submit - # their DET det requests - # requests NOUN dobj submit ++code. + doc = nlp(u'bright red apples on the tree') + assert [token.text for token in doc[2].lefts]) == [u'bright', u'red'] + assert [token.text for token in doc[2].rights]) == ['on'] + assert doc[2].n_lefts == 2 + assert doc[2].n_rights == 1 p | You can get a whole phrase by its syntactic head using the | #[code .subtree] attribute. This returns an ordered sequence of tokens. - | For the default English model, the parse tree is #[em projective], which - | means that there are no crossing brackets. The tokens returned by - | #[code .subtree] are therefore guaranteed to be contiguous. This is not - | true for the German model, which has many - | #[+a("https://explosion.ai/blog/german-model#word-order", true) non-projective dependencies]. | You can walk up the tree with the #[code .ancestors] attribute, and - | check dominance with the #[code .is_ancestor()] method. + | check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] + | method. + ++aside("Projective vs. non-projective") + | For the #[+a("/docs/usage/models#available") default English model], the + | parse tree is #[strong projective], which means that there are no crossing + | brackets. The tokens returned by #[code .subtree] are therefore guaranteed + | to be contiguous. This is not true for the German model, which has many + | #[+a(COMPANY_URL + "/blog/german-model#word-order", true) non-projective dependencies]. + ++code. 
+ doc = nlp(u'Credit and mortgage account holders must submit their requests') + root = [token for token in doc if token.head is token][0] + subject = list(root.lefts)[0] + for descendant in subject.subtree: + assert subject.is_ancestor(descendant) + print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, + [ancestor.text for ancestor in descendant.ancestors]) + ++table(["Text", "Dep", "n_lefts", "n_rights", "ancestors"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Credit", "nmod", 0, 2, "holders, submit"], style) + +annotation-row(["and", "cc", 0, 0, "Credit, holders, submit"], style) + +annotation-row(["mortgage", "compound", 0, 0, "account, Credit, holders, submit"], style) + +annotation-row(["account", "conj", 1, 0, "Credit, holders, submit"], style) + +annotation-row(["holders", "nsubj", 1, 0, "submit"], style) p - | Finally, I often find the #[code .left_edge] and #[code right_edge] - | attributes especially useful. They give you the first and last token + | Finally, the #[code .left_edge] and #[code .right_edge] attributes + | can be especially useful, because they give you the first and last token | of the subtree. This is the easiest way to create a #[code Span] object - | for a syntactic phrase β€” a useful operation. + | for a syntactic phrase. Note that #[code .right_edge] gives a token + | #[strong within] the subtree β€” so if you use it as the end-point of a + | range, don't forget to #[code +1]! + ++code. + doc = nlp(u'Credit and mortgage account holders must submit their requests') + span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] + span.merge() + for token in doc: + print(token.text, token.pos_, token.dep_, token.head.text) + ++table(["Text", "POS", "Dep", "Head text"]) + - var style = [0, 1, 1, 0] + +annotation-row(["Credit and mortgage account holders", "NOUN", "nsubj", "submit"], style) + +annotation-row(["must", "VERB", "aux", "submit"], style) + +annotation-row(["submit", "VERB", "ROOT", "submit"], style) + +annotation-row(["their", "ADJ", "poss", "requests"], style) + +annotation-row(["requests", "NOUN", "dobj", "submit"], style) + ++h(2, "displacy") Visualizing dependencies p - | Note that #[code .right_edge] gives a token #[em within] the subtree β€” - | so if you use it as the end-point of a range, don't forget to #[code +1]! + | The best way to understand spaCy's dependency parser is interactively. + | To make this easier, spaCy v2.0+ comes with a visualization module. Simply + | pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. If you want to know how to write rules that + | hook into some type of syntactic construction, just plug the sentence into + | the visualizer and see how spaCy annotates it. + ++code. + from spacy import displacy + + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + displacy.serve(doc, style='dep') + ++infobox + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You + | can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo]. +h(2, "disabling") Disabling the parser @@ -149,8 +210,6 @@ p | the parser from being loaded: +code. 
- import spacy - nlp = spacy.load('en', parser=False) p From 7ef7f0b42c98e395f9899bce5f0aef19b2ac1a17 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:37:51 +0200 Subject: [PATCH 24/51] Add linguistic annotations 101 content --- website/docs/usage/spacy-101.jade | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 06f88ace2..2507b9d94 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,6 +2,54 @@ include ../../_includes/_mixins ++h(2, "annotations") Linguistic annotations + +p + | spaCy provides a variety of linguistic annotations to give you insights + | into a text's grammatical structure. This includes the word types, + | i.e. the parts of speech, and how the words are related to each other. + | For example, if you're analysing text, it makes a #[em huge] difference + | whether a noun is the subject of a sentence, or the object – or whether + | "google" is used as a verb, or refers to the website or company in a + | specific context. + +p + | Once you've downloaded and installed a #[+a("/docs/usage/models") model], + | you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will + | return a #[code Language] object contaning all components and data needed + | to process text. We usually call it #[code nlp]. Calling the #[code nlp] + | object on a string of text will return a processed #[code Doc]: + ++code. + import spacy + + nlp = spacy.load('en') + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + ++h(3, "annotations-token") Tokenization + +include _spacy-101/_tokenization + + ++h(3, "annotations-pos-deps") Part-of-speech tags and dependencies + +tag-model("dependency parse") + +include _spacy-101/_pos-deps + ++h(3, "annotations-ner") Named Entities + +tag-model("named entities") + +include _spacy-101/_named-entities + ++h(2, "vectors-similarity") Word vectors and similarity + +tag-model("vectors") + +include _spacy-101/_similarity + +include _spacy-101/_word-vectors + ++h(2, "pipelines") Pipelines + +h(2, "architecture") Architecture +image From e6d88dfe08a34aeef61c27c726a0b269257a2f0b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:38:33 +0200 Subject: [PATCH 25/51] Add features table to 101 --- website/docs/usage/spacy-101.jade | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 2507b9d94..4fb758bb4 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,6 +2,61 @@ include ../../_includes/_mixins ++h(2, "features") Features + ++aside + | If one of spaCy's functionalities #[strong needs a model], it means that + | you need to have one our the available + | #[+a("/docs/usage/models") statistical models] installed. Models are used + | to #[strong predict] linguistic annotations – for example, if a word is + | a verb or a noun. 
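A minimal sketch of the distinction drawn in this aside and in the features table below, between functionality that only needs language data (tokenization) and functionality that needs a trained statistical model (part-of-speech prediction). It assumes the English model used elsewhere in these docs is installed:

    import spacy
    from spacy.lang.en import English

    # tokenization only needs the language data, no statistical model
    nlp_blank = English()
    doc = nlp_blank(u'Apple is looking at buying U.K. startup')
    print([token.text for token in doc])

    # part-of-speech tags are *predicted*, so a trained model is required
    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup')
    print([(token.text, token.pos_) for token in doc])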
+ ++table(["Name", "Description", "Needs model"]) + +row + +cell #[strong Tokenization] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Part-of-speech Tagging] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Dependency Parsing] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Sentence Boundary Detection] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Named Entity Recongition] (NER) + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Rule-based Matching] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Similarity] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Training] + +cell + +cell #[+procon("neutral")] + + +row + +cell #[strong Serialization] + +cell + +cell #[+procon("neutral")] + +h(2, "annotations") Linguistic annotations p From 4fb5fb7218dc81b78b0aa737d52bfba9b16b4297 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:40:04 +0200 Subject: [PATCH 26/51] Update v2 docs --- website/docs/usage/v2.jade | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index d3941bba0..4a0e6ca2f 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -242,6 +242,79 @@ p +cell #[code Token.is_ancestor_of] +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]] ++h(2, "migrating") Migrating from spaCy 1.x ++list + +item Saving, loading and serialization. + +item Processing pipelines and language data. + +item Adding patterns and callbacks to the matcher. + +item Models trained with spaCy 1.x. + ++infobox("Some tips") + | Before migrating, we strongly recommend writing a few + | #[strong simple tests] specific to how you're using spaCy in your + | application. This makes it easier to check whether your code requires + | changes, and if so, which parts are affected. + | (By the way, feel free contribute your tests to + | #[+src(gh("spaCy", "spacy/tests")) our test suite] – this will also ensure + | we never accidentally introduce a bug in a workflow that's + | important to you.) If you've trained your own models, keep in mind that + | your train and runtime inputs must match. This means you'll have to + | #[strong retrain your models] with spaCy v2.0 to make them compatible. + + ++h(3, "migrating-saving-loading") Saving, loading and serialization +h(2, "migrating") Migrating from spaCy 1.x +p + | Double-check all calls to #[code spacy.load()] and make sure they don't + | use the #[code path] keyword argument. + ++code-new nlp = spacy.load('/model') ++code-old nlp = spacy.load('en', path='/model') + +p + | Review all other code that writes state to disk or bytes. + | All containers, now share the same, consistent API for saving and + | loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and + | loading with #[code from_disk()] and #[code from_bytes()]. + ++code-new. + nlp.to_disk('/model') + nlp.vocab.to_disk('/vocab') + ++code-old. + nlp.save_to_directory('/model') + nlp.vocab.dump('/vocab') + ++h(3, "migrating-languages") Processing pipelines and language data + +p + | If you're importing language data or #[code Language] classes, make sure + | to change your import statements to import from #[code spacy.lang]. If + | you've added your own custom language, it needs to be moved to + | #[code spacy/lang/xx]. + ++code-new from spacy.lang.en import English ++code-old from spacy.en import English + +p + | All components, e.g. 
tokenizer exceptions, are now responsible for + | compiling their data in the correct format. The language_data.py files + | have been removed + ++h(3, "migrating-matcher") Adding patterns and callbacks to the matcher + +p + | If you're using the matcher, you can now add patterns in one step. This + | should be easy to update – simply merge the ID, callback and patterns + | into one call to #[+api("matcher#add") #[code matcher.add]]. + ++code-new. + matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++code-old. + matcher.add_entity('GoogleNow', on_match=merge_phrases) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++h(3, "migrating-models") Trained models From 697d3d7cb3e18c219d1bad037bcccf6dbea35fe3 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:36:38 +0200 Subject: [PATCH 27/51] Fix links to CLI docs --- website/docs/api/util.jade | 2 +- website/docs/usage/adding-languages.jade | 6 +++--- website/docs/usage/saving-loading.jade | 2 +- website/docs/usage/training-ner.jade | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index ed8b5d8e5..f14cdbb6d 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -225,7 +225,7 @@ p p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+a("/docs/usage/cli") CLI]. + | be used for interactive components like the #[+a("/docs/api/cli") CLI]. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index f77acdf24..7eadde4b6 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -535,7 +535,7 @@ p | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | script from the spaCy developer resources. Note that your corpus should | not be preprocessed (i.e. you need punctuation for example). The - | #[+a("/docs/usage/cli#model") #[code model] command] expects a + | #[+a("/docs/api/cli#model") #[code model]] command expects a | tab-separated word frequencies file with three columns: +list("numbers") @@ -651,13 +651,13 @@ p | If your corpus uses the | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | i.e. files with the extension #[code .conllu], you can use the - | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to + | #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. p | Once you have your UD corpus transformed into JSON, you can train your | model use the using spaCy's - | #[+a("/docs/usage/cli#train") #[code train] command]: + | #[+a("/docs/api/cli#train") #[code train]] command: +code(false, "bash"). python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index b11007683..3513e9505 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. 
For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/api/cli#package") #[code package]] command documentation. +aside-code("meta.json", "json"). { diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..4d864ac9d 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -77,8 +77,8 @@ p p | To make the model more convenient to deploy, we recommend wrapping it as | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] - | to create all required files and directories. + | module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] + | CLI command to create all required files and directories. +code(false, "bash"). python -m spacy package /home/me/data/en_technology /home/me/my_models From 990a70732a280f87dacd86c83d8cefbbe1e70a4b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:37:21 +0200 Subject: [PATCH 28/51] Move installation troubleshooting to installation docs --- website/docs/usage/index.jade | 130 ++++++++++++++++ website/docs/usage/models.jade | 2 +- website/docs/usage/troubleshooting.jade | 190 ------------------------ 3 files changed, 131 insertions(+), 191 deletions(-) delete mode 100644 website/docs/usage/troubleshooting.jade diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index da13f4d81..61398b431 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -175,6 +175,136 @@ p +cell Python 3.5+ +cell Visual Studio 2015 ++h(2, "troubleshooting") Troubleshooting guide + +p + | This section collects some of the most common errors you may come + | across when installing, loading and using spaCy, as well as their solutions. + ++aside("Help us improve this guide") + | Did you come across a problem like the ones listed here and want to + | share the solution? You can find the "Suggest edits" button at the + | bottom of this page that points you to the source. We always + | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! + ++h(3, "compatible-model") No compatible model found + ++code(false, "text"). + No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). + +p + | This usually means that the model you're trying to download does not + | exist, or isn't available for your version of spaCy. Check the + | #[+a(gh("spacy-models", "compatibility.json")) compatibility table] + | to see which models are available for your spaCy version. If you're using + | an old version, consider upgrading to the latest release. Note that while + | spaCy supports tokenization for + | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], + | not all of them come with statistical models. To only use the tokenizer, + | import the language's #[code Language] class instead, for example + | #[code from spacy.fr import French]. + ++h(3, "symlink-privilege") Symbolic link privilege not held + ++code(false, "text"). + OSError: symbolic link privilege not held + +p + | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you + | load models by name, spaCy creates a symbolic link in the + | #[code spacy/data] directory. This means your user needs permission to do + | this. The above error mostly occurs when doing a system-wide installation, + | which will create the symlinks in a system directory. 
Run the + | #[code download] or #[code link] command as administrator, or use a + | #[code virtualenv] to install spaCy in a user directory, instead + | of doing a system-wide installation. + ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to + | the latest version of pip. To see which version you have installed, + | run #[code pip --version]. + ++h(3, "import-error") Import error + ++code(false, "text"). + Import Error: No module named spacy + +p + | This error means that the spaCy module can't be located on your system, or in + | your environment. Make sure you have spaCy installed. If you're using a + | #[code virtualenv], make sure it's activated and check that spaCy is + | installed in that environment – otherwise, you're trying to load a system + | installation. You can also run #[code which python] to find out where + | your Python executable is located. + ++h(3, "import-error-models") Import error: models + ++code(false, "text"). + ImportError: No module named 'en_core_web_sm' + +p + | As of spaCy v1.7, all models can be installed as Python packages. This means + | that they'll become importable modules of your application. When creating + | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try + | to import the model to load its meta data. If this fails, it's usually a + | sign that the package is not installed in the current environment. + | Run #[code pip list] or #[code pip freeze] to check which model packages + | you have installed, and install the + | #[+a("/docs/usage/models#available") correct models] if necessary. If you're + | importing a model manually at the top of a file, make sure to use the name + | of the package, not the shortcut link you've created. + ++h(3, "vocab-strings") File not found: vocab/strings.json + ++code(false, "text"). + FileNotFoundError: No such file or directory: [...]/vocab/strings.json + +p + | This error may occur when using #[code spacy.load()] to load + | a language model – either because you haven't set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it + | doesn't actually exist. Set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for the model + | you want to load. This can either be an installed model package, or a + | local directory containing the model data. If you want to use one of the + | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for + | languages that don't yet have a statistical model, you should import its + | #[code Language] class instead, for example + | #[code from spacy.lang.bn import Bengali]. + ++h(3, "command-not-found") Command not found + ++code(false, "text"). + command not found: spacy + +p + | This error may occur when running the #[code spacy] command from the + | command line. spaCy does not currently add an entry to our #[code PATH] + | environment variable, as this can lead to unexpected results, especially + | when using #[code virtualenv]. Run the command with #[code python -m], + | for example #[code python -m spacy download en]. For more info on this, + | see the #[+a("/docs/api/cli#download") CLI documentation]. 
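The "Import error: models" section above notes that installed model packages can also be imported directly, instead of going through a shortcut link. A minimal sketch, assuming the en_core_web_sm package has been installed via pip:

    import en_core_web_sm

    nlp = en_core_web_sm.load()   # the package's load() helper returns the nlp object
    doc = nlp(u'This is a sentence.')
    print([(token.text, token.pos_) for token in doc])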
+ ++h(3, "module-load") 'module' object has no attribute 'load' + ++code(false, "text"). + AttributeError: 'module' object has no attribute 'load' + +p + | While this could technically have many causes, including spaCy being + | broken, the most likely one is that your script's file or directory name + | is "shadowing" the module – e.g. your file is called #[code spacy.py], + | or a directory you're importing from is called #[code spacy]. So, when + | using spaCy, never call anything else #[code spacy]. + +h(2, "tests") Run tests p diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2dec5197e..832ad8211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -195,7 +195,7 @@ p | privileges, the #[code spacy link] command may fail. The easiest solution | is to re-run the command as admin, or use a #[code virtualenv]. For more | info on this, see the - | #[+a("/docs/usage/troubleshooting#symlink-privilege") troubleshooting guide]. + | #[+a("/docs/usage/#symlink-privilege") troubleshooting guide]. +h(3, "usage-import") Importing models as modules diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade deleted file mode 100644 index 501a250c8..000000000 --- a/website/docs/usage/troubleshooting.jade +++ /dev/null @@ -1,190 +0,0 @@ -//- πŸ’« DOCS > USAGE > TROUBLESHOOTING - -include ../../_includes/_mixins - -p - | This section collects some of the most common errors you may come - | across when installing, loading and using spaCy, as well as their solutions. - -+aside("Help us improve this guide") - | Did you come across a problem like the ones listed here and want to - | share the solution? You can find the "Suggest edits" button at the - | bottom of this page that points you to the source. We always - | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! - -+h(2, "install-loading") Installation and loading - -+h(3, "compatible-model") No compatible model found - -+code(false, "text"). - No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). - -p - | This usually means that the model you're trying to download does not - | exist, or isn't available for your version of spaCy. - -+infobox("Solutions") - | Check the #[+a(gh("spacy-models", "compatibility.json")) compatibility table] - | to see which models are available for your spaCy version. If you're using - | an old version, consider upgrading to the latest release. Note that while - | spaCy supports tokenization for - | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], - | not all of them come with statistical models. To only use the tokenizer, - | import the language's #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "symlink-privilege") Symbolic link privilege not held - -+code(false, "text"). - OSError: symbolic link privilege not held - -p - | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you - | load models by name, spaCy creates a symbolic link in the - | #[code spacy/data] directory. This means your user needs permission to do - | this. The above error mostly occurs when doing a system-wide installation, - | which will create the symlinks in a system directory. - -+infobox("Solutions") - | Run the #[code download] or #[code link] command as administrator, - | or use a #[code virtualenv] to install spaCy in a user directory, instead - | of doing a system-wide installation. 
- -+h(3, "no-cache-dir") No such option: --no-cache-dir - -+code(false, "text"). - no such option: --no-cache-dir - -p - | The #[code download] command uses pip to install the models and sets the - | #[code --no-cache-dir] flag to prevent it from requiring too much memory. - | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] - | requires pip v6.0 or newer. - -+infobox("Solution") - | Run #[code pip install -U pip] to upgrade to the latest version of pip. - | To see which version you have installed, run #[code pip --version]. - -+h(3, "import-error") Import error - -+code(false, "text"). - Import Error: No module named spacy - -p - | This error means that the spaCy module can't be located on your system, or in - | your environment. - -+infobox("Solutions") - | Make sure you have spaCy installed. If you're using a #[code virtualenv], - | make sure it's activated and check that spaCy is installed in that - | environment – otherwise, you're trying to load a system installation. You - | can also run #[code which python] to find out where your Python - | executable is located. - -+h(3, "import-error-models") Import error: models - -+code(false, "text"). - ImportError: No module named 'en_core_web_sm' - -p - | As of spaCy v1.7, all models can be installed as Python packages. This means - | that they'll become importable modules of your application. When creating - | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try - | to import the model to load its meta data. If this fails, it's usually a - | sign that the package is not installed in the current environment. - -+infobox("Solutions") - | Run #[code pip list] or #[code pip freeze] to check which model packages - | you have installed, and install the - | #[+a("/docs/usage/models#available") correct models] if necessary. If you're - | importing a model manually at the top of a file, make sure to use the name - | of the package, not the shortcut link you've created. - -+h(3, "vocab-strings") File not found: vocab/strings.json - -+code(false, "text"). - FileNotFoundError: No such file or directory: [...]/vocab/strings.json - -p - | This error may occur when using #[code spacy.load()] to load - | a language model – either because you haven't set up a - | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it - | doesn't actually exist. - -+infobox("Solutions") - | Set up a #[+a("/docs/usage/models/#usage") shortcut link] for the model - | you want to load. This can either be an installed model package, or a - | local directory containing the model data. If you want to use one of the - | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for - | languages that don't yet have a statistical model, you should import its - | #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "command-not-found") Command not found - -+code(false, "text"). - command not found: spacy - -p - | This error may occur when running the #[code spacy] command from the - | command line. spaCy does not currently add an entry to our #[code PATH] - | environment variable, as this can lead to unexpected results, especially - | when using #[code virtualenv]. Instead, commands need to be prefixed with - | #[code python -m]. - -+infobox("Solution") - | Run the command with #[code python -m], for example - | #[code python -m spacy download en]. For more info on this, see the - | #[+a("/docs/usage/cli") CLI documentation]. 
- -+h(3, "module-load") 'module' object has no attribute 'load' - -+code(false, "text"). - AttributeError: 'module' object has no attribute 'load' - -p - | While this could technically have many causes, including spaCy being - | broken, the most likely one is that your script's file or directory name - | is "shadowing" the module – e.g. your file is called #[code spacy.py], - | or a directory you're importing from is called #[code spacy]. - -+infobox("Solution") - | When using spaCy, never call anything else #[code spacy]. - -+h(2, "usage") Using spaCy - -+h(3, "pos-lemma-number") POS tag or lemma is returned as number - -+code. - doc = nlp(u'This is text.') - print([word.pos for word in doc]) - # [88, 98, 90, 95] - -p - | Like many NLP libraries, spaCy encodes all strings to integers. This - | reduces memory usage and improves efficiency. The integer mapping also - | makes it easy to interoperate with numpy. To access the string - | representation instead of the integer ID, add an underscore #[code _] - | after the attribute. - -+infobox("Solutions") - | Use #[code pos_] or #[code lemma_] instead. See the - | #[+api("token#attributes") #[code Token] attributes] for a list of available - | attributes and their string representations. - - -+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-] - -+code. - doc = nlp(u'They are') - print(doc[0].lemma_) - # -PRON- - -p - | This is in fact expected behaviour and not a bug. - | Unlike verbs and common nouns, there's no clear base form of a personal - | pronoun. Should the lemma of "me" be "I", or should we normalize person - | as well, giving "it" β€” or maybe "he"? spaCy's solution is to introduce a - | novel symbol, #[code -PRON-], which is used as the lemma for - | all personal pronouns. For more info on this, see the - | #[+api("annotation#lemmatization") annotation specs] on lemmatization. From 10afb3c796cb9739bd969294a7ed973b4e519164 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:37:47 +0200 Subject: [PATCH 29/51] Tidy up and merge usage pages --- website/docs/api/philosophy.jade | 14 --- website/docs/usage/_data.json | 91 ++++++++----------- website/docs/usage/adding-languages.jade | 3 + website/docs/usage/customizing-tokenizer.jade | 90 ++++++++++++------ .../usage/language-processing-pipeline.jade | 37 ++++++++ 5 files changed, 140 insertions(+), 95 deletions(-) delete mode 100644 website/docs/api/philosophy.jade diff --git a/website/docs/api/philosophy.jade b/website/docs/api/philosophy.jade deleted file mode 100644 index eda911045..000000000 --- a/website/docs/api/philosophy.jade +++ /dev/null @@ -1,14 +0,0 @@ -//- πŸ’« DOCS > API > PHILOSOPHY - -include ../../_includes/_mixins - -p Every product needs to know why it exists. Here's what we're trying to with spaCy and why it's different from other NLP libraries. - -+h(2) 1. No job too big. -p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us β€” so if we want to read the web, we have to sweat performance. - -+h(2) 2. Take a stand. -p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. 
Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component. - -+h(2) 3. Stay current. -p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 45daa8381..f903c7c1e 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -5,26 +5,23 @@ "Models": "models", "spaCy 101": "spacy-101", "Lightning tour": "lightning-tour", - "Visualizers": "visualizers", - "Troubleshooting": "troubleshooting", "What's new in v2.0": "v2" }, "Workflows": { - "Loading the pipeline": "language-processing-pipeline", - "Processing text": "processing-text", - "spaCy's data model": "data-model", "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", - "Custom pipelines": "customizing-pipeline", - "Rule-based matching": "rule-based-matching", "Word vectors": "word-vectors-similarities", - "Deep learning": "deep-learning", "Custom tokenization": "customizing-tokenizer", + "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", + "Processing text": "processing-text", + "NLP pipelines": "language-processing-pipeline", + "Deep learning": "deep-learning", "Training": "training", "Training NER": "training-ner", - "Saving & loading": "saving-loading" + "Saving & loading": "saving-loading", + "Visualizers": "visualizers" }, "Examples": { "Tutorials": "tutorials", @@ -38,10 +35,6 @@ "quickstart": true }, - "v2": { - "title": "What's new in v2.0" - }, - "models": { "title": "Models", "next": "spacy-101", @@ -67,27 +60,13 @@ "next": "resources" }, - "resources": { - "title": "Resources" + "v2": { + "title": "What's new in v2.0" }, - "language-processing-pipeline": { - "title": "Loading a language processing pipeline", - "next": "processing-text" - }, - - "customizing-pipeline": { - "title": "Customizing the pipeline", - "next": "customizing-tokenizer" - }, - - "processing-text": { - "title": "Processing text", - "next": "data-model" - }, - - "data-model": { - "title": "Understanding spaCy's data model" + "pos-tagging": { + "title": "Part-of-speech tagging", + "next": "dependency-parse" }, "dependency-parse": { @@ -97,26 +76,44 @@ "entity-recognition": { "title": "Named Entity Recognition", - "next": "rule-based-matching" - }, - - "rule-based-matching": { - "title": "Rule-based matching" + "next": "training-ner" }, "word-vectors-similarities": { - "title": "Using word vectors and semantic similarities" - }, - - "deep-learning": { - "title": "Hooking a deep learning model into spaCy" + "title": "Using word vectors and semantic similarities", + "next": "customizing-tokenizer" }, "customizing-tokenizer": { "title": "Customizing the tokenizer", + "next": "rule-based-matching" + }, + + "rule-based-matching": { + "title": "Rule-based matching", "next": "adding-languages" }, + "adding-languages": { + "title": "Adding languages", + "next": "training" + }, + + "processing-text": { + "title": "Processing text", + "next": "language-processing-pipeline" + }, + + "language-processing-pipeline": { + "title": 
"Natural language processing pipelines", + "next": "deep-learning" + }, + + "deep-learning": { + "title": "Hooking a deep learning model into spaCy", + "next": "training" + }, + "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" @@ -131,16 +128,6 @@ "title": "Saving and loading models" }, - "pos-tagging": { - "title": "Part-of-speech tagging", - "next": "dependency-parse" - }, - - "adding-languages": { - "title": "Adding languages", - "next": "training" - }, - "showcase": { "title": "Showcase", diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 7eadde4b6..f3648b885 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -104,6 +104,9 @@ p +image include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic + +table(["File name", "Variables", "Description"]) +row diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index d43fb438f..5871e1655 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -11,18 +11,56 @@ p | #[code spaces] booleans, which allow you to maintain alignment of the | tokens into the original string. -+aside("See Also") - | If you haven't read up on spaCy's #[+a("data-model") data model] yet, - | you should probably have a look. The main point to keep in mind is that - | spaCy's #[code Doc] doesn't copy or refer to the original string. The - | string is reconstructed from the tokens when required. ++aside("spaCy's data model") + | The main point to keep in mind is that spaCy's #[code Doc] doesn't + | copy or refer to the original string. The string is reconstructed from + | the tokens when required. ++h(2, "101") Tokenizer 101 + +include _spacy-101/_tokenization + + ++h(3, "101-data") Tokenizer data + +p + | #[strong Global] and #[strong language-specific] tokenizer data is + | supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang]. + | The tokenizer exceptions define special cases like "don't" in English, + | which needs to be split into two tokens: #[code {ORTH: "do"}] and + | #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes + | mosty define punctuation rules – for example, when to split off periods + | (at the end of a sentence), and when to leave token containing periods + | intact (abbreviations like "U.S."). + ++image + include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic + ++infobox + | For more details on the language-specific data, see the + | usage workflow on #[+a("/docs/usage/adding-languages") adding languages]. +h(2, "special-cases") Adding special case tokenization rules p | Most domains have at least some idiosyncracies that require custom - | tokenization rules. Here's how to add a special case rule to an existing + | tokenization rules. This could be very certain expressions, or + | abbreviations only used in this specific field. + ++aside("Language data vs. custom tokenization") + | Tokenization rules that are specific to one language, but can be + | #[strong generalised across that language] should ideally live in the + | language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we + | always appreciate pull requests! 
Anything that's specific to a domain or + | text type – like financial trading abbreviations, or Bavarian youth slang + | – should be added as a special case rule to your tokenizer instance. If + | you're dealing with a lot of customisations, it might make sense to create + | an entirely custom subclass. + +p + | Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: +code. @@ -30,15 +68,12 @@ p from spacy.symbols import ORTH, LEMMA, POS nlp = spacy.load('en') - assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] - nlp.tokenizer.add_special_case(u'gimme', - [ - { - ORTH: u'gim', - LEMMA: u'give', - POS: u'VERB'}, - { - ORTH: u'me'}]) + doc = nlp(u'gimme that') # phrase to tokenize + assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization + + # add special case rule + special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}] + nlp.tokenizer.add_special_case(u'gimme', special_case) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] @@ -55,9 +90,8 @@ p | The special case rules have precedence over the punctuation splitting: +code. - nlp.tokenizer.add_special_case(u'...gimme...?', - [{ - ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) + special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}] + nlp.tokenizer.add_special_case(u'...gimme...?', special_case) assert len(nlp(u'...gimme...?')) == 1 p @@ -137,8 +171,8 @@ p +h(2, "native-tokenizers") Customizing spaCy's Tokenizer class p - | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | Let's imagine you wanted to create a tokenizer for a new language or + | specific domain. There are four things you would need to define: +list("numbers") +item @@ -170,14 +204,14 @@ p import re from spacy.tokenizer import Tokenizer - prefix_re = re.compile(r'''[\[\("']''') - suffix_re = re.compile(r'''[\]\)"']''') - def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search) + prefix_re = re.compile(r'''[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']''') - nlp = spacy.load('en', tokenizer=create_make_doc) + def create_tokenizer(nlp): + return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, + suffix_search=suffix_re.search) + + nlp = spacy.load('en', tokenizer=create_tokenizer) p | If you need to subclass the tokenizer instead, the relevant methods to @@ -191,8 +225,6 @@ p | you're creating the pipeline: +code. - import spacy - nlp = spacy.load('en', make_doc=my_tokenizer) p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index c372dfbf4..0ea2609d2 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -126,3 +126,40 @@ p +row +cell #[code matcher] +cell Supply a pre-built matcher, instead of creating one. + ++h(2, "customizing") Customizing the pipeline + +p + | spaCy provides several linguistic annotation functions by default. Each + | function takes a Doc object, and modifies it in-place. The default + | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 + | introduced the ability to customise this pipeline with arbitrary + | functions. + ++code. 
+ def arbitrary_fixup_rules(doc): + for token in doc: + if token.text == u'bill' and token.tag_ == u'NNP': + token.tag_ = u'NN' + + def custom_pipeline(nlp): + return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) + + nlp = spacy.load('en', create_pipeline=custom_pipeline) + +p + | The easiest way to customise the pipeline is to pass a + | #[code create_pipeline] callback to the #[code spacy.load()] function. + +p + | The callback you pass to #[code create_pipeline] should take a single + | argument, and return a sequence of callables. Each callable in the + | sequence should accept a #[code Doc] object and modify it in place. + +p + | Instead of passing a callback, you can also write to the + | #[code .pipeline] attribute directly. + ++code. + nlp = spacy.load('en') + nlp.pipeline = [nlp.tagger] From 66088851dcd4fe72056c0d7534d80e28400aad15 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 11:58:17 +0200 Subject: [PATCH 30/51] Add Doc.to_disk() and Doc.from_disk() methods --- spacy/tokens/doc.pyx | 18 ++++++++++++++++++ website/docs/api/doc.jade | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0e4faafbe..611a68186 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -598,6 +598,24 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + def to_disk(self, path): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. Paths may be either strings or `Path`-like objects. + """ + raise NotImplementedError() + + def from_disk(self, path): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. Paths may be either + strings or `Path`-like objects. + RETURNS (Doc): The modified `Doc` object. + """ + raise NotImplementedError() + def to_bytes(self): """Serialize, i.e. export the document contents to a binary string. diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 6a9faf4b4..62b1a2a76 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -253,6 +253,44 @@ p +cell #[code Doc] +cell Itself. ++h(2, "to_disk") Doc.to_disk + +tag method + +p Save the current state to a directory. + ++aside-code("Example"). + doc.to_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") Doc.from_disk + +tag method + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). + from spacy.tokens import Doc + doc = Doc().from_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +footrow + +cell returns + +cell #[code Doc] + +cell The modified #[code Doc] object. 
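A minimal sketch of the round trip intended for the new Doc.to_disk and Doc.from_disk methods documented above. Note that in this patch the methods are still stubs that raise NotImplementedError in doc.pyx, so this only illustrates the documented call pattern, and the path is a placeholder:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.load('en')
    doc = nlp(u'Give it back! He pleaded.')
    doc.to_disk('/path/to/doc')                          # placeholder path
    new_doc = Doc(nlp.vocab).from_disk('/path/to/doc')   # returns the modified Doc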
+
 +h(2, "to_bytes") Doc.to_bytes
     +tag method
 
From 8b86b08bedf8143dad696bc6077f4c10a12782b9 Mon Sep 17 00:00:00 2001
From: ines
Date: Wed, 24 May 2017 11:59:08 +0200
Subject: [PATCH 31/51] Update usage workflows

---
 website/docs/api/util.jade                   |  2 +-
 website/docs/usage/_data.json                |  2 +-
 website/docs/usage/adding-languages.jade     | 11 ++-
 website/docs/usage/customizing-pipeline.jade | 38 -----------
 website/docs/usage/index.jade                |  2 +-
 website/docs/usage/processing-text.jade      |  9 ++-
 website/docs/usage/saving-loading.jade       | 70 +++++++++++---------
 website/docs/usage/training-ner.jade         |  2 +-
 8 files changed, 55 insertions(+), 81 deletions(-)
 delete mode 100644 website/docs/usage/customizing-pipeline.jade

diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade
index f14cdbb6d..bf81a4f61 100644
--- a/website/docs/api/util.jade
+++ b/website/docs/api/util.jade
@@ -225,7 +225,7 @@ p
 p
   | Print a formatted, text-wrapped message with optional title. If a text
   | argument is a #[code Path], it's converted to a string. Should only
-  | be used for interactive components like the #[+a("/docs/api/cli") CLI].
+  | be used for interactive components like the #[+api("cli") cli].
 
 +aside-code("Example").
     data_path = Path('/some/path')
diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json
index f903c7c1e..acd973aa1 100644
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@@ -125,7 +125,7 @@
     },
 
     "saving-loading": {
-        "title": "Saving and loading models"
+        "title": "Saving, loading and data serialization"
     },
 
     "showcase": {
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index f3648b885..ae04aad57 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -538,8 +538,8 @@ p
   | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
   | script from the spaCy developer resources. Note that your corpus should
   | not be preprocessed (i.e. you need punctuation for example). The
-  | #[+a("/docs/api/cli#model") #[code model]] command expects a
-  | tab-separated word frequencies file with three columns:
+  | #[+api("cli#model") #[code model]] command expects a tab-separated word
+  | frequencies file with three columns:
 
 +list("numbers")
     +item The number of times the word occurred in your language sample.
@@ -654,13 +654,12 @@ p
   | If your corpus uses the
   | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
   | i.e. files with the extension #[code .conllu], you can use the
-  | #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to
-  | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training.
+  | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
+  | #[+a("/docs/api/annotation#json-input") JSON format] for training.
 
 p
   | Once you have your UD corpus transformed into JSON, you can train your
-  | model use the using spaCy's
-  | #[+a("/docs/api/cli#train") #[code train]] command:
+  | model using spaCy's #[+api("cli#train") #[code train]] command:
 
 +code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] diff --git a/website/docs/usage/customizing-pipeline.jade b/website/docs/usage/customizing-pipeline.jade deleted file mode 100644 index a4846d02e..000000000 --- a/website/docs/usage/customizing-pipeline.jade +++ /dev/null @@ -1,38 +0,0 @@ -//- πŸ’« DOCS > USAGE > CUSTOMIZING THE PIPELINE - -include ../../_includes/_mixins - -p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. - def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. - -+code. - nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 61398b431..cb1ab5754 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -291,7 +291,7 @@ p | environment variable, as this can lead to unexpected results, especially | when using #[code virtualenv]. Run the command with #[code python -m], | for example #[code python -m spacy download en]. For more info on this, - | see the #[+a("/docs/api/cli#download") CLI documentation]. + | see #[+api("cli#download") download]. +h(3, "module-load") 'module' object has no attribute 'load' diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..2562d9fc4 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -10,14 +10,19 @@ p doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') p - | The library should perform equally well with short or long documents. + | The library should perform equally well with #[strong short or long documents]. | All algorithms are linear-time in the length of the string, and once the | data is loaded, there's no significant start-up cost to consider. This | means that you don't have to strategically merge or split your text β€” | you should feel free to feed in either single tweets or whole novels. p - | If you run #[code nlp = spacy.load('en')], the #[code nlp] object will + | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will + | load the #[+a("/docs/usage/models") model] associated with the name + | #[code 'en']. Each model is a Python package containing an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py] + +the #[code nlp] object will | be an instance of #[code spacy.en.English]. 
This means that when you run | #[code doc = nlp(text)], you're executing | #[code spacy.en.English.__call__], which is implemented on its parent diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 3513e9505..63c951d40 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,8 @@ include ../../_includes/_mixins + ++h(2, "models") Saving models + p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the @@ -14,28 +17,28 @@ p | will be written out. To make the model more convenient to deploy, we | recommend wrapping it as a Python package. -+h(2, "generating") Generating a model package ++h(3, "models-generating") Generating a model package +infobox("Important note") | The model packages are #[strong not suitable] for the public | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | designed for binary data and files over 50 MB. However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. + | is running an #[strong internal installation] of PyPi, publishing your + | models on there can be a convenient way to share them with your team. p | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a - | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/api/cli#package") #[code package]] command documentation. + | path to it using the #[code --meta] flag. For more info on this, see + | the #[+api("cli#package") #[code package]] docs. +aside-code("meta.json", "json"). { "name": "example_model", "lang": "en", "version": "1.0.0", - "spacy_version": ">=1.7.0,<2.0.0", + "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", @@ -58,7 +61,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The @@ -66,44 +69,49 @@ p | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. -+h(2, "building") Building a model package - p - | To build the package, run the following command from within the + | To #[strong build the package], run the following command from within the | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. + | #[code /dist]. For more information on building Python packages, see the + | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + +code(false, "bash"). python setup.py sdist -p - | For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. 
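Later patches in this series describe the package template's load() entry point, which ties the meta data, the pipeline and the binary data back together. Purely as a hypothetical sketch (the real template lives in spacy-dev-resources; the names and the hard-coded meta below are assumptions, not the actual file), such an entry point could look like this:

    # hypothetical __init__.py of a generated model package - illustration only
    from pathlib import Path
    from spacy.util import get_lang_class

    def load():
        meta = {'lang': 'en', 'pipeline': ['vectorizer', 'tagger']}  # read from meta.json in practice
        cls = get_lang_class(meta['lang'])            # resolve the Language subclass, e.g. English
        nlp = cls(pipeline=meta['pipeline'])          # set up the processing pipeline
        return nlp.from_disk(Path(__file__).parent)   # load the binary data shipped with the package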
- - -+h(2, "loading") Loading a model package ++h(2, "loading") Loading a custom model package p - | Model packages can be installed by pointing pip to the model's - | #[code .tar.gz] archive: + | To load a model from a data directory, you can use + | #[+api("spacy#load") #[code spacy.load()]] with the local path: + ++code. + nlp = spacy.load('/path/to/model') + +p + | If you have generated a model package, you can also install it by + | pointing pip to the model's #[code .tar.gz] archive – this is pretty + | much exactly what spaCy's #[+api("cli#download") #[code download]] + | command does under the hood. +code(false, "bash"). pip install /path/to/en_example_model-1.0.0.tar.gz -p You'll then be able to load the model as follows: ++aside-code("Custom model names", "bash"). + # optional: assign custom name to model + python -m spacy link en_example_model my_cool_model + +p + | You'll then be able to load the model via spaCy's loader, or by importing + | it as a module. For larger code bases, we usually recommend native + | imports, as this will make it easier to integrate models with your + | existing build process, continuous integration workflow and testing + | framework. +code. + # option 1: import model as module import en_example_model nlp = en_example_model.load() -p - | To load the model via #[code spacy.load()], you can also - | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the - | package name to a custom model name of your choice: - -+code(false, "bash"). - python -m spacy link en_example_model example - -+code. - import spacy - nlp = spacy.load('example') + # option 2: use spacy.load() + nlp = spacy.load('en_example_model') diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 4d864ac9d..8b8789485 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -77,7 +77,7 @@ p p | To make the model more convenient to deploy, we recommend wrapping it as | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] + | module. spaCy comes with a handy #[+api("cli#package") #[code package]] | CLI command to create all required files and directories. +code(false, "bash"). 
From 823d22100b0335687e4ef4e9ba7734ecaa4211bb Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:21:12 +0200 Subject: [PATCH 32/51] Tidy up architecture.svg --- website/assets/img/docs/architecture.svg | 124 +++++++++++------------ 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index d62d08f88..1025fbaaf 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -3,126 +3,126 @@ .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" } .text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" } .text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" } - .text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" } + .text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" } - + Language - - + + MAKES - - + + nlp.vocab.morphology - + Vocab - - + + nlp.vocab - + StringStore - - + + nlp.vocab.strings - - + + nlp.tokenizer.vocab - + Tokenizer - - + + nlp.make_doc() - - + + nlp.pipeline - - + + nlp.pipeline[i].vocab - + pt - + en - + de - + fr - + es - + it - + nl - + sv - + fi - + nb - + hu - + he - + bn - + ja - + zh - - - - + + + + doc.vocab - - + + MAKES - + Doc - - + + MAKES - - + + token.doc - + Token - + Span - - + + lexeme.vocab - + Lexeme - - + + MAKES - - + + span.doc - + Dependency Parser - + Entity Recognizer - + Tagger - + Matcher - + Lemmatizer - + Morphology From b546bcb05f0b47fb2ff40906123525c5193813a1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:21:18 +0200 Subject: [PATCH 33/51] Add pipeline illustration --- website/assets/img/docs/pipeline.svg | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 website/assets/img/docs/pipeline.svg diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg new file mode 100644 index 000000000..ddd1171ef --- /dev/null +++ b/website/assets/img/docs/pipeline.svg @@ -0,0 +1,30 @@ + + + + + Doc + + + + Text + + + + nlp + + tokenizer + + vectorizer + + + + tagger + + parser + + ner + From 54885b5e8812b0e400934d06ace8cede8657fea6 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:24:40 +0200 Subject: [PATCH 34/51] Add serialization 101 --- .../docs/usage/_spacy-101/_serialization.jade | 35 +++++++++++++++++++ website/docs/usage/saving-loading.jade | 10 ++++++ website/docs/usage/spacy-101.jade | 4 +++ 3 files changed, 49 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_serialization.jade diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade new file mode 100644 index 000000000..b6a889014 --- /dev/null +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -0,0 +1,35 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > SERIALIZATION + +p + | If you've been modifying the pipeline, vocabulary vectors and entities, or made + | updates to the model, you'll eventually want + | to #[strong save your progress] – for example, everything that's in your #[code nlp] + | object. This means you'll have to translate its contents and structure + | into a format that can be saved, like a file or a byte string. This + | process is called serialization. spaCy comes with + | #[strong built-in serialization methods] and supports the + | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol]. + ++aside("What's pickle?") + | Pickle is Python's built-in object persistance system. 
It lets you + | transfer arbitrary Python objects between processes. This is usually used + | to load an object to and from disk, but it's also used for distributed + | computing, e.g. with + | #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark] + | or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an + | object, you're agreeing to execute whatever code it contains. It's like + | calling #[code eval()] on a string – so don't unpickle objects from + | untrusted sources. + +p + | All container classes and pipeline components, i.e. + for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"] + | #[+api(cls.toLowerCase()) #[code=cls]], + | have the following methods available: + ++table(["Method", "Returns", "Example"]) + - style = [1, 0, 1] + +annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style) + +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) + +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) + +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 63c951d40..e580bca25 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,15 @@ include ../../_includes/_mixins ++h(2, "101") Serialization 101 + +include _spacy-101/_serialization + ++infobox("Important note") + | In spaCy v2.0, the API for saving and loading has changed to only use the + | four methods listed above consistently across objects and classes. For an + | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] + | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + +h(2, "models") Saving models diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 4fb758bb4..958200637 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -105,6 +105,10 @@ include _spacy-101/_word-vectors +h(2, "pipelines") Pipelines ++h(2, "serialization") Serialization + +include _spacy-101/_serialization + +h(2, "architecture") Architecture +image From 8aaed8bea79c9df11fd6c799ddfd31bae2c81318 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:25:13 +0200 Subject: [PATCH 35/51] Add pipelines 101 and rewrite pipelines workflow --- website/docs/usage/_data.json | 2 +- website/docs/usage/_spacy-101/_pipelines.jade | 44 ++ .../usage/language-processing-pipeline.jade | 452 ++++++++++++------ website/docs/usage/spacy-101.jade | 2 + 4 files changed, 349 insertions(+), 151 deletions(-) create mode 100644 website/docs/usage/_spacy-101/_pipelines.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index acd973aa1..4d065522b 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -105,7 +105,7 @@ }, "language-processing-pipeline": { - "title": "Natural language processing pipelines", + "title": "Language processing pipelines", "next": "deep-learning" }, diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade new file mode 100644 index 000000000..fe6c149f6 --- /dev/null +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -0,0 +1,44 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > PIPELINES + +p + | When you call #[code nlp] on a text, spaCy first tokenizes the text to + | produce a #[code Doc] object. 
The #[code Doc] is the processed in several + | different steps – this is also referred to as the + | #[strong processing pipeline]. The pipeline used by our + | #[+a("/docs/usage/models") default models] consists of a + | vectorizer, a tagger, a parser and an entity recognizer. Each pipeline + | component returns the processed #[code Doc], which is then passed on to + | the next component. + ++image + include ../../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic + ++table(["Name", "Component", "Creates"]) + +row + +cell tokenizer + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell #[code Doc] + + +row("divider") + +cell vectorizer + +cell #[code Vectorizer] + +cell #[code Doc.tensor] + + +row + +cell tagger + +cell #[+api("tagger") #[code Tagger]] + +cell #[code Doc[i].tag] + + +row + +cell parser + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell + | #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents], + | #[code Doc.noun_chunks] + + +row + +cell ner + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 0ea2609d2..3b41ad5de 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -2,164 +2,316 @@ include ../../_includes/_mixins -p - | The standard entry point into spaCy is the #[code spacy.load()] - | function, which constructs a language processing pipeline. The standard - | variable name for the language processing pipeline is #[code nlp], for - | Natural Language Processing. The #[code nlp] variable is usually an - | instance of class #[code spacy.language.Language]. For English, the - | #[code spacy.en.English] class is the default. ++h(2, "101") Pipelines 101 + +include _spacy-101/_pipelines + ++h(2, "pipelines") How pipelines work p - | You'll use the nlp instance to produce #[+api("doc") #[code Doc]] - | objects. You'll then use the #[code Doc] object to access linguistic - | annotations to help you with whatever text processing task you're - | trying to do. - -+code. - import spacy # See "Installing spaCy" - nlp = spacy.load('en') # You are here. - doc = nlp(u'Hello, spacy!') # See "Using the pipeline" - print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token" - -+aside("Why do we have to preload?") - | Loading the models takes ~200x longer than - | processing a document. We therefore want to amortize the start-up cost - | across multiple invocations. It's often best to wrap the pipeline as a - | singleton. The library avoids doing that for you, because it's a - | difficult design to back out of. - -p The #[code load] function takes the following positional arguments: - -+table([ "Name", "Description" ]) - +row - +cell #[code lang_id] - +cell - | An ID that is resolved to a class or factory function by - | #[code spacy.util.get_lang_class()]. Common values are - | #[code 'en'] for the English pipeline, or #[code 'de'] for the - | German pipeline. You can register your own factory function or - | class with #[code spacy.util.set_lang_class()]. + | spaCy makes it very easy to create your own pipelines consisting of + | reusable components – this includes spaCy's default vectorizer, tagger, + | parser and entity regcognizer, but also your own custom processing + | functions. 
A pipeline component can be added to an already existing + | #[code nlp] object, specified when initialising a #[code Language] class, + | or defined within a + | #[+a("/docs/usage/saving-loading#models-generating") model package]. p - | All keyword arguments are passed forward to the pipeline factory. No - | keyword arguments are required. The built-in factories (e.g. - | #[code spacy.en.English], #[code spacy.de.German]), which are subclasses - | of #[+api("language") #[code Language]], respond to the following - | keyword arguments: + | When you load a model, spaCy first consults the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its + | #[code setup] details. This typically includes the ID of a language class, + | and an optional list of pipeline components. spaCy then does the + | following: -+table([ "Name", "Description"]) - +row - +cell #[code path] - +cell - | Where to load the data from. If None, the default data path is - | fetched via #[code spacy.util.get_data_path()]. You can - | configure this default using #[code spacy.util.set_data_path()]. - | The data path is expected to be either a string, or an object - | responding to the #[code pathlib.Path] interface. If the path is - | a string, it will be immediately transformed into a - | #[code pathlib.Path] object. spaCy promises to never manipulate - | or open file-system paths as strings. All access to the - | file-system is done via the #[code pathlib.Path] interface. - | spaCy also promises to never check the type of path objects. - | This allows you to customize the loading behaviours in arbitrary - | ways, by creating your own object that implements the - | #[code pathlib.Path] interface. ++aside-code("meta.json (excerpt)", "json"). + { + "name": "example_model", + "description": "Example model for spaCy", + "setup": { + "lang": "en", + "pipeline": ["token_vectors", "tagger"] + } + } - +row - +cell #[code pipeline] - +cell - | A sequence of functions that take the Doc object and modify it - | in-place. See - | #[+a("customizing-pipeline") Customizing the pipeline]. - - +row - +cell #[code create_pipeline] - +cell - | Callback to construct the pipeline sequence. It should accept - | the #[code nlp] instance as its only argument, and return a - | sequence of functions that take the #[code Doc] object and - | modify it in-place. - | See #[+a("customizing-pipeline") Customizing the pipeline]. If - | a value is supplied to the pipeline keyword argument, the - | #[code create_pipeline] keyword argument is ignored. - - +row - +cell #[code make_doc] - +cell A function that takes the input and returns a document object. - - +row - +cell #[code create_make_doc] - +cell - | Callback to construct the #[code make_doc] function. It should - | accept the #[code nlp] instance as its only argument. To use the - | built-in annotation processes, it should return an object of - | type #[code Doc]. If a value is supplied to the #[code make_doc] - | keyword argument, the #[code create_make_doc] keyword argument - | is ignored. - - +row - +cell #[code vocab] - +cell Supply a pre-built Vocab instance, instead of constructing one. - - +row - +cell #[code add_vectors] - +cell - | Callback that installs word vectors into the Vocab instance. The - | #[code add_vectors] callback should take a - | #[+api("vocab") #[code Vocab]] instance as its only argument, - | and set the word vectors and #[code vectors_length] in-place. See - | #[+a("word-vectors-similarities") Word Vectors and Similarities]. 
- - +row - +cell #[code tagger] - +cell Supply a pre-built tagger, instead of creating one. - - +row - +cell #[code parser] - +cell Supply a pre-built parser, instead of creating one. - - +row - +cell #[code entity] - +cell Supply a pre-built entity recognizer, instead of creating one. - - +row - +cell #[code matcher] - +cell Supply a pre-built matcher, instead of creating one. - -+h(2, "customizing") Customizing the pipeline ++list("numbers") + +item + | Look up #[strong pipeline IDs] in the available + | #[strong pipeline factories]. + +item + | Initialise the #[strong pipeline components] by calling their + | factories with the #[code Vocab] as an argument. This gives each + | factory and component access to the pipeline's shared data, like + | strings, morphology and annotation scheme. + +item + | Load the #[strong language class and data] for the given ID via + | #[+api("util.get_lang_class") #[code get_lang_class]]. + +item + | Pass the path to the #[strong model data] to the #[code Language] + | class and return it. p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. - def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. + | So when you call this... +code. nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] + +p + | ... the model tells spaCy to use the pipeline + | #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look + | up each string in its internal factories registry and initialise the + | individual components. It'll then load #[code spacy.lang.en.English], + | pass it the path to the model's data directory, and return it for you + | to use as the #[code nlp] object. + +p + | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and + | then #[strong call each component] on the #[code Doc], in order. + | Components all return the modified document, which is then processed by + | the component next in the pipeline. + ++code("The pipeline under the hood"). + doc = nlp.make_doc(u'This is a sentence') + for proc in nlp.pipeline: + doc = proc(doc) + ++h(2, "creating") Creating pipeline components and factories + +p + | spaCy lets you customise the pipeline with your own components. Components + | are functions that receive a #[code Doc] object, modify and return it. + | If your component is stateful, you'll want to create a new one for each + | pipeline. You can do that by defining and registering a factory which + | receives the shared #[code Vocab] object and returns a component. 
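The two sections below break these pieces down one by one. As a compact preview, a stateful component wired up through a factory might look like the following sketch; the component name and its behaviour are made up for illustration, and the set_factory()/Language(pipeline=...) calls are the ones introduced by this patch series.

    import spacy
    from spacy.language import Language

    def keyword_flagger_factory(vocab):
        keywords = {u'spacy', u'nlp'}                  # state created once per pipeline
        def keyword_flagger(doc):
            # a component receives the Doc, modifies or inspects it, and returns it
            found = [t.text for t in doc if t.lower_ in keywords]
            print('keywords found:', found)
            return doc
        return keyword_flagger

    spacy.set_factory('keyword_flagger', keyword_flagger_factory)
    nlp = Language(pipeline=['keyword_flagger'])
    doc = nlp(u'spaCy is an NLP library')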
+ ++h(3, "creating-component") Creating a component + +p + | A component receives a #[code Doc] object and + | #[strong performs the actual processing] – for example, using the current + | weights to make a prediction and set some annotation on the document. By + | adding a component to the pipeline, you'll get access to the #[code Doc] + | at any point #[strong during] processing – instead of only being able to + | modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. + + +footrow + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. + +p + | When creating a new #[code Language] class, you can pass it a list of + | pipeline component functions to execute in that order. You can also + | add it to an existing pipeline by modifying #[code nlp.pipeline] – just + | be careful not to overwrite a pipeline or its components by accident! + ++code. + # Create a new Language object with a pipeline + from spacy.language import Language + nlp = Language(pipeline=[my_component]) + + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) + ++h(3, "creating-factory") Creating a factory + +p + | A factory is a #[strong function that returns a pipeline component]. + | It's called with the #[code Vocab] object, to give it access to the + | shared data between components – for example, the strings, morphology, + | vectors or annotation scheme. Factories are useful for creating + | #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++aside-code("Example"). + def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[coce Vocab] + +cell + | Shared data between components, including strings, morphology, + | vectors etc. + + +footrow + +cell returns + +cell callable + +cell The pipeline component. + +p + | By creating a factory, you're essentially telling spaCy how to get the + | pipeline component #[strong once the vocab is available]. Factories need to + | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and + | by assigning them a unique ID. This ID can be added to the pipeline as a + | string. When creating a pipeline, you're free to mix strings and + | callable components: + ++code. + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', my_other_component]) + +p + | If spaCy comes across a string in the pipeline, it will try to resolve it + | by looking it up in the available factories. The factory will then be + | initialised with the #[code Vocab]. Providing factory names instead of + | callables also makes it easy to specify them in the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're + | training your own model and want to use one of spaCy's default components, + | you won't have to worry about finding and implementing it either – to use + | the default tagger, simply add #[code "tagger"] to the pipeline, and + | #[strong spaCy will know what to do]. 
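For example, a pipeline can mix a built-in factory ID with a plain callable, as in the sketch below. This only illustrates how the pipeline is assembled; without trained model data the built-in tagger won't produce meaningful annotations.

    from spacy.language import Language

    def debug_component(doc):
        # toy custom component, runs after the built-in tagger
        print('processing %d tokens' % len(doc))
        return doc

    # 'tagger' is looked up in the built-in factories, the callable is used as-is
    nlp = Language(pipeline=['tagger', debug_component])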
+ + ++infobox("Important note") + | Because factories are #[strong resolved on initialisation] of the + | #[code Language] class, it's #[strong not possible] to add them to the + | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only + | works with individual component functions. To use factories, you need to + | create a new #[code Language] object, or generate a + | #[+a("/docs/usage/saving-loading#models-generating") model package] with + | a custom pipeline. + ++h(2, "example1") Example: Custom sentence segmentation logic + ++aside("Real-world examples") + | To see real-world examples of pipeline factories and components in action, + | you can have a look at the source of spaCy's built-in components, e.g. + | the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or + | #[+src(gh("spacy")) entity recognizer]. + +p + | Let's say you want to implement custom logic to improve spaCy's sentence + | boundary detection. Currently, sentence segmentation is based on the + | dependency parse, which doesn't always produce ideal results. The custom + | logic should therefore be applied #[strong after] tokenization, but + | #[strong before] the dependency parsing – this way, the parser can also + | take advantage of the sentence boundaries. + ++code. + def sbd_component(doc): + for i, token in enumerate(doc[:-2]): + # define sentence start if period + titlecase token + if token.text == '.' and doc[i+1].is_title: + doc[i+1].sent_start = True + return doc + +p + | In this case, we simply want to add the component to the existing + | pipeline of the English model. We can do this by inserting it at index 0 + | of #[code nlp.pipeline]: + ++code. + nlp = spacy.load('en') + nlp.pipeline.insert(0, sbd_component) + +p + | When you call #[code nlp] on some text, spaCy will tokenize it to create + | a #[code Doc] object, and first call #[code sbd_component] on it, followed + | by the model's default pipeline. + ++h(2, "example2") Example: Sentiment model + +p + | Let's say you have trained your own document sentiment model on English + | text. After tokenization, you want spaCy to first execute the + | #[strong default vectorizer], followed by a custom + | #[strong sentiment component] that adds a #[code .sentiment] + | property to the #[code Doc], containing your model's sentiment precition. + +p + | Your component class will have a #[code from_disk()] method that spaCy + | calls to load the model data. When called, the component will compute + | the sentiment score, add it to the #[code Doc] and return the modified + | document. Optionally, the component can include an #[code update()] method + | to allow training the model. + ++code. + import pickle + from pathlib import Path + + class SentimentComponent(object): + def __init__(self, vocab): + self.weights = None + + def __call__(self, doc): + doc.sentiment = sum(self.weights*doc.vector) # set sentiment property + return doc + + def from_disk(self, path): # path = model path + factory ID ('sentiment') + self.weights = pickle.load(Path(path) / 'weights.bin') # load weights + return self + + def update(self, doc, gold): # update weights – allows training! + prediction = sum(self.weights*doc.vector) + self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) + +p + | The factory will initialise the component with the #[code Vocab] object. + | To be able to add it to your model's pipeline as #[code 'sentiment'], + | it also needs to be registered via + | #[+api("spacy#set_factory") #[code set_factory()]]. + ++code. 
+ def sentiment_factory(vocab): + component = SentimentComponent(vocab) # initialise component + return component + + spacy.set_factory('sentiment', sentiment_factory) + +p + | The above code should be #[strong shipped with your model]. You can use + | the #[+api("cli#package") #[code package]] command to create all required + | files and directories. The model package will include an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py] + | with a #[code load()] method, that will initialise the language class with + | the model's pipeline and call the #[code from_disk()] method to load + | the model data. + +p + | In the model package's meta.json, specify the language class and pipeline + | IDs in #[code setup]: + ++code("meta.json (excerpt)", "json"). + { + "name": "my_sentiment_model", + "version": "1.0.0", + "spacy_version": ">=2.0.0,<3.0.0", + "setup": { + "lang": "en", + "pipeline": ["vectorizer", "sentiment"] + } + } + +p + | When you load your new model, spaCy will call the model's #[code load()] + | method. This will return a #[code Language] object with a pipeline + | containing the default vectorizer, and the sentiment component returned + | by your custom #[code "sentiment"] factory. + ++code. + nlp = spacy.load('my_sentiment_model') + doc = nlp(u'I love pizza') + assert doc.sentiment + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 958200637..f8779b52f 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -105,6 +105,8 @@ include _spacy-101/_word-vectors +h(2, "pipelines") Pipelines +include _spacy-101/_pipelines + +h(2, "serialization") Serialization include _spacy-101/_serialization From 4f396236f66ff56a168846bdd682d8c8bbaa5c79 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:25:49 +0200 Subject: [PATCH 36/51] Update saving and loading docs --- website/docs/usage/models.jade | 2 +- website/docs/usage/saving-loading.jade | 32 ++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 832ad8211..a837b4d29 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -233,4 +233,4 @@ p +infobox("Saving and loading models") | For more information and a detailed guide on how to package your model, | see the documentation on - | #[+a("/docs/usage/saving-loading") saving and loading models]. + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index e580bca25..74370bbb1 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -10,6 +10,27 @@ include _spacy-101/_serialization | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]], + | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]]. + | This will overwrite the existing object and return it. + ++code. 
+ import spacy + from spacy.tokens import Span + + text = u'Netflix is hiring a new VP of global policy' + + nlp = spacy.load('en') + doc = nlp(text) + assert len(doc.ents) == 0 # Doc has no entities + doc.ents += ((Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])) # add entity + doc.to_disk('/path/to/doc') # save Doc to disk + + new_doc = nlp(text) + assert len(new_doc.ents) == 0 # new Doc has no entities + new_doc = new_doc.from_disk('path/to/doc') # load from disk and overwrite + assert len(new_doc.ents) == 1 # entity is now recognised! + assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')] +h(2, "models") Saving models @@ -46,13 +67,16 @@ p +aside-code("meta.json", "json"). { "name": "example_model", - "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", - "license": "CC BY-SA 3.0" + "license": "CC BY-SA 3.0", + "setup": { + "lang": "en", + "pipeline": ["token_vectors", "tagger"] + } } +code(false, "bash"). @@ -71,10 +95,10 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+src(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of - | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The + | #[code lang_name] and #[code lang_name-version]. | #[code lang] setting in the meta.json is also used to create the | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. From 764bfa3239f4edb2cd73708643c9cb10102c675d Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:53:43 +0200 Subject: [PATCH 37/51] Add section on using displaCy in a web app --- website/docs/usage/visualizers.jade | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index fe779add9..385fa0fd0 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -315,3 +315,61 @@ p 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None } + ++h(2, "webapp") Using displaCy in a web application + +p + | If you want to use the visualizers as part of a web application, for + | example to create something like our + | #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to + | simply wrap and serve the displaCy renderer. Instead, you should only + | rely on the server to perform spaCy's processing capabilities, and use + | #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output. + ++aside("Why not return the HTML by the server?") + | It's certainly possible to just have your server return the markup. + | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to + | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting] + | (XSS). All your user needs to do is find a way to make spaCy return one + | token #[code <script src="malicious-code.js"><script>]. + | Instead of relying on the server to render and sanitize HTML, you + | can do this on the client in JavaScript. displaCy.js creates + | the SVG markup as DOM nodes and will never insert raw HTML. 
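The rest of this section introduces parse_deps() and the client-side script. Purely as a sketch of the server half, using Falcon (one of the libraries mentioned below) with hypothetical route and resource names, an endpoint that returns the JSON-formatted parse for displaCy.js could look roughly like this:

    import json
    import falcon
    import spacy
    from spacy import displacy

    nlp = spacy.load('en')

    class DepParseResource(object):
        def on_post(self, req, resp):
            text = req.stream.read().decode('utf8')
            doc = nlp(text)
            # return JSON for displaCy.js to render on the client - never raw HTML
            resp.body = json.dumps(displacy.parse_deps(doc))
            resp.content_type = 'application/json'

    app = falcon.API()
    app.add_route('/dep', DepParseResource())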
+ +p + | The #[code parse_deps] function takes a #[code Doc] object and returns + | a dictionary in a format that can be rendered by displaCy. + ++code("Example"). + import spacy + from spacy import displacy + + nlp = spacy.load('en') + + def displacy_service(text): + doc = nlp(text) + return displacy.parse_deps(doc) + +p + | Using a library like #[+a("https://falconframework.org/") Falcon] or + | #[+a("http://www.hug.rest/") Hug], you can easily turn the above code + | into a simple REST API that receives a text and returns a JSON-formatted + | parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and + | initialise it with the API URL and the ID or query selector of the + | container to render the visualisation in, e.g. #[code '#displacy'] for + | #[code <div id="displacy">]. + ++code("script.js", "javascript"). + var displacy = new displaCy('http://localhost:8080', { + container: '#displacy' + }) + + function parse(text) { + displacy.parse(text); + } + +p + | When you call #[code parse()], it will make a request to your API, + | receive the JSON-formatted parse and render it in your container. To + | create an interactive experience, you could trigger this function by + | a button and read the text from an #[code <input>] field. From f4658ff0539f36560bf1776a2ef6a1090713bf99 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:54:02 +0200 Subject: [PATCH 38/51] Rewrite usage workflow on saving and loading --- website/docs/usage/saving-loading.jade | 124 ++++++++++++++++++------- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 74370bbb1..413b86477 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -10,6 +10,13 @@ include _spacy-101/_serialization | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. ++h(3, "example-doc") Example: Saving and loading a document + +p + | For simplicity, let's assume you've + | #[+a("/docs/usage/entity-recognition#setting") added custom entities] to + | a #[code Doc], either manually, or by using a + | #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]], | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]]. | This will overwrite the existing object and return it. @@ -99,53 +106,108 @@ p | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code lang_name] and #[code lang_name-version]. - | #[code lang] setting in the meta.json is also used to create the - | respective #[code Language] class in spaCy, which will later be returned - | by the model's #[code load()] method. + ++h(3, "models-custom") Customising the model setup p - | To #[strong build the package], run the following command from within the - | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + | The meta.json includes a #[code setup] key that lets you customise how + | the model should be initialised and loaded. 
You can define the language + | data to be loaded and the + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to + | execute. ++table(["Setting", "Type", "Description"]) + +row + +cell #[code lang] + +cell unicode + +cell ID of the language class to initialise. + + +row + +cell #[code pipeline] + +cell list + +cell + | A list of strings mapping to the IDs of pipeline factories to + | apply in that order. If not set, spaCy's + | #[+a("/docs/usage/language-processing/pipelines") default pipeline] + | will be used. + +p + | The #[code load()] method that comes with our model package + | templates will take care of putting all this together and returning a + | #[code Language] object with the loaded pipeline and data. If your model + | requires custom pipeline components, you should + | #[strong ship then with your model] and register their + | #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories] + | via #[+api("spacy#set_factory") #[code set_factory()]]. + ++aside-code("Factory example"). + def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++code. + spacy.set_factory('custom_component', custom_component_factory) + ++infobox("Custom models with pipeline components") + | For more details and an example of how to package a sentiment model + | with a custom pipeline component, see the usage workflow on + | #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines]. + ++h(3, "models-building") Building the model package + +p + | To build the package, run the following command from within the + | directory. For more information on building Python packages, see the + | docs on Python's + | #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools]. +code(false, "bash"). python setup.py sdist +p + | This will create a #[code .tar.gz] archive in a directory #[code /dist]. + | The model can be installed by pointing pip to the path of the archive: + ++code(false, "bash"). + pip install /path/to/en_example_model-1.0.0.tar.gz + +p + | You can then load the model via its name, #[code en_example_model], or + | import it directly as a module and then call its #[code load()] method. + +h(2, "loading") Loading a custom model package p | To load a model from a data directory, you can use - | #[+api("spacy#load") #[code spacy.load()]] with the local path: + | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will + | look for a meta.json in the directory and use the #[code setup] details + | to initialise a #[code Language] class with a processing pipeline and + | load in the model data. +code. nlp = spacy.load('/path/to/model') p - | If you have generated a model package, you can also install it by - | pointing pip to the model's #[code .tar.gz] archive – this is pretty - | much exactly what spaCy's #[+api("cli#download") #[code download]] - | command does under the hood. - -+code(false, "bash"). - pip install /path/to/en_example_model-1.0.0.tar.gz - -+aside-code("Custom model names", "bash"). - # optional: assign custom name to model - python -m spacy link en_example_model my_cool_model - -p - | You'll then be able to load the model via spaCy's loader, or by importing - | it as a module. For larger code bases, we usually recommend native - | imports, as this will make it easier to integrate models with your - | existing build process, continuous integration workflow and testing - | framework. 
+ | If you want to #[strong load only the binary data], you'll have to create + | a #[code Language] class and call + | #[+api("language#from_disk") #[code from_disk]] instead. +code. - # option 1: import model as module - import en_example_model - nlp = en_example_model.load() + from spacy.lang.en import English + nlp = English().from_disk('/path/to/data') - # option 2: use spacy.load() - nlp = spacy.load('en_example_model') ++infobox("Important note: Loading data in v2.x") + .o-block + | In spaCy 1.x, the distinction between #[code spacy.load()] and the + | #[code Language] class constructor was quite unclear. You could call + | #[code spacy.load()] when no model was present, and it would silently + | return an empty object. Likewise, you could pass a path to + | #[code English], even if the mode required a different language. + | spaCy v2.0 solves this with a clear distinction between setting up + | the instance and loading the data. + + +code-new nlp = English.from_disk('/path/to/data') + +code-old nlp = spacy.load('en', path='/path/to/data') From c25f3133ca6ce1147b84860cd820d945fe45e322 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:54:37 +0200 Subject: [PATCH 39/51] Update section on new v2.0 features --- website/docs/usage/v2.jade | 131 ++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 68 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 4a0e6ca2f..a058c5c13 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -8,6 +8,65 @@ p +h(2, "features") New features ++h(3, "features-pipelines") Improved processing pipelines + ++aside-code("Example"). + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) + + # Register a factory to create a component + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', mycomponent]) + +p + | It's now much easier to customise the pipeline with your own components. + | Components are functions that receive a #[code Doc] object, modify and + | return it. If your component is stateful, you'll want to create a new one + | for each pipeline. You can do that by defining and registering a factory + | which receives the shared #[code Vocab] object and returns a component. + +p + | spaCy's default components – the vectorizer, tagger, parser and entity + | recognizer, can be added to your pipeline by using their string IDs. + | This way, you won't have to worry about finding and implementing them – + | to use the default tagger, simply add #[code "tagger"] to the pipeline, + | and spaCy will know what to do. + ++infobox + | #[strong API:] #[+api("language") #[code Language]] + | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] + ++h(3, "features-serializer") Saving, loading and serialization + ++aside-code("Example"). + nlp = spacy.load('en') # shortcut link + nlp = spacy.load('en_core_web_sm') # package + nlp = spacy.load('/path/to/en') # unicode path + nlp = spacy.load(Path('/path/to/en')) # pathlib Path + + nlp.to_disk('/path/to/nlp') + nlp = English().from_disk('/path/to/nlp') + +p + | spay's serialization API has been made consistent across classes and + | objects. All container classes and pipeline components now have a + | #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and + | #[code from_disk()] method that supports the Pickle protocol. + +p + | The improved #[code spacy.load] makes loading models easier and more + | transparent. 
You can load a model by supplying its + | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed + | #[+a("/docs/usage/saving-loading#generating") model package] or a path. + | The #[code Language] class to initialise will be determined based on the + | model's settings. For a blank language, you can import the class directly, + | e.g. #[code from spacy.lang.en import English]. + ++infobox + | #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]] + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] + +h(3, "features-displacy") displaCy visualizer with Jupyter support +aside-code("Example"). @@ -28,33 +87,6 @@ p | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy] -+h(3, "features-loading") Loading - -+aside-code("Example"). - nlp = spacy.load('en') # shortcut link - nlp = spacy.load('en_core_web_sm') # package - nlp = spacy.load('/path/to/en') # unicode path - nlp = spacy.load(Path('/path/to/en')) # pathlib Path - -p - | The improved #[code spacy.load] makes loading models easier and more - | transparent. You can load a model by supplying its - | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed - | #[+a("/docs/usage/saving-loading#generating") model package], a unicode - | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code path] keyword argument is now deprecated. - -p - | The #[code Language] class to initialise will be determined based on the - | model's settings. If no model is found, spaCy will let you know and won't - | just return an empty #[code Language] object anymore. If you want a blank - | language, you can always import the class directly, e.g. - | #[code from spacy.lang.en import English]. - -+infobox - | #[strong API:] #[+api("spacy#load") #[code spacy.load]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - +h(3, "features-language") Improved language data and lazy loading p @@ -65,46 +97,15 @@ p | complex regular expressions. The language data has also been tidied up | and simplified. It's now also possible to overwrite the functions that | compute lexical attributes like #[code like_num], and supply - | language-specific syntax iterators, e.g. to determine noun chunks. + | language-specific syntax iterators, e.g. to determine noun chunks. spaCy + | now also supports simple lookup-based lemmatization. The data is stored + | in a dictionary mapping a string to its lemma. +infobox + | #[strong API:] #[+api("language") #[code Language]] | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] -+h(3, "features-pipelines") Improved processing pipelines - -+aside-code("Example"). - from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) - -+infobox - | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/processing-text") Processing text] - -+h(3, "features-lemmatizer") Simple lookup-based lemmatization - -+aside-code("Example"). - LOOKUP = { - "aba": "abar", - "ababa": "abar", - "ababais": "abar", - "ababan": "abar", - "ababanes": "ababΓ‘n" - } - -p - | spaCy now supports simple lookup-based lemmatization. The data is stored - | in a dictionary mapping a string to its lemma. To determine a token's - | lemma, spaCy simply looks it up in the table. 
The lookup lemmatizer can - | be imported from #[code spacy.lemmatizerlookup]. It's initialised with - | the lookup table, and should be returned by the #[code create_lemmatizer] - | classmethod of the language's defaults. - -+infobox - | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] - +h(3, "features-matcher") Revised matcher API +aside-code("Example"). @@ -129,12 +130,6 @@ p | #[strong API:] #[+api("matcher") #[code Matcher]] | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] -+h(3, "features-serializer") Serialization - -+infobox - | #[strong API:] #[+api("serializer") #[code Serializer]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - +h(3, "features-models") Neural network models for English, German, French and Spanish +infobox From 9337866dae5915f7b1a385b9d903c1310c8884d9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 22:46:18 +0200 Subject: [PATCH 40/51] Add aside to pipeline 101 table --- website/docs/usage/_spacy-101/_pipelines.jade | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index fe6c149f6..d984a4708 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -15,6 +15,12 @@ p .u-text-right +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic ++aside + | #[strong Name:] ID of the pipeline component.#[br] + | #[strong Component:] spaCy's implementation of the component.#[br] + | #[strong Creates:] Objects, attributes and properties modified and set by + | the component. + +table(["Name", "Component", "Creates"]) +row +cell tokenizer From 9efa662345e89b93ce2cf1c569c30cd7abd4ba19 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:09:51 +0200 Subject: [PATCH 41/51] Update dependency parse docs and add note on disabling parser --- website/docs/usage/dependency-parse.jade | 66 ++++++++++++++---------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index abfa1f825..dfb37f786 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -6,18 +6,20 @@ p | spaCy features a fast and accurate syntactic dependency parser, and has | a rich API for navigating the tree. The parser also powers the sentence | boundary detection, and lets you iterate over base noun phrases, or - | "chunks". - -p - | You can check whether a #[+api("doc") #[code Doc]] object has been - | parsed with the #[code doc.is_parsed] attribute, which returns a boolean - | value. If this attribute is #[code False], the default sentence iterator - | will raise an exception. + | "chunks". You can check whether a #[+api("doc") #[code Doc]] object has + | been parsed with the #[code doc.is_parsed] attribute, which returns a + | boolean value. If this attribute is #[code False], the default sentence + | iterator will raise an exception. +h(2, "noun-chunks") Noun chunks +tag-model("dependency parse") -p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor. 
+p + | Noun chunks are "base noun phrases" – flat phrases that have a noun as + | their head. You can think of noun chunks as a noun plus the words describing + | the noun – for example, "the lavish green grass" or "the world’s largest + | tech fund". To get the noun chunks in a document, simply iterate over + | #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]. +code("Example"). nlp = spacy.load('en') @@ -28,9 +30,10 @@ p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pr +aside | #[strong Text:] The original noun chunk text.#[br] - | #[strong Root text:] ...#[br] - | #[strong Root dep:] ...#[br] - | #[strong Root head text:] ...#[br] + | #[strong Root text:] The original text of the word connecting the noun + | chunk to the rest of the parse.#[br] + | #[strong Root dep:] Dependcy relation connecting the root to its head.#[br] + | #[strong Root head text:] The text of the root token's head.#[br] +table(["Text", "root.text", "root.dep_", "root.head.text"]) - var style = [0, 0, 1, 0] @@ -59,7 +62,7 @@ p | #[strong Dep]: The syntactic relation connecting child to head.#[br] | #[strong Head text]: The original text of the token head.#[br] | #[strong Head POS]: The part-of-speech tag of the token head.#[br] - | #[strong Children]: ... + | #[strong Children]: The immediate syntactic dependents of the token. +table(["Text", "Dep", "Head text", "Head POS", "Children"]) - var style = [0, 1, 0, 1, 0] @@ -204,20 +207,31 @@ p +h(2, "disabling") Disabling the parser p - | The parser is loaded and enabled by default. If you don't need any of - | the syntactic information, you should disable the parser. Disabling the - | parser will make spaCy load and run much faster. Here's how to prevent - | the parser from being loaded: + | In the #[+a("/docs/usage/models/available") default models], the parser + | is loaded and enabled as part of the + | #[+a("docs/usage/language-processing-pipelines") standard processing pipeline]. + | If you don't need any of the syntactic information, you should disable + | the parser. Disabling the parser will make spaCy load and run much faster. + | If you want to load the parser, but need to disable it for specific + | documents, you can also control its use on the #[code nlp] object. +code. - nlp = spacy.load('en', parser=False) + nlp = spacy.load('en', disable=['parser']) + nlp = English().from_disk('/model', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) -p - | If you need to load the parser, but need to disable it for specific - | documents, you can control its use with the #[code parse] keyword - | argument: - -+code. - nlp = spacy.load('en') - doc1 = nlp(u'Text I do want parsed.') - doc2 = nlp(u"Text I don't want parsed", parse=False) ++infobox("Important note: disabling pipeline components") + .o-block + | Since spaCy v2.0 comes with better support for customising the + | processing pipeline components, the #[code parser] keyword argument + | has been replaced with #[code disable], which takes a list of + | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | This lets you disable both default and custom components when loading + | a model, or initialising a Language class via + | #[+api("language-from_disk") #[code from_disk]]. + +code-new. + nlp = spacy.load('en', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old. 
+        nlp = spacy.load('en', parser=False)
+        doc = nlp(u"I don't want parsed", parse=False)

From 419d265ff047370e025797395cef5543efce9773 Mon Sep 17 00:00:00 2001
From: ines
Date: Thu, 25 May 2017 00:10:06 +0200
Subject: [PATCH 42/51] Add section on disabling pipeline components

---
 .../usage/language-processing-pipeline.jade   | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index 3b41ad5de..7124bdadc 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -315,3 +315,43 @@ p
   | For more information and a detailed guide on how to package your model,
   | see the documentation on
   | #[+a("/docs/usage/saving-loading#models") saving and loading models].
+
++h(2, "disabling") Disabling pipeline components
+
+p
+  | If you don't need a particular component of the pipeline – for
+  | example, the tagger or the parser – you can disable loading it. This can
+  | sometimes make a big difference and improve loading speed. Disabled
+  | component names can be provided to #[code spacy.load], #[code from_disk]
+  | or the #[code nlp] object itself as a list:
+
++code.
+    nlp = spacy.load('en', disable=['parser', 'tagger'])
+    nlp = English().from_disk('/model', disable=['vectorizer', 'ner'])
+    doc = nlp(u"I don't want parsed", disable=['parser'])
+
+p
+  | Note that you can't write directly to #[code nlp.pipeline], as this list
+  | holds the #[em actual components], not the IDs. However, if you know the
+  | order of the components, you can still slice the list:
+
++code.
+    nlp = spacy.load('en')
+    nlp.pipeline = nlp.pipeline[:2] # only use the first two components
+
++infobox("Important note: disabling pipeline components")
+    .o-block
+        | Since spaCy v2.0 comes with better support for customising the
+        | processing pipeline components, the #[code parser], #[code tagger]
+        | and #[code entity] keyword arguments have been replaced with
+        | #[code disable], which takes a list of
+        | #[+a("/docs/usage/language-processing-pipeline") pipeline component names].
+        | This lets you disable both default and custom components when loading
+        | a model, or initialising a Language class via
+        | #[+api("language-from_disk") #[code from_disk]].
+    +code-new.
+        nlp = spacy.load('en', disable=['parser'])
+        doc = nlp(u"I don't want parsed", disable=['parser'])
+    +code-old. 
+ nlp = spacy.load('en', parser=False) + doc = nlp(u"I don't want parsed", parse=False) From 0f48fb1f9702f702715cddc95a2b3e57fb4e1cfb Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:10:33 +0200 Subject: [PATCH 43/51] Rename processing text to production use and remove linear feature scheme --- website/docs/api/_data.json | 7 +- website/docs/api/features.jade | 138 ------------------ website/docs/usage/_data.json | 13 +- ...ocessing-text.jade => production-use.jade} | 63 -------- 4 files changed, 8 insertions(+), 213 deletions(-) delete mode 100644 website/docs/api/features.jade rename website/docs/usage/{processing-text.jade => production-use.jade} (58%) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 443ee9a67..f3f996846 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -27,8 +27,7 @@ "GoldCorpus": "goldcorpus" }, "Other": { - "Annotation Specs": "annotation", - "Feature Scheme": "features" + "Annotation Specs": "annotation" } }, @@ -143,9 +142,5 @@ "annotation": { "title": "Annotation Specifications" - }, - - "features": { - "title": "Linear Model Feature Scheme" } } diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade deleted file mode 100644 index 018790145..000000000 --- a/website/docs/api/features.jade +++ /dev/null @@ -1,138 +0,0 @@ -//- πŸ’« DOCS > API > LINEAR MOEL FEATURES - -include ../../_includes/_mixins - -p - | There are two popular strategies for putting together machine learning - | models for NLP: sparse linear models, and neural networks. To solve NLP - | problems with linear models, feature templates need to be assembled that - | combine multiple atomic predictors. This page documents the atomic - | predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]], - | #[+api("tagger") #[code Tagger]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]. - -p - | To understand the scheme, recall that spaCy's #[code Parser] and - | #[code EntityRecognizer] are implemented as push-down automata. They - | maintain a "stack" that holds the current entity, and a "buffer" - | consisting of the words to be processed. - -p - | Each state consists of the words on the stack (if any), which consistute - | the current entity being constructed. We also have the current word, and - | the two subsequent words. Finally, we also have the entities previously - | built. - -p - | This gives us a number of tokens to ask questions about, to make the - | features. About each of these tokens, we can ask about a number of - | different properties. Each feature identifier asks about a specific - | property of a specific token of the context. - -+h(2, "tokens") Context tokens - -+table([ "ID", "Description" ]) - +row - +cell #[code S0] - +cell - | The first word on the stack, i.e. the token most recently added - | to the current entity. - - +row - +cell #[code S1] - +cell The second word on the stack, i.e. the second most recently added. - - +row - +cell #[code S2] - +cell The third word on the stack, i.e. the third most recently added. - - +row - +cell #[code N0] - +cell The first word of the buffer, i.e. the current word being tagged. - - +row - +cell #[code N1] - +cell The second word of the buffer. - - +row - +cell #[code N2] - +cell The third word of the buffer. - - +row - +cell #[code P1] - +cell The word immediately before #[code N0]. - - +row - +cell #[code P2] - +cell The second word before #[code N0]. 
- - +row - +cell #[code E0] - +cell The first word of the previously constructed entity. - - +row - +cell #[code E1] - +cell The first word of the second previously constructed entity. - -p About each of these tokens, we can ask: - -+table([ "ID", "Attribute", "Description" ]) - +row - +cell #[code N0w] - +cell #[code token.orth] - +cell The word form. - - +row - +cell #[code N0W] - +cell #[code token.lemma] - +cell The word's lemma. - - +row - +cell #[code N0p] - +cell #[code token.tag] - +cell The word's (full) POS tag. - - +row - +cell #[code N0c] - +cell #[code token.cluster] - +cell The word's (full) Brown cluster. - - +row - +cell #[code N0c4] - +cell - - +cell First four digit prefix of the word's Brown cluster. - - +row - +cell #[code N0c6] - +cell - - +cell First six digit prefix of the word's Brown cluster. - - +row - +cell #[code N0L] - +cell - - +cell The word's dependency label. Not used as a feature in the NER. - - +row - +cell #[code N0_prefix] - +cell #[code token.prefix] - +cell The first three characters of the word. - - +row - +cell #[code N0_suffix] - +cell #[code token.suffix] - +cell The last three characters of the word. - - +row - +cell #[code N0_shape] - +cell #[code token.shape] - +cell The word's shape, i.e. is it alphabetic, numeric, etc. - - +row - +cell #[code N0_ne_iob] - +cell #[code token.ent_iob] - +cell The Inside/Outside/Begin code of the word's NER tag. - - +row - +cell #[code N0_ne_type] - +cell #[code token.ent_type] - +cell The word's NER type. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 4d065522b..3a24a38df 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -15,9 +15,9 @@ "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", - "Processing text": "processing-text", "NLP pipelines": "language-processing-pipeline", "Deep learning": "deep-learning", + "Production use": "production-use", "Training": "training", "Training NER": "training-ner", "Saving & loading": "saving-loading", @@ -99,11 +99,6 @@ "next": "training" }, - "processing-text": { - "title": "Processing text", - "next": "language-processing-pipeline" - }, - "language-processing-pipeline": { "title": "Language processing pipelines", "next": "deep-learning" @@ -111,9 +106,15 @@ "deep-learning": { "title": "Hooking a deep learning model into spaCy", + "next": "production use" + }, + + "production-use": { + "title": "Production use", "next": "training" }, + "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/production-use.jade similarity index 58% rename from website/docs/usage/processing-text.jade rename to website/docs/usage/production-use.jade index 2562d9fc4..68a313d8a 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/production-use.jade @@ -6,69 +6,6 @@ p | Once you have loaded the #[code nlp] object, you can call it as though | it were a function. This allows you to process a single unicode string. -+code. - doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') - -p - | The library should perform equally well with #[strong short or long documents]. - | All algorithms are linear-time in the length of the string, and once the - | data is loaded, there's no significant start-up cost to consider. 
This - | means that you don't have to strategically merge or split your text β€” - | you should feel free to feed in either single tweets or whole novels. - -p - | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will - | load the #[+a("/docs/usage/models") model] associated with the name - | #[code 'en']. Each model is a Python package containing an - | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py] - -the #[code nlp] object will - | be an instance of #[code spacy.en.English]. This means that when you run - | #[code doc = nlp(text)], you're executing - | #[code spacy.en.English.__call__], which is implemented on its parent - | class, #[+api("language") #[code Language]]. - -+code. - doc = nlp.make_doc(text) - for proc in nlp.pipeline: - proc(doc) - -p - | I've tried to make sure that the #[code Language.__call__] function - | doesn't do any "heavy lifting", so that you won't have complicated logic - | to replicate if you need to make your own pipeline class. This is all it - | does. - -p - | The #[code .make_doc()] method and #[code .pipeline] attribute make it - | easier to customise spaCy's behaviour. If you're using the default - | pipeline, we can desugar one more time. - -+code. - doc = nlp.tokenizer(text) - nlp.tagger(doc) - nlp.parser(doc) - nlp.entity(doc) - -p Finally, here's where you can find out about each of those components: - -+table(["Name", "Source"]) - +row - +cell #[code tokenizer] - +cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer] - - +row - +cell #[code tagger] - +cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger] - - +row - +cell #[code parser] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser] - - +row - +cell #[code entity] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer] - +h(2, "multithreading") Multi-threading with #[code .pipe()] p From d122bbc9084adcb9aa0e6af57f5df828d0753ffb Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:30:21 +0200 Subject: [PATCH 44/51] Rewrite custom tokenizer docs --- website/docs/usage/customizing-tokenizer.jade | 101 +++++++++++------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 5871e1655..86040a4eb 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -11,16 +11,10 @@ p | #[code spaces] booleans, which allow you to maintain alignment of the | tokens into the original string. -+aside("spaCy's data model") - | The main point to keep in mind is that spaCy's #[code Doc] doesn't - | copy or refer to the original string. The string is reconstructed from - | the tokens when required. - +h(2, "101") Tokenizer 101 include _spacy-101/_tokenization - +h(3, "101-data") Tokenizer data p @@ -221,27 +215,68 @@ p +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline p - | You can pass a custom tokenizer using the #[code make_doc] keyword, when - | you're creating the pipeline: + | The tokenizer is the first component of the processing pipeline and the + | only one that can't be replaced by writing to #[code nlp.pipeline]. This + | is because it has a different signature from all the other components: + | it takes a text and returns a #[code Doc], whereas all other components + | expect to already receive a tokenized #[code Doc]. 
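+
+p
+  | As a rough sketch of that difference (the component names below are
+  | purely illustrative placeholders, not part of spaCy's API):
+
++code.
+    doc = my_custom_tokenizer(u'Some raw text')   # tokenizer: text in, Doc out
+    doc = my_pipeline_component(doc)              # other components: Doc in, Doc out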
+ ++image + include ../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic -+code. - nlp = spacy.load('en', make_doc=my_tokenizer) p - | However, this approach often leaves us with a chicken-and-egg problem. - | To construct the tokenizer, we usually want attributes of the #[code nlp] - | pipeline. Specifically, we want the tokenizer to hold a reference to the - | pipeline's vocabulary object. Let's say we have the following class as - | our tokenizer: - + | To overwrite the existing tokenizer, you need to replace + | #[code nlp.tokenizer] with a custom function that takes a text, and + | returns a #[code Doc]. + ++code. + nlp = spacy.load('en') + nlp.tokenizer = my_tokenizer + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code text] + +cell unicode + +cell The raw text to tokenize. + + +footrow + +cell returns + +cell #[code Doc] + +cell The tokenized document. + ++infobox("Important note: using a custom tokenizer") + .o-block + | In spaCy v1.x, you had to add a custom tokenizer by passing it to the + | #[code make_doc] keyword argument, or by passing a tokenizer "factory" + | to #[code create_make_doc]. This was unnecessarily complicated. Since + | spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your + | tokenizer needs the vocab, you can write a function and use + | #[code nlp.vocab]. + + +code-new. + nlp.tokenizer = my_tokenizer + nlp.tokenizer = my_tokenizer_factory(nlp.vocab) + +code-old. + nlp = spacy.load('en', make_doc=my_tokenizer) + nlp = spacy.load('en', create_make_doc=my_tokenizer_factory) + ++h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer + +p + | To construct the tokenizer, we usually want attributes of the #[code nlp] + | pipeline. Specifically, we want the tokenizer to hold a reference to the + | vocabulary object. Let's say we have the following class as + | our tokenizer: +code. - import spacy from spacy.tokens import Doc class WhitespaceTokenizer(object): - def __init__(self, nlp): - self.vocab = nlp.vocab + def __init__(self, vocab): + self.vocab = vocab def __call__(self, text): words = text.split(' ') @@ -250,28 +285,12 @@ p return Doc(self.vocab, words=words, spaces=spaces) p - | As you can see, we need a #[code vocab] instance to construct this β€” but - | we won't get the #[code vocab] instance until we get back the #[code nlp] - | object from #[code spacy.load()]. The simplest solution is to build the - | object in two steps: + | As you can see, we need a #[code Vocab] instance to construct this β€” but + | we won't have it until we get back the loaded #[code nlp] object. The + | simplest solution is to build the tokenizer in two steps. This also means + | that you can reuse the "tokenizer factory" and initialise it with + | different instances of #[code Vocab]. +code. nlp = spacy.load('en') - nlp.make_doc = WhitespaceTokenizer(nlp) - -p - | You can instead pass the class to the #[code create_make_doc] keyword, - | which is invoked as callback once the #[code nlp] object is ready: - -+code. - nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer) - -p - | Finally, you can of course create your own subclasses, and create a bound - | #[code make_doc] method. The disadvantage of this approach is that spaCy - | uses inheritance to give each language-specific pipeline its own class. 
- | If you're working with multiple languages, a naive solution will - | therefore require one custom class per language you're working with. - | This might be at least annoying. You may be able to do something more - | generic by doing some clever magic with metaclasses or mixins, if that's - | the sort of thing you're into. + nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) From 709ea589909bf1b290ad4d4a1fb7545961bcf683 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:56:16 +0200 Subject: [PATCH 45/51] Tidy up workflows --- website/docs/usage/_data.json | 10 +- website/docs/usage/data-model.jade | 264 ------------------ .../usage/language-processing-pipeline.jade | 4 +- website/docs/usage/resources.jade | 118 -------- 4 files changed, 4 insertions(+), 392 deletions(-) delete mode 100644 website/docs/usage/data-model.jade delete mode 100644 website/docs/usage/resources.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 3a24a38df..9f51df5c4 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -15,7 +15,7 @@ "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", - "NLP pipelines": "language-processing-pipeline", + "Processing pipelines": "language-processing-pipeline", "Deep learning": "deep-learning", "Production use": "production-use", "Training": "training", @@ -48,18 +48,13 @@ "lightning-tour": { "title": "Lightning tour", - "next": "visualizers" + "next": "v2" }, "visualizers": { "title": "Visualizers" }, - "troubleshooting": { - "title": "Troubleshooting", - "next": "resources" - }, - "v2": { "title": "What's new in v2.0" }, @@ -114,7 +109,6 @@ "next": "training" }, - "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" diff --git a/website/docs/usage/data-model.jade b/website/docs/usage/data-model.jade deleted file mode 100644 index 6be205178..000000000 --- a/website/docs/usage/data-model.jade +++ /dev/null @@ -1,264 +0,0 @@ -//- πŸ’« DOCS > USAGE > SPACY'S DATA MODEL - -include ../../_includes/_mixins - -p After reading this page, you should be able to: - -+list - +item Understand how spaCy's Doc, Span, Token and Lexeme object work - +item Start using spaCy's Cython API - +item Use spaCy more efficiently - -+h(2, "architecture") Architecture - -+image - include ../../assets/img/docs/architecture.svg - -+h(2, "design-considerations") Design considerations - -+h(3, "no-job-too-big") No job too big - -p - | When writing spaCy, one of my mottos was #[em no job too big]. I wanted - | to make sure that if Google or Facebook were founded tomorrow, spaCy - | would be the obvious choice for them. I wanted spaCy to be the obvious - | choice for web-scale NLP. This meant sweating about performance, because - | for web-scale tasks, Moore's law can't save you. - -p - | Most computational work gets less expensive over time. If you wrote a - | program to solve fluid dynamics in 2008, and you ran it again in 2014, - | you would expect it to be cheaper. For NLP, it often doesn't work out - | that way. The problem is that we're writing programs where the task is - | something like "Process all articles in the English Wikipedia". Sure, - | compute prices dropped from $0.80 per hour to $0.20 per hour on AWS in - | 2008-2014. But the size of Wikipedia grew from 3GB to 11GB. Maybe the - | job is a #[em little] cheaper in 2014 β€” but not by much. 
- -+h(3, "annotation-layers") Multiple layers of annotation - -p - | When I tell a certain sort of person that I'm a computational linguist, - | this comic is often the first thing that comes to their mind: - -+image("http://i.imgur.com/n3DTzqx.png", 450) - +image-caption © #[+a("http://xkcd.com") xkcd] - -p - | I've thought a lot about what this comic is really trying to say. It's - | probably not talking about #[em data models] β€” but in that sense at - | least, it really rings true. - -p - | You'll often need to model a document as a sequence of sentences. Other - | times you'll need to model it as a sequence of words. Sometimes you'll - | care about paragraphs, other times you won't. Sometimes you'll care - | about extracting quotes, which can cross paragraph boundaries. A quote - | can also occur within a sentence. When we consider sentence structure, - | things get even more complicated and contradictory. We have syntactic - | trees, sequences of entities, sequences of phrases, sub-word units, - | multi-word units... - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. They're - | often going to need to query them jointly. You need to be able to get - | the syntactic head of a named entity, or the sentiment of a paragraph. - -+h(2, "solutions") Solutions - -+h(3) Fat types, thin tokens - -+h(3) Static model, dynamic views - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. For this - | reason, I think it's a bad idea to have too much of the document - | structure reflected in the data model. If you structure the data - | according to the needs of one layer of annotation, you're going to need - | to copy the data and transform it in order to use a different layer of - | annotation. You'll soon have lots of copies, and no single source of - | truth. - -+h(3) Never go full stand-off - -+h(3) Implementation - -+h(3) Cython 101 - -+h(3) #[code cdef class Doc] - -p - | Let's start at the top. Here's the memory layout of the - | #[+api("doc") #[code Doc]] class, minus irrelevant details: - -+code. - from cymem.cymem cimport Pool - from ..vocab cimport Vocab - from ..structs cimport TokenC - - cdef class Doc: - cdef Pool mem - cdef Vocab vocab - - cdef TokenC* c - - cdef int length - cdef int max_length - -p - | So, our #[code Doc] class is a wrapper around a TokenC* array β€” that's - | where the actual document content is stored. Here's the #[code TokenC] - | struct, in its entirety: - -+h(3) #[code cdef struct TokenC] - -+code. - cdef struct TokenC: - const LexemeC* lex - uint64_t morph - univ_pos_t pos - bint spacy - int tag - int idx - int lemma - int sense - int head - int dep - bint sent_start - - uint32_t l_kids - uint32_t r_kids - uint32_t l_edge - uint32_t r_edge - - int ent_iob - int ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. - hash_t ent_id - -p - | The token owns all of its linguistic annotations, and holds a const - | pointer to a #[code LexemeC] struct. The #[code LexemeC] struct owns all - | of the #[em vocabulary] data about the word β€” all the dictionary - | definition stuff that we want to be shared by all instances of the type. - | Here's the #[code LexemeC] struct, in its entirety: - -+h(3) #[code cdef struct LexemeC] - -+code. 
- cdef struct LexemeC: - - int32_t id - - int32_t orth # Allows the string to be retrieved - int32_t length # Length of the string - - uint64_t flags # These are the most useful parts. - int32_t cluster # Distributional similarity cluster - float prob # Probability - float sentiment # Slot for sentiment - - int32_t lang - - int32_t lower # These string views made sense - int32_t norm # when NLP meant linear models. - int32_t shape # Now they're less relevant, and - int32_t prefix # will probably be revised. - int32_t suffix - - float* vector # <-- This was a design mistake, and will change. - -+h(2, "dynamic-views") Dynamic views - -+h(3) Text - -p - | You might have noticed that in all of the structs above, there's not a - | string to be found. The strings are all stored separately, in the - | #[+api("stringstore") #[code StringStore]] class. The lexemes don't know - | the strings β€” they only know their integer IDs. The document string is - | never stored anywhere, either. Instead, it's reconstructed by iterating - | over the tokens, which look up the #[code orth] attribute of their - | underlying lexeme. Once we have the orth ID, we can fetch the string - | from the vocabulary. Finally, each token knows whether a single - | whitespace character (#[code ' ']) should be used to separate it from - | the subsequent tokens. This allows us to preserve whitespace. - -+code. - cdef print_text(Vocab vocab, const TokenC* tokens, int length): - for i in range(length): - word_string = vocab.strings[tokens.lex.orth] - if tokens.lex.spacy: - word_string += ' ' - print(word_string) - -p - | This is why you get whitespace tokens in spaCy β€” we need those tokens, - | so that we can reconstruct the document string. I also think you should - | have those tokens anyway. Most NLP libraries strip them, making it very - | difficult to recover the paragraph information once you're at the token - | level. You'll never have that sort of problem with spaCy β€” because - | there's a single source of truth. - -+h(3) #[code cdef class Token] - -p When you do... - -+code. - doc[i] - -p - | ...you get back an instance of class #[code spacy.tokens.token.Token]. - | This instance owns no data. Instead, it holds the information - | #[code (doc, i)], and uses these to retrieve all information via the - | parent container. - -+h(3) #[code cdef class Span] - -p When you do... - -+code. - doc[i : j] - -p - | ...you get back an instance of class #[code spacy.tokens.span.Span]. - | #[code Span] instances are also returned by the #[code .sents], - | #[code .ents] and #[code .noun_chunks] iterators of the #[code Doc] - | object. A #[code Span] is a slice of tokens, with an optional label - | attached. Its data model is: - -+code. - cdef class Span: - cdef readonly Doc doc - cdef int start - cdef int end - cdef int start_char - cdef int end_char - cdef int label - -p - | Once again, the #[code Span] owns almost no data. Instead, it refers - | back to the parent #[code Doc] container. - -p - | The #[code start] and #[code end] attributes refer to token positions, - | while #[code start_char] and #[code end_char] record the character - | positions of the span. By recording the character offsets, we can still - | use the #[code Span] object if the tokenization of the document changes. - -+h(3) #[code cdef class Lexeme] - -p When you do... - -+code. - vocab[u'the'] - -p - | ...you get back an instance of class #[code spacy.lexeme.Lexeme]. The - | #[code Lexeme]'s data model is: - -+code. 
- cdef class Lexeme: - cdef LexemeC* c - cdef readonly Vocab vocab diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 7124bdadc..8bb92caae 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -350,8 +350,8 @@ p | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. +code-new. - nlp = spacy.load('en', disable=['parser']) + nlp = spacy.load('en', disable=['tagger', 'ner']) doc = nlp(u"I don't want parsed", disable=['parser']) +code-old. - nlp = spacy.load('en', parser=False) + nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False) diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade deleted file mode 100644 index 56e92a1e7..000000000 --- a/website/docs/usage/resources.jade +++ /dev/null @@ -1,118 +0,0 @@ -//- πŸ’« DOCS > USAGE > RESOURCES - -include ../../_includes/_mixins - -p Many of the associated tools and resources that we're developing alongside spaCy can be found in their own repositories. - -+h(2, "developer") Developer tools - -+table(["Name", "Description"]) - +row - +cell - +src(gh("spacy-models")) spaCy Models - - +cell - | Model releases for spaCy. - - +row - +cell - +src(gh("spacy-dev-resources")) spaCy Dev Resources - - +cell - | Scripts, tools and resources for developing spaCy, adding new - | languages and training new models. - - +row - +cell - +src("spacy-benchmarks") spaCy Benchmarks - - +cell - | Runtime performance comparison of spaCy against other NLP - | libraries. - - +row - +cell - +src(gh("spacy-services")) spaCy Services - - +cell - | REST microservices for spaCy demos and visualisers. - - +row - +cell - +src(gh("spacy-notebooks")) spaCy Notebooks - - +cell - | Jupyter notebooks for spaCy examples and tutorials. - -+h(2, "libraries") Libraries and projects -+table(["Name", "Description"]) - +row - +cell - +src(gh("sense2vec")) sense2vec - - +cell - | Use spaCy to go beyond vanilla - | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec]. - -+h(2, "utility") Utility libraries and dependencies - -+table(["Name", "Description"]) - +row - +cell - +src(gh("thinc")) Thinc - - +cell - | spaCy's Machine Learning library for NLP in Python. - - +row - +cell - +src(gh("cymem")) Cymem - - +cell - | Gate Cython calls to malloc/free behind Python ref-counted - | objects. - - +row - +cell - +src(gh("preshed")) Preshed - - +cell - | Cython hash tables that assume keys are pre-hashed - - +row - +cell - +src(gh("murmurhash")) MurmurHash - - +cell - | Cython bindings for - | #[+a("https://en.wikipedia.org/wiki/MurmurHash") MurmurHash2]. - -+h(2, "visualizers") Visualisers and demos - -+table(["Name", "Description"]) - +row - +cell - +src(gh("displacy")) displaCy.js - - +cell - | A lightweight dependency visualisation library for the modern - | web, built with JavaScript, CSS and SVG. - | #[+a(DEMOS_URL + "/displacy") Demo here]. - - +row - +cell - +src(gh("displacy-ent")) displaCy#[sup ENT] - - +cell - | A lightweight and modern named entity visualisation library - | built with JavaScript and CSS. - | #[+a(DEMOS_URL + "/displacy-ent") Demo here]. - - +row - +cell - +src(gh("sense2vec-demo")) sense2vec Demo - - +cell - | Source of our Semantic Analysis of the Reddit Hivemind - | #[+a(DEMOS_URL + "/sense2vec") demo] using - | #[+a(gh("sense2vec")) sense2vec]. 
From fe2b0b8b8ded38fa6ba59f951f2ca437d64d8521 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:56:35 +0200 Subject: [PATCH 46/51] Update migrating docs --- website/docs/usage/v2.jade | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index a058c5c13..9bf32bf96 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -260,12 +260,16 @@ p +h(3, "migrating-saving-loading") Saving, loading and serialization -+h(2, "migrating") Migrating from spaCy 1.x p | Double-check all calls to #[code spacy.load()] and make sure they don't - | use the #[code path] keyword argument. + | use the #[code path] keyword argument. If you're only loading in binary + | data and not a model package that can construct its own #[code Language] + | class and pipeline, you should now use the + | #[+api("language#from_disk") #[code Language.from_disk()]] method. -+code-new nlp = spacy.load('/model') ++code-new. + nlp = spacy.load('/model') + nlp = English().from_disk('/model/data') +code-old nlp = spacy.load('en', path='/model') p @@ -288,15 +292,26 @@ p | If you're importing language data or #[code Language] classes, make sure | to change your import statements to import from #[code spacy.lang]. If | you've added your own custom language, it needs to be moved to - | #[code spacy/lang/xx]. + | #[code spacy/lang/xx] and adjusted accordingly. +code-new from spacy.lang.en import English +code-old from spacy.en import English p - | All components, e.g. tokenizer exceptions, are now responsible for - | compiling their data in the correct format. The language_data.py files - | have been removed + | If you've been using custom pipeline components, check out the new + | guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines]. + | Appending functions to the pipeline still works – but you might be able + | to make this more convenient by registering "component factories". + | Components of the processing pipeline can now be disabled by passing a + | list of their names to the #[code disable] keyword argument on loading + | or processing. + ++code-new. + nlp = spacy.load('en', disable=['tagger', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) ++code-old. + nlp = spacy.load('en', tagger=False, entity=False) + doc = nlp(u"I don't want parsed", parse=False) +h(3, "migrating-matcher") Adding patterns and callbacks to the matcher From 87c976e04c15ff9c440d875a93f7937398cdf8a5 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 01:58:22 +0200 Subject: [PATCH 47/51] Update model tag --- website/docs/usage/pos-tagging.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index 245156b77..dd72efeba 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -8,7 +8,7 @@ p | processes. They can also be useful features in some statistical models. 
 +h(2, "101") Part-of-speech tagging 101
-  +tag-model("dependency parse")
+  +tag-model("tagger", "dependency parse")
 
 include _spacy-101/_pos-deps
 
From 4b5540cc63a611812d98477901b3fae60fff6700 Mon Sep 17 00:00:00 2001
From: ines
Date: Thu, 25 May 2017 01:58:33 +0200
Subject: [PATCH 48/51] Rewrite examples in lightning tour

---
 website/docs/usage/lightning-tour.jade | 260 +++++++++++++------------
 1 file changed, 134 insertions(+), 126 deletions(-)

diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index 24654b853..a946beb55 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -6,40 +6,138 @@ p
   | The following examples and code snippets give you an overview of spaCy's
   | functionality and its usage.
 
-+h(2, "models") Install and load models
++h(2, "models") Install models and process text
 
 +code(false, "bash").
     python -m spacy download en
+    python -m spacy download de
 
 +code.
     import spacy
     nlp = spacy.load('en')
+    doc = nlp(u'Hello, world. Here are two sentences.')
 
-+h(2, "examples-resources") Load resources and process text
+    nlp_de = spacy.load('de')
+    doc_de = nlp_de(u'Ich bin ein Berliner.')
+
++infobox
+  | #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
+  | #[strong Usage:] #[+a("/docs/usage/models") Models],
+  | #[+a("/docs/usage/spacy-101") spaCy 101]
+
++h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+  +tag-model("dependency parse")
+
++code.
+    doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
+              u"emoji. It's outranking eggplant 👍 ")
+
+    assert doc[0].text == u'Peach'
+    assert doc[1].text == u'emoji'
+    assert doc[-1].text == u'👍'
+    assert doc[17:19].text == u'outranking eggplant'
+    assert list(doc.noun_chunks)[0].text == u'Peach emoji'
+
+    sentences = list(doc.sents)
+    assert len(sentences) == 3
+    assert sentences[1].text == u'Peach is the superior emoji.'
+
++infobox
+  | #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
+  | #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
+
++h(2, "examples-pos-tags") Get part-of-speech tags and flags
+  +tag-model("tagger")
+
++code.
+    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
+    apple = doc[0]
+    assert [apple.pos_, apple.pos] == [u'PROPN', 94]
+    assert [apple.tag_, apple.tag] == [u'NNP', 475]
+    assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
+    assert apple.is_alpha == True
+    assert apple.is_punct == False
+
+    billion = doc[10]
+    assert billion.is_digit == False
+    assert billion.like_num == True
+    assert billion.like_email == False
+
++h(2, "examples-integer-ids") Use integer IDs for any string
+
++code.
+    hello_id = nlp.vocab.strings['Hello']
+    hello_str = nlp.vocab.strings[hello_id]
+    assert hello_id == 3125
+    assert hello_str == 'Hello'
+
++h(2, "examples-entities") Recognise and update named entities
+  +tag-model("NER")
+
++code. 
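+    # Added note (not part of the original example): doc.ents is a tuple of
+    # Span objects, and it can also be overwritten manually, as shown below.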
+ doc = nlp(u'San Francisco considers banning sidewalk delivery robots') + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'San Francisco', 0, 13, u'GPE')] + + from spacy.tokens import Span + doc = nlp(u'Netflix is hiring a new VP of global policy') + doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] + ents = [(e.start_char, e.end_char, e.label_) for ent in doc.ents] + assert ents == [(0, 7, u'ORG')] + ++infobox + | #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition] + ++h(2, "displacy") Visualize a dependency parse and named entities in your browser + +tag-model("dependency parse", "NER") + ++code. + from spacy import displacy + + doc_dep = nlp(u'This is a sentence.') + displacy.serve(doc_dep, style='dep') + + doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at ' + u'Google in 2007, few people outside of the company took him seriously.') + displacy.serve(doc_ent, style='ent') + ++infobox + | #[strong API:] #[+api("displacy") #[code displacy]] + | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] + ++h(2, "examples-word-vectors") Word vectors + +tag-model("word vectors") + ++code. + doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") + apple = doc[0] + banana = doc[2] + pasta = doc[6] + hippo = doc[8] + assert apple.similarity(banana) > pasta.similarity(hippo) + ++infobox + | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] + ++h(2, "examples-serialization") Simple and efficient serialization +code. import spacy - en_nlp = spacy.load('en') - de_nlp = spacy.load('de') - en_doc = en_nlp(u'Hello, world. Here are two sentences.') - de_doc = de_nlp(u'ich bin ein Berliner.') + from spacy.tokens.doc import Doc -+h(2, "displacy-dep") Visualize a dependency parse in your browser + nlp = spacy.load('en') + moby_dick = open('moby_dick.txt', 'r') + doc = nlp(moby_dick) + doc.to_disk('/moby_dick.bin') -+code. - from spacy import displacy + new_doc = Doc().from_disk('/moby_dick.bin') - doc = nlp(u'This is a sentence.') - displacy.serve(doc, style='dep') - -+h(2, "displacy-ent") Visualize named entities in your browser - -+code. - from spacy import displacy - - doc = nlp(u'When Sebastian Thrun started working on self-driving cars at ' - u'Google in 2007, few people outside of the company took him seriously.') - displacy.serve(doc, style='ent') ++infobox + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "multi-threaded") Multi-threaded generator @@ -52,37 +150,25 @@ p if i == 100: break -+h(2, "examples-tokens-sentences") Get tokens and sentences ++infobox + | #[strong API:] #[+api("doc") #[code Doc]] + | #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage] + ++h(2, "examples-dependencies") Get syntactic dependencies + +tag-model("dependency parse") +code. - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.' + def dependency_labels_to_root(token): + """Walk up the syntactic tree, collecting the arc labels.""" + dep_labels = [] + while token.head is not token: + dep_labels.append(token.dep) + token = token.head + return dep_labels -+h(2, "examples-integer-ids") Use integer IDs for any string - -+code. 
- hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - -+h(2, "examples-string-views-flags") Get and set string views and flags - -+code. - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' ++infobox + | #[strong API:] #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse] +h(2, "examples-numpy-arrays") Export to numpy arrays @@ -97,70 +183,6 @@ p assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] -+h(2, "examples-word-vectors") Word vectors - -+code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - -+h(2, "examples-pos-tags") Part-of-speech tags - -+code. - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. - NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - -+h(2, "examples-dependencies") Syntactic dependencies - -+code. - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - -+h(2, "examples-entities") Named entities - -+code. - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(lambda: defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - +h(2, "examples-inline") Calculate inline mark-up on original string +code. @@ -187,17 +209,3 @@ p string = string.replace('\n', '') string = string.replace('\t', ' ') return string - -+h(2, "examples-binary") Efficient binary serialization - -+code. 
- import spacy - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.load('en') - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) From dcb10da61596aa2249882e7d7ca8a404fb33c6ea Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:15:56 +0200 Subject: [PATCH 49/51] Update and fix lightning tour examples --- website/docs/usage/lightning-tour.jade | 50 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index a946beb55..473f10c5e 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -101,15 +101,15 @@ p doc_dep = nlp(u'This is a sentence.') displacy.serve(doc_dep, style='dep') - doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at ' - u'Google in 2007, few people outside of the company took him seriously.') + doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google ' + u'in 2007, few people outside of the company took him seriously.') displacy.serve(doc_ent, style='ent') +infobox | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] -+h(2, "examples-word-vectors") Word vectors ++h(2, "examples-word-vectors") Get word vectors and similarity +tag-model("word vectors") +code. @@ -119,6 +119,7 @@ p pasta = doc[6] hippo = doc[8] assert apple.similarity(banana) > pasta.similarity(hippo) + assert apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector +infobox | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] @@ -139,6 +140,23 @@ p +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] ++h(2, "rule-matcher") Match text with token rules + ++code. + import spacy + from spacy.matcher import Matcher + + nlp = spacy.load('en') + matcher = Matcher(nlp.vocab) + # match "Google I/O" or "Google i/o" + pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + matcher.add('GoogleIO', None, pattern) + matches = nlp(LOTS_OF TEXT) + ++infobox + | #[strong API:] #[+api("matcher") #[code Matcher]] + | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] + +h(2, "multi-threaded") Multi-threaded generator +code. @@ -183,28 +201,24 @@ p assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] -+h(2, "examples-inline") Calculate inline mark-up on original string ++h(2, "examples-inline") Calculate inline markup on original string +code. def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
tags,

tags, etc.) - ''' + """Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. All whitespace is + preserved, outside of the spans. (Of course, HTML won't display more than + one whitespace character it – but the point is, no information is lost + and you can calculate what you need, e.g. <br />, <p> etc.) + """ output = [] - template = '{word}{space}' + html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: - output.append(token.orth_) + output.append(token.text) else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) + classes = ' '.join(get_classes(token)) + output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') From b2324be3e90d40f9442d326763d8dd9622603562 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:17:21 +0200 Subject: [PATCH 50/51] Fix typos, text, examples and formatting --- website/docs/usage/_data.json | 2 +- website/docs/usage/_spacy-101/_pipelines.jade | 4 +- website/docs/usage/_spacy-101/_pos-deps.jade | 2 +- .../docs/usage/_spacy-101/_serialization.jade | 5 ++ .../docs/usage/_spacy-101/_tokenization.jade | 10 ++-- .../docs/usage/_spacy-101/_word-vectors.jade | 2 +- website/docs/usage/entity-recognition.jade | 2 +- .../usage/language-processing-pipeline.jade | 3 +- website/docs/usage/production-use.jade | 8 +-- website/docs/usage/saving-loading.jade | 2 +- website/docs/usage/spacy-101.jade | 6 +++ website/docs/usage/visualizers.jade | 50 +++++++++---------- 12 files changed, 51 insertions(+), 45 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 9f51df5c4..a611151b3 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -7,7 +7,7 @@ "Lightning tour": "lightning-tour", "What's new in v2.0": "v2" }, - "Workflows": { + "Guides": { "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index d984a4708..db095ef04 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -2,9 +2,9 @@ p | When you call #[code nlp] on a text, spaCy first tokenizes the text to - | produce a #[code Doc] object. The #[code Doc] is the processed in several + | produce a #[code Doc] object. The #[code Doc] is then processed in several | different steps – this is also referred to as the - | #[strong processing pipeline]. The pipeline used by our + | #[strong processing pipeline]. The pipeline used by the | #[+a("/docs/usage/models") default models] consists of a | vectorizer, a tagger, a parser and an entity recognizer. 
Each pipeline | component returns the processed #[code Doc], which is then passed on to diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade index 5aa719c23..b42847aee 100644 --- a/website/docs/usage/_spacy-101/_pos-deps.jade +++ b/website/docs/usage/_spacy-101/_pos-deps.jade @@ -28,7 +28,7 @@ p | #[strong Text:] The original word text.#[br] | #[strong Lemma:] The base form of the word.#[br] | #[strong POS:] The simple part-of-speech tag.#[br] - | #[strong Tag:] ...#[br] + | #[strong Tag:] The detailed part-of-speech tag.#[br] | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br] | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br] | #[strong is alpha:] Is the token an alpha character?#[br] diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade index b6a889014..f3926dd9c 100644 --- a/website/docs/usage/_spacy-101/_serialization.jade +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -33,3 +33,8 @@ p +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) + ++code. + moby_dick = open('moby_dick.txt', 'r') # open a large document + doc = nlp(moby_dick) # process it + doc.to_disk('/moby_dick.bin') # save the processed Doc diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 28fd448b4..64e3f5881 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -2,11 +2,11 @@ p | During processing, spaCy first #[strong tokenizes] the text, i.e. - | segments it into words, punctuation and so on. For example, punctuation - | at the end of a sentence should be split off – whereas "U.K." should - | remain one token. This is done by applying rules specific to each - | language. Each #[code Doc] consists of individual tokens, and we can - | simply iterate over them: + | segments it into words, punctuation and so on. This is done by applying + | rules specific to each language. For example, punctuation at the end of a + | sentence should be split off – whereas "U.K." should remain one token. + | Each #[code Doc] consists of individual tokens, and we can simply iterate + | over them: +code. for token in doc: diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade index 4ed8e4c78..cbb9d06f2 100644 --- a/website/docs/usage/_spacy-101/_word-vectors.jade +++ b/website/docs/usage/_spacy-101/_word-vectors.jade @@ -6,7 +6,7 @@ p | vectors can be generated using an algorithm like | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's | #[+a("/docs/usage/models") default models] come with - | #[strong 300-dimensional vectors], that look like this: + | #[strong 300-dimensional vectors] that look like this: +code("banana.vector", false, false, 250). 
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index bcad07baa..527c14dde 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -52,7 +52,7 @@ p assert ent_san == [u'San', u'B', u'GPE'] assert ent_francisco == [u'Francisco', u'I', u'GPE'] -+table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"]) ++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"]) - var style = [0, 1, 1, 1, 1, 0] +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style) +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 8bb92caae..948212d82 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -344,8 +344,7 @@ p | Since spaCy v2.0 comes with better support for customising the | processing pipeline components, the #[code parser], #[code tagger] | and #[code entity] keyword arguments have been replaced with - | #[code disable], which takes a list of - | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | #[code disable], which takes a list of pipeline component names. | This lets you disable both default and custom components when loading | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade index 68a313d8a..c7f872c6d 100644 --- a/website/docs/usage/production-use.jade +++ b/website/docs/usage/production-use.jade @@ -2,16 +2,12 @@ include ../../_includes/_mixins -p - | Once you have loaded the #[code nlp] object, you can call it as though - | it were a function. This allows you to process a single unicode string. - +h(2, "multithreading") Multi-threading with #[code .pipe()] p | If you have a sequence of documents to process, you should use the - | #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()] - | method takes an iterator of texts, and accumulates an internal buffer, + | #[+api("language#pipe") #[code .pipe()]] method. The method takes an + | iterator of texts, and accumulates an internal buffer, | which it works on in parallel. It then yields the documents in order, | one-by-one. After a long and bitter struggle, the global interpreter | lock was freed around spaCy's main parsing loop in v0.100.3. This means diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 413b86477..477db925c 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -209,5 +209,5 @@ p | spaCy v2.0 solves this with a clear distinction between setting up | the instance and loading the data. - +code-new nlp = English.from_disk('/path/to/data') + +code-new nlp = English().from_disk('/path/to/data') +code-old nlp = spacy.load('en', path='/path/to/data') diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index f8779b52f..47d49ad40 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -81,6 +81,12 @@ p nlp = spacy.load('en') doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') +p + | Even though a #[code Doc] is processed – e.g. 
split into individual words + | and annotated – it still holds #[strong all information of the original text], + | like whitespace characters. This way, you'll never lose any information + | when processing text with spaCy. + +h(3, "annotations-token") Tokenization include _spacy-101/_tokenization diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 385fa0fd0..90a343700 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -180,8 +180,8 @@ p p | If you don't need the web server and just want to generate the markup | – for example, to export it to a file or serve it in a custom - | way – you can use #[+api("displacy#render") #[code displacy.render]] - | instead. It works the same, but returns a string containing the markup. + | way – you can use #[+api("displacy#render") #[code displacy.render]]. + | It works the same way, but returns a string containing the markup. +code("Example"). import spacy @@ -220,10 +220,32 @@ p | a standalone graphic.) So instead of rendering all #[code Doc]s at one, | loop over them and export them separately. + ++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses + ++code("Example"). + import spacy + from spacy import displacy + from pathlib import Path + + nlp = spacy.load('en') + sentences = ["This is an example.", "This is another one."] + for sent in sentences: + doc = nlp(sentence) + svg = displacy.render(doc, style='dep') + file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' + output_path = Path('/images/' + file_name) + output_path.open('w', encoding='utf-8').write(svg) + +p + | The above code will generate the dependency visualizations and them to + | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. + + +h(2, "jupyter") Using displaCy in Jupyter notebooks p - | displaCy is able to detect whether you're within a + | displaCy is able to detect whether you're working in a | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup | that can be rendered in a cell straight away. When you export your | notebook, the visualizations will be included as HTML. @@ -257,28 +279,6 @@ p html = displacy.render(doc, style='dep') return display(HTML(html)) -+h(2, "examples") Usage examples - -+h(3, "examples-export-svg") Export SVG graphics of dependency parses - -+code("Example"). - import spacy - from spacy import displacy - from pathlib import Path - - nlp = spacy.load('en') - sentences = ["This is an example.", "This is another one."] - for sent in sentences: - doc = nlp(sentence) - svg = displacy.render(doc, style='dep') - file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' - output_path = Path('/images/' + file_name) - output_path.open('w', encoding='utf-8').write(svg) - -p - | The above code will generate the dependency visualizations and them to - | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. 
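One detail in the SVG export example added above: the loop iterates with "for sent in sentences:" but the body calls nlp(sentence), so the snippet as written appears to raise a NameError. A minimal corrected sketch of what the example presumably intends (same displacy API, only the loop variable made consistent; the '/images/' output directory is kept from the example and assumed to exist) is:

    import spacy
    from spacy import displacy
    from pathlib import Path

    nlp = spacy.load('en')
    sentences = ["This is an example.", "This is another one."]
    for sentence in sentences:
        # process each raw string into a Doc, then render its dependency parse as SVG markup
        doc = nlp(sentence)
        svg = displacy.render(doc, style='dep')
        # build a file name from the non-punctuation tokens, e.g. 'This-is-an-example.svg'
        file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg'
        output_path = Path('/images/' + file_name)
        output_path.open('w', encoding='utf-8').write(svg)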
- +h(2, "manual-usage") Rendering data manually p From 9063654a1ad2dd2b9b04f39b34ccf5395953f4b9 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:18:02 +0200 Subject: [PATCH 51/51] Add Training 101 stub --- website/docs/usage/_spacy-101/_training.jade | 3 +++ website/docs/usage/spacy-101.jade | 4 ++++ website/docs/usage/training.jade | 4 ++++ 3 files changed, 11 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_training.jade diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade new file mode 100644 index 000000000..59861434c --- /dev/null +++ b/website/docs/usage/_spacy-101/_training.jade @@ -0,0 +1,3 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > TRAINING + +p diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 47d49ad40..9373f182a 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -117,6 +117,10 @@ include _spacy-101/_pipelines include _spacy-101/_serialization ++h(2, "training") Training + +include _spacy-101/_training + +h(2, "architecture") Architecture +image diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..9df71851a 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -6,6 +6,10 @@ p | Once the model is trained, you can then | #[+a("/docs/usage/saving-loading") save and load] it. ++h(2, "101") Training 101 + +include _spacy-101/_training + +h(2, "train-pos-tagger") Training the part-of-speech tagger +code.
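The series closes with the empty Training 101 stub and the "Training the part-of-speech tagger" heading in training.jade; the code block under that heading falls outside the hunk shown above. For orientation, a rough sketch of the v1.x-style tagger update that such a section typically described might look like the following. The names used here (the tag_map argument to Vocab, Tagger, GoldParse, tagger.update, tagger.model.end_training) are assumptions about the spaCy API of that period, not content of these patches:

    from spacy.vocab import Vocab
    from spacy.tagger import Tagger
    from spacy.tokens import Doc
    from spacy.gold import GoldParse

    # a tiny vocab with a tag map, so tags can be mapped to coarse-grained POS
    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
    tagger = Tagger(vocab)

    # one training example: a Doc plus gold-standard tags for its words
    doc = Doc(vocab, words=['I', 'like', 'stuff'])
    gold = GoldParse(doc, tags=['N', 'V', 'N'])
    tagger.update(doc, gold)

    # finalize the averaged weights once all updates are done
    tagger.model.end_training()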