From 5b67bcbee0887d11f421456dddb02bba7dacfa64 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 May 2017 15:20:16 -0500 Subject: [PATCH 001/118] Increase default embed size to 7500 --- spacy/pipeline.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index af71b1ad6..7ca2ed99d 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -43,7 +43,7 @@ class TokenVectorEncoder(object): name = 'tok2vec' @classmethod - def Model(cls, width=128, embed_size=5000, **cfg): + def Model(cls, width=128, embed_size=7500, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. From 620df0414fa167b1c8f3cf935e29f93d52368746 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 May 2017 15:20:45 -0500 Subject: [PATCH 002/118] Fix dropout in parser --- spacy/syntax/nn_parser.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 5b7752abb..6f23a08b5 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -249,11 +249,13 @@ cdef class Parser: with Model.use_device('cpu'): if depth == 0: upper = chain() + upper.is_noop = True else: upper = chain( clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class)) + zero_init(Affine(nr_class, drop_factor=0.0)) ) + upper.is_noop = False # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -364,7 +366,7 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = hasattr(vec2scores, 'W') + cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): if not has_hidden: for i in cython.parallel.prange( @@ -426,7 +428,7 @@ cdef class Parser: states = self.moves.init_batch(docs) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, - drop) + 0.0) todo = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] @@ -438,11 +440,14 @@ cdef class Parser: states, golds = zip(*todo) token_ids = self.get_token_ids(states) - vector, bp_vector = state2vec.begin_update(token_ids, drop=drop) + vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) + mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) + vector *= mask scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) d_vector = bp_scores(d_scores, sgd=sgd) + d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): From 05761e1750e3bd31ef19839abd3415e9ebf3a601 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:11:38 +0200 Subject: [PATCH 003/118] Allow size on procon icon --- website/_includes/_mixins-base.jade | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index 5a7a535c9..c42994e8f 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -42,10 +42,11 @@ mixin icon(name, size) //- Pro/Con/Neutral icon icon - [string] "pro", "con" or "neutral" (default: "neutral") + size - [integer] icon size (optional) -mixin procon(icon) +mixin procon(icon, size) - colors = { pro: "green", con: "red", neutral: "yellow" } - +icon(icon)(class="u-color-#{colors[icon] || 'subtle'}" 
aria-label=icon)&attributes(attributes) + +icon(icon, size)(class="u-color-#{colors[icon] || 'subtle'}" aria-label=icon)&attributes(attributes) //- Headlines Helper Mixin From 7e5163402e7bcbc09507484261c00501dc646de3 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:26 +0200 Subject: [PATCH 004/118] Allow clipping code block to height and add docs --- website/_includes/_mixins.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index f9960b71f..250865884 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -103,9 +103,11 @@ mixin button(url, trusted, ...style) label - [string] aside title (optional or false for no label) language - [string] language for syntax highlighting (default: "python") supports basic relevant languages available for PrismJS + icon - [string] icon to display next to code block, mostly used for old/new + height - [integer] optional height to clip code block to -mixin code(label, language, icon) - pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes) +mixin code(label, language, icon, height) + pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label From 00ede349dc02a4fc73aa06de7e9243fa0ba8a717 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:37 +0200 Subject: [PATCH 005/118] Add table row for linguistic annotations --- website/_includes/_mixins.jade | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 250865884..f815d9c4a 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -352,7 +352,22 @@ mixin pos-row(tag, pos, morph, desc) | #[code=m] +cell.u-text-small=desc + mixin dep-row(label, desc) +row +cell #[code=label] +cell=desc + + +//- Table rows for linguistic annotations + annots [array] - array of cell content + style [array] array of 1 (display as code) or 0 (display as text) + +mixin annotation-row(annots, style) + +row + for cell, i in annots + if style && style[i] + - cell = (typeof(cell) != 'boolean') ? cell : cell ? 'True' : 'False' + +cell #[code=cell] + else + +cell=cell From 0a8a2d2f6dcc2f10a6b684f42b71d9eeefb9a3b3 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:13:51 +0200 Subject: [PATCH 006/118] Remove tip infoboxes from annotation docs --- website/docs/api/_annotation/_dep-labels.jade | 5 ----- website/docs/api/_annotation/_named-entities.jade | 5 ----- website/docs/api/_annotation/_pos-tags.jade | 5 ----- 3 files changed, 15 deletions(-) diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade index 9e1e89324..427b2f53a 100644 --- a/website/docs/api/_annotation/_dep-labels.jade +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > DEPENDENCY LABELS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a label. For example, - | #[code spacy.explain("prt")] will return "particle". 
- +h(3, "dependency-parsing-english") English dependency labels p diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 68b3bd17d..476659d4a 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > NAMED ENTITIES -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of an entity label. For example, - | #[code spacy.explain("LANGUAGE")] will return "any named language". - +table([ "Type", "Description" ]) +row +cell #[code PERSON] diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3ceef777..ea3a225bf 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,10 +1,5 @@ //- πŸ’« DOCS > API > ANNOTATION > POS TAGS -+infobox("Tip") - | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the - | description for the string representation of a tag. For example, - | #[code spacy.explain("RB")] will return "adverb". - +h(3, "pos-tagging-english") English part-of-speech tag scheme p From c8bde2161cf199665d2a2e9eab87ecbb2af53a39 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:14:02 +0200 Subject: [PATCH 007/118] Add kwargs to spacy.load --- website/docs/api/spacy.jade | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index da8c97b9c..6ad88c1a8 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -33,6 +33,11 @@ p +cell unicode or #[code Path] +cell Model to load, i.e. shortcut link, package name or path. + +row + +cell #[code **overrides] + +cell - + +cell Override or disable components. + +footrow +cell returns +cell #[code Language] From 6ef09d7ed8957c46ac90afb065f2da06662f03ac Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:31 +0200 Subject: [PATCH 008/118] Change save_to_directory to to_disk --- website/docs/usage/saving-loading.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index c4eb08f04..b11007683 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -3,11 +3,11 @@ include ../../_includes/_mixins p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the - | #[+api("language#save_to_directory") #[code Language.save_to_directory()]] + | #[+api("language#to_disk") #[code Language.to_disk()]] | method: +code. - nlp.save_to_directory('/home/me/data/en_example_model') + nlp.to_disk('/home/me/data/en_example_model') p | The directory will be created if it doesn't exist, and the whole pipeline From 3aff8834344071974503d7a9b819260161273448 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:39 +0200 Subject: [PATCH 009/118] Add displaCy examples to lightning tour --- website/docs/usage/lightning-tour.jade | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 967d0c61e..24654b853 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -24,6 +24,23 @@ p en_doc = en_nlp(u'Hello, world. 
Here are two sentences.') de_doc = de_nlp(u'ich bin ein Berliner.') ++h(2, "displacy-dep") Visualize a dependency parse in your browser + ++code. + from spacy import displacy + + doc = nlp(u'This is a sentence.') + displacy.serve(doc, style='dep') + ++h(2, "displacy-ent") Visualize named entities in your browser + ++code. + from spacy import displacy + + doc = nlp(u'When Sebastian Thrun started working on self-driving cars at ' + u'Google in 2007, few people outside of the company took him seriously.') + displacy.serve(doc, style='ent') + +h(2, "multi-threaded") Multi-threaded generator +code. From 786af87ffbc4f6dd98ec149c074c8cbd60fa9a6b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:15:50 +0200 Subject: [PATCH 010/118] Update IOB docs --- website/docs/api/token.jade | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 9be41081c..744446ec2 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -338,8 +338,10 @@ p The L2 norm of the token's vector representation. +cell #[code ent_iob] +cell int +cell - | IOB code of named entity tag. - | #[code 1="I", 2="O", 3="B"]. #[code 0] means no tag is assigned. + | IOB code of named entity tag. #[code "B"] + | means the token begins an entity, #[code "I"] means it is inside + | an entity, #[code "O"] means it is outside an entity, and + | #[code ""] means no entity tag is set. +row +cell #[code ent_iob_] From a38393e2f624b6c58806acdc18015329e75542d5 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:16:17 +0200 Subject: [PATCH 011/118] Update annotation docs --- website/docs/api/annotation.jade | 38 +++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index bc723b5c6..048e69897 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -14,11 +14,12 @@ p | (#[code ' ']) is included as a token. +aside-code("Example"). - from spacy.en import English - nlp = English(parser=False) + from spacy.lang.en import English + nlp = English() tokens = nlp('Some\nspaces and\ttab characters') - print([t.orth_ for t in tokens]) - # ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] + tokens_text = [t.text for t in tokens] + assert tokens_text == ['Some', '\n', 'spaces', ' ', 'and', + '\t', 'tab', 'characters'] p | The whitespace tokens are useful for much the same reason punctuation is @@ -38,6 +39,11 @@ p +h(2, "pos-tagging") Part-of-speech Tagging ++aside("Tip: Understanding tags") + | You can also use #[code spacy.explain()] to get the escription for the + | string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". + include _annotation/_pos-tags +h(2, "lemmatization") Lemmatization @@ -50,25 +56,35 @@ p A "lemma" is the uninflected form of a word. In English, this means: +item #[strong Nouns]: The form like "dog", not "dogs"; like "child", not "children" +item #[strong Verbs]: The form like "write", not "writes", "writing", "wrote" or "written" -+aside("About spaCy's custom pronoun lemma") - | Unlike verbs and common nouns, there's no clear base form of a personal - | pronoun. Should the lemma of "me" be "I", or should we normalize person - | as well, giving "it" β€” or maybe "he"? spaCy's solution is to introduce a - | novel symbol, #[code.u-nowrap -PRON-], which is used as the lemma for - | all personal pronouns. 
- p | The lemmatization data is taken from | #[+a("https://wordnet.princeton.edu") WordNet]. However, we also add a | special case for pronouns: all pronouns are lemmatized to the special | token #[code -PRON-]. ++infobox("About spaCy's custom pronoun lemma") + | Unlike verbs and common nouns, there's no clear base form of a personal + | pronoun. Should the lemma of "me" be "I", or should we normalize person + | as well, giving "it" β€” or maybe "he"? spaCy's solution is to introduce a + | novel symbol, #[code -PRON-], which is used as the lemma for + | all personal pronouns. + +h(2, "dependency-parsing") Syntactic Dependency Parsing ++aside("Tip: Understanding labels") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition ++aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". + include _annotation/_named-entities +h(3, "biluo") BILUO Scheme From 3523715d52a318329f238e0bc6d3f14ebf248533 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:16:31 +0200 Subject: [PATCH 012/118] Add spaCy 101 components --- .../usage/_spacy-101/_named-entities.jade | 38 +++++ website/docs/usage/_spacy-101/_pos-deps.jade | 62 +++++++ .../docs/usage/_spacy-101/_similarity.jade | 44 +++++ .../docs/usage/_spacy-101/_tokenization.jade | 18 +++ .../docs/usage/_spacy-101/_word-vectors.jade | 152 ++++++++++++++++++ 5 files changed, 314 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_named-entities.jade create mode 100644 website/docs/usage/_spacy-101/_pos-deps.jade create mode 100644 website/docs/usage/_spacy-101/_similarity.jade create mode 100644 website/docs/usage/_spacy-101/_tokenization.jade create mode 100644 website/docs/usage/_spacy-101/_word-vectors.jade diff --git a/website/docs/usage/_spacy-101/_named-entities.jade b/website/docs/usage/_spacy-101/_named-entities.jade new file mode 100644 index 000000000..a3c539564 --- /dev/null +++ b/website/docs/usage/_spacy-101/_named-entities.jade @@ -0,0 +1,38 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > NAMED ENTITIES + +p + | A named entity is a "real-world object" that's assigned a name – for + | example, a person, a country, a product or a book title. spaCy can + | #[strong recognise] #[+a("/docs/api/annotation#named-entities") various types] + | of named entities in a document, by asking the model for a + | #[strong prediction]. Because models are statistical and strongly depend + | on the examples they were trained on, this doesn't always work + | #[em perfectly] and might need some tuning later, depending on your use + | case. + +p + | Named entities are available as the #[code ents] property of a #[code Doc]: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for ent in doc.ents: + print(ent.text, ent.start_char, ent.end_char, ent.label_) + ++aside + | #[strong Text]: The original entity text.#[br] + | #[strong Start]: Index of start of entity in the #[code Doc].#[br] + | #[strong End]: Index of end of entity in the #[code Doc].#[br] + | #[strong Label]: Entity label, i.e. type. 
+ ++table(["Text", "Start", "End", "Label", "Description"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Apple", 0, 5, "ORG", "Companies, agencies, institutions."], style) + +annotation-row(["U.K.", 27, 31, "GPE", "Geopolitical entity, i.e. countries, cities, states."], style) + +annotation-row(["$1 billion", 44, 54, "MONEY", "Monetary values, including unit."], style) + +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its named entities look like: + ++codepen("2f2ad1408ff79fc6a326ea3aedbb353b", 160) diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade new file mode 100644 index 000000000..5aa719c23 --- /dev/null +++ b/website/docs/usage/_spacy-101/_pos-deps.jade @@ -0,0 +1,62 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING + +p + | After tokenization, spaCy can also #[strong parse] and #[strong tag] a + | given #[code Doc]. This is where the statistical model comes in, which + | enables spaCy to #[strong make a prediction] of which tag or label most + | likely applies in this context. A model consists of binary data and is + | produced by showing a system enough examples for it to make predictions + | that generalise across the language – for example, a word following "the" + | in English is most likely a noun. + +p + | Linguistic annotations are available as + | #[+api("token#attributes") #[code Token] attributes]. Like many NLP + | libraries, spaCy #[strong encodes all strings to integers] to reduce + | memory usage and improve efficiency. So to get the readable string + | representation of an attribute, we need to add an underscore #[code _] + | to its name: + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + + for token in doc: + print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, + token.shape_, token.is_alpha, token.is_stop) + ++aside + | #[strong Text:] The original word text.#[br] + | #[strong Lemma:] The base form of the word.#[br] + | #[strong POS:] The simple part-of-speech tag.#[br] + | #[strong Tag:] ...#[br] + | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br] + | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br] + | #[strong is alpha:] Is the token an alpha character?#[br] + | #[strong is stop:] Is the token part of a stop list, i.e. 
the most common + | words of the language?#[br] + ++table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"]) + - var style = [0, 0, 1, 1, 1, 1, 1, 1] + +annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style) + +annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style) + +annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style) + +annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style) + +annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style) + +annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style) + +annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style) + +annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style) + +annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style) + +annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style) + +annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style) + ++aside("Tip: Understanding tags and labels") + | Most of the tags and labels look pretty abstract, and they vary between + | languages. #[code spacy.explain()] will show you a short description – + | for example, #[code spacy.explain("VBZ")] returns "verb, 3rd person + | singular present". + +p + | Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer], + | here's what our example sentence and its dependencies look like: + ++codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460) diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade new file mode 100644 index 000000000..c99bc9658 --- /dev/null +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -0,0 +1,44 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > SIMILARITY + +p + | spaCy is able to compare two objects, and make a prediction of + | #[strong how similar they are]. Predicting similarity is useful for + | building recommendation systems or flagging duplicates. For example, you + | can suggest a user content that's similar to what they're currently + | looking at, or label a support ticket as a duplicate, if it's very + | similar to an already existing one. + +p + | Each #[code Doc], #[code Span] and #[code Token] comes with a + | #[+api("token#similarity") #[code .similarity()]] method that lets you + | compare it with another object, and determine the similarity. Of course + | similarity is always subjective – whether "dog" and "cat" are similar + | really depends on how you're looking at it. spaCy's similarity model + | usually assumes a pretty general-purpose definition of similarity. + ++code. + tokens = nlp(u'dog cat banana') + + for token1 in tokens: + for token2 in tokens: + print(token1.similarity(token2)) + ++aside + | #[strong #[+procon("neutral", 16)] similarity:] identical#[br] + | #[strong #[+procon("pro", 16)] similarity:] similar (higher is more similar) #[br] + | #[strong #[+procon("con", 16)] similarity:] dissimilar (lower is less similar) + ++table(["", "dog", "cat", "banana"]) + each cells, label in {"dog": [1.00, 0.80, 0.24], "cat": [0.80, 1.00, 0.28], "banana": [0.24, 0.28, 1.00]} + +row + +cell.u-text-label.u-color-theme=label + for cell in cells + +cell #[code=cell.toFixed(2)] + | #[+procon(cell < 0.5 ? "con" : cell != 1 ? "pro" : "neutral")] + +p + | In this case, the model's predictions are pretty on point. 
A dog is very + | similar to a cat, whereas a banana is not very similar to either of them. + | Identical tokens are obviously 100% similar to each other (just not always + | exactly #[code 1.0], because of vector math and floating point + | imprecisions). diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade new file mode 100644 index 000000000..28fd448b4 --- /dev/null +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -0,0 +1,18 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > TOKENIZATION + +p + | During processing, spaCy first #[strong tokenizes] the text, i.e. + | segments it into words, punctuation and so on. For example, punctuation + | at the end of a sentence should be split off – whereas "U.K." should + | remain one token. This is done by applying rules specific to each + | language. Each #[code Doc] consists of individual tokens, and we can + | simply iterate over them: + ++code. + for token in doc: + print(token.text) + ++table([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).u-text-center + +row + for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] + +cell=cell diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade new file mode 100644 index 000000000..4ed8e4c78 --- /dev/null +++ b/website/docs/usage/_spacy-101/_word-vectors.jade @@ -0,0 +1,152 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > WORD VECTORS + +p + | Similarity is determined by comparing #[strong word vectors] or "word + | embeddings", multi-dimensional meaning representations of a word. Word + | vectors can be generated using an algorithm like + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's + | #[+a("/docs/usage/models") default models] come with + | #[strong 300-dimensional vectors], that look like this: + ++code("banana.vector", false, false, 250). 
+ array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, + 3.28450017e-02, -4.19569999e-01, 7.20689967e-02, + -3.74760002e-01, 5.74599989e-02, -1.24009997e-02, + 5.29489994e-01, -5.23800015e-01, -1.97710007e-01, + -3.41470003e-01, 5.33169985e-01, -2.53309999e-02, + 1.73800007e-01, 1.67720005e-01, 8.39839995e-01, + 5.51070012e-02, 1.05470002e-01, 3.78719985e-01, + 2.42750004e-01, 1.47449998e-02, 5.59509993e-01, + 1.25210002e-01, -6.75960004e-01, 3.58420014e-01, + -4.00279984e-02, 9.59490016e-02, -5.06900012e-01, + -8.53179991e-02, 1.79800004e-01, 3.38669986e-01, + 1.32300004e-01, 3.10209990e-01, 2.18779996e-01, + 1.68530002e-01, 1.98740005e-01, -5.73849976e-01, + -1.06490001e-01, 2.66689986e-01, 1.28380001e-01, + -1.28030002e-01, -1.32839993e-01, 1.26570001e-01, + 8.67229998e-01, 9.67210010e-02, 4.83060002e-01, + 2.12709993e-01, -5.49900010e-02, -8.24249983e-02, + 2.24079996e-01, 2.39749998e-01, -6.22599982e-02, + 6.21940017e-01, -5.98999977e-01, 4.32009995e-01, + 2.81430006e-01, 3.38420011e-02, -4.88150001e-01, + -2.13589996e-01, 2.74010003e-01, 2.40950003e-01, + 4.59500015e-01, -1.86049998e-01, -1.04970002e+00, + -9.73049998e-02, -1.89080000e-01, -7.09290028e-01, + 4.01950002e-01, -1.87680006e-01, 5.16870022e-01, + 1.25200003e-01, 8.41499984e-01, 1.20970003e-01, + 8.82389992e-02, -2.91959997e-02, 1.21510006e-03, + 5.68250008e-02, -2.74210006e-01, 2.55640000e-01, + 6.97930008e-02, -2.22580001e-01, -3.60060006e-01, + -2.24020004e-01, -5.36990017e-02, 1.20220006e+00, + 5.45350015e-01, -5.79980016e-01, 1.09049998e-01, + 4.21669990e-01, 2.06619993e-01, 1.29360005e-01, + -4.14570011e-02, -6.67770028e-01, 4.04670000e-01, + -1.52179999e-02, -2.76400000e-01, -1.56110004e-01, + -7.91980028e-02, 4.00369987e-02, -1.29439995e-01, + -2.40900001e-04, -2.67850012e-01, -3.81150007e-01, + -9.72450018e-01, 3.17259997e-01, -4.39509988e-01, + 4.19340014e-01, 1.83530003e-01, -1.52600005e-01, + -1.08080000e-01, -1.03579998e+00, 7.62170032e-02, + 1.65189996e-01, 2.65259994e-04, 1.66160002e-01, + -1.52810007e-01, 1.81229994e-01, 7.02740014e-01, + 5.79559989e-03, 5.16639985e-02, -5.97449988e-02, + -2.75510013e-01, -3.90489995e-01, 6.11319989e-02, + 5.54300010e-01, -8.79969969e-02, -4.16810006e-01, + 3.28260005e-01, -5.25489986e-01, -4.42880005e-01, + 8.21829960e-03, 2.44859993e-01, -2.29819998e-01, + -3.49810004e-01, 2.68940002e-01, 3.91660005e-01, + -4.19039994e-01, 1.61909997e-01, -2.62630010e+00, + 6.41340017e-01, 3.97430003e-01, -1.28680006e-01, + -3.19460005e-01, -2.56330013e-01, -1.22199997e-01, + 3.22750002e-01, -7.99330026e-02, -1.53479993e-01, + 3.15050006e-01, 3.05909991e-01, 2.60120004e-01, + 1.85530007e-01, -2.40429997e-01, 4.28860001e-02, + 4.06219989e-01, -2.42559999e-01, 6.38700008e-01, + 6.99829996e-01, -1.40430003e-01, 2.52090007e-01, + 4.89840001e-01, -6.10670000e-02, -3.67659986e-01, + -5.50890028e-01, -3.82649988e-01, -2.08430007e-01, + 2.28320003e-01, 5.12179971e-01, 2.78679997e-01, + 4.76520002e-01, 4.79510017e-02, -3.40079993e-01, + -3.28729987e-01, -4.19669986e-01, -7.54989982e-02, + -3.89539987e-01, -2.96219997e-02, -3.40700001e-01, + 2.21699998e-01, -6.28560036e-02, -5.19029975e-01, + -3.77739996e-01, -4.34770016e-03, -5.83010018e-01, + -8.75459984e-02, -2.39289999e-01, -2.47109994e-01, + -2.58870006e-01, -2.98940003e-01, 1.37150005e-01, + 2.98919994e-02, 3.65439989e-02, -4.96650010e-01, + -1.81600004e-01, 5.29389977e-01, 2.19919994e-01, + -4.45140004e-01, 3.77979994e-01, -5.70620000e-01, + -4.69460003e-02, 8.18059966e-02, 1.92789994e-02, + 3.32459986e-01, -1.46200001e-01, 
1.71560004e-01, + 3.99809986e-01, 3.62170011e-01, 1.28160000e-01, + 3.16439986e-01, 3.75690013e-01, -7.46899992e-02, + -4.84800003e-02, -3.14009994e-01, -1.92860007e-01, + -3.12940001e-01, -1.75529998e-02, -1.75139993e-01, + -2.75870003e-02, -1.00000000e+00, 1.83870003e-01, + 8.14339995e-01, -1.89129993e-01, 5.09989977e-01, + -9.19600017e-03, -1.92950002e-03, 2.81890005e-01, + 2.72470005e-02, 4.34089988e-01, -5.49669981e-01, + -9.74259973e-02, -2.45399997e-01, -1.72030002e-01, + -8.86500031e-02, -3.02980006e-01, -1.35910004e-01, + -2.77649999e-01, 3.12860007e-03, 2.05559999e-01, + -1.57720000e-01, -5.23079991e-01, -6.47010028e-01, + -3.70139986e-01, 6.93930015e-02, 1.14009999e-01, + 2.75940001e-01, -1.38750002e-01, -2.72680014e-01, + 6.68910027e-01, -5.64539991e-02, 2.40170002e-01, + -2.67300010e-01, 2.98599988e-01, 1.00830004e-01, + 5.55920005e-01, 3.28489989e-01, 7.68579990e-02, + 1.55279994e-01, 2.56359994e-01, -1.07720003e-01, + -1.23590000e-01, 1.18270002e-01, -9.90289971e-02, + -3.43279988e-01, 1.15019999e-01, -3.78080010e-01, + -3.90120000e-02, -3.45930010e-01, -1.94040000e-01, + -3.35799992e-01, -6.23340011e-02, 2.89189994e-01, + 2.80319989e-01, -5.37410021e-01, 6.27939999e-01, + 5.69549985e-02, 6.21469975e-01, -2.52819985e-01, + 4.16700006e-01, -1.01079997e-02, -2.54339993e-01, + 4.00029987e-01, 4.24320012e-01, 2.26720005e-01, + 1.75530002e-01, 2.30489999e-01, 2.83230007e-01, + 1.38820007e-01, 3.12180002e-03, 1.70570001e-01, + 3.66849989e-01, 2.52470002e-03, -6.40089989e-01, + -2.97650009e-01, 7.89430022e-01, 3.31680000e-01, + -1.19659996e+00, -4.71559986e-02, 5.31750023e-01], dtype=float32) + +p + | The #[code .vector] attribute will return an object's vector. + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] will default to an average + | of their token vectors. You can also check if a token has a vector + | assigned, and get the L2 norm, which can be used to normalise + | vectors. + ++code. + tokens = nlp(u'dog cat banana sasquatch') + + for token in tokens: + print(token.text, token.has_vector, token.vector_norm, token.is_oov) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong has vector]: Does the token have a vector representation?#[br] + | #[strong Vector norm]: The L2 norm of the token's vector (the square root + | of the sum of the values squared)#[br] + | #[strong is OOV]: Is the word out-of-vocabulary? + ++table(["Text", "Has vector", "Vector norm", "OOV"]) + - var style = [0, 1, 1, 1] + +annotation-row(["dog", true, 7.033672992262838, false], style) + +annotation-row(["cat", true, 6.68081871208896, false], style) + +annotation-row(["banana", true, 6.700014292148571, false], style) + +annotation-row(["sasquatch", false, 0, true], style) + +p + | The words "dog", "cat" and "banana" are all pretty common in English, so + | they're part of the model's vocabulary, and come with a vector. The word + | "sasquatch" on the other hand is a lot less common and out-of-vocabulary + | – so its vector representation consists of 300 dimensions of #[code 0], + | which means it's practically nonexistent. + +p + | If your application will benefit from a large vocabulary with more + | vectors, you should consider using one of the + | #[+a("/docs/usage/models#available") larger models] instead of the default, + | smaller ones, which usually come with a clipped vocabulary. 
From a433e5012a901bb47ffc34fadb0af2514171b289 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:16:44 +0200 Subject: [PATCH 013/118] Update adding languages docs --- website/docs/usage/adding-languages.jade | 43 ++++++++---------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d1cb1887c..f77acdf24 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -436,6 +436,8 @@ p +h(3, "morph-rules") Morph rules +//- TODO: write morph rules section + +h(2, "testing") Testing the new language tokenizer p @@ -626,37 +628,20 @@ p | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. | The #[code vectors.bin] file should consist of one word and vector per line. -+h(2, "model-directory") Setting up a model directory - -p - | Once you've collected the word frequencies, Brown clusters and word - | vectors files, you can use the - | #[+a("/docs/usage/cli#model") #[code model] command] to create a data - | directory: - -+code(false, "bash"). - python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] - +aside-code("your_data_directory", "yaml"). β”œβ”€β”€ vocab/ - | β”œβ”€β”€ lexemes.bin # via nlp.vocab.dump(path) - | β”œβ”€β”€ strings.json # via nlp.vocab.strings.dump(file_) - | └── oov_prob # optional - β”œβ”€β”€ pos/ # optional - | β”œβ”€β”€ model # via nlp.tagger.model.dump(path) - | └── config.json # via Langage.train - β”œβ”€β”€ deps/ # optional - | β”œβ”€β”€ model # via nlp.parser.model.dump(path) - | └── config.json # via Langage.train - └── ner/ # optional - β”œβ”€β”€ model # via nlp.entity.model.dump(path) - └── config.json # via Langage.train - -p - | This creates a spaCy data directory with a vocabulary model, ready to be - | loaded. By default, the command expects to be able to find your language - | class using #[code spacy.util.get_lang_class(lang_id)]. - + | β”œβ”€β”€ lexemes.bin + | β”œβ”€β”€ strings.json + | └── oov_prob + β”œβ”€β”€ pos/ + | β”œβ”€β”€ model + | └── config.json + β”œβ”€β”€ deps/ + | β”œβ”€β”€ model + | └── config.json + └── ner/ + β”œβ”€β”€ model + └── config.json +h(2, "train-tagger-parser") Training the tagger and parser From 1c06ef35427e5b495eab09a4d165bcec588bdead Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:25 +0200 Subject: [PATCH 014/118] Update spaCy architecture --- website/docs/usage/spacy-101.jade | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index daace114b..06f88ace2 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -8,3 +8,85 @@ include ../../_includes/_mixins include ../../assets/img/docs/architecture.svg .u-text-right +button("/assets/img/docs/architecture.svg", false, "secondary").u-text-tag View large graphic + ++table(["Name", "Description"]) + +row + +cell #[+api("language") #[code Language]] + +cell + | A text-processing pipeline. Usually you'll load this once per + | process as #[code nlp] and pass the instance around your application. + + +row + +cell #[+api("doc") #[code Doc]] + +cell A container for accessing linguistic annotations. + + +row + +cell #[+api("span") #[code Span]] + +cell A slice from a #[code Doc] object. + + +row + +cell #[+api("token") #[code Token]] + +cell + | An individual token β€” i.e. a word, punctuation symbol, whitespace, + | etc. 
+ + +row + +cell #[+api("lexeme") #[code Lexeme]] + +cell + | An entry in the vocabulary. It's a word type with no context, as + | opposed to a word token. It therefore has no part-of-speech tag, + | dependency parse etc. + + +row + +cell #[+api("vocab") #[code Vocab]] + +cell + | A lookup table for the vocabulary that allows you to access + | #[code Lexeme] objects. + + +row + +cell #[code Morphology] + +cell + + +row + +cell #[+api("stringstore") #[code StringStore]] + +cell Map strings to and from integer IDs. + + +row + +row + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell + | Segment text, and create #[code Doc] objects with the discovered + | segment boundaries. + + +row + +cell #[+api("tagger") #[code Tagger]] + +cell Annotate part-of-speech tags on #[code Doc] objects. + + +row + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell Annotate syntactic dependencies on #[code Doc] objects. + + +row + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell + | Annotate named entities, e.g. persons or products, on #[code Doc] + | objects. + + +row + +cell #[+api("matcher") #[code Matcher]] + +cell + | Match sequences of tokens, based on pattern rules, similar to + | regular expressions. + ++h(3, "architecture-other") Other + ++table(["Name", "Description"]) + +row + +cell #[+api("goldparse") #[code GoldParse]] + +cell Collection for training annotations. + + +row + +cell #[+api("goldcorpus") #[code GoldCorpus]] + +cell + | An annotated corpus, using the JSON file format. Manages + | annotations for tagging, dependency parsing and NER. From 61cf2bba5518fa97009631b46f8bc2bca7a9a9c6 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:37 +0200 Subject: [PATCH 015/118] Fix code example --- website/docs/usage/visualizers.jade | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 93a4b5567..fe779add9 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -314,3 +314,4 @@ p 'text': 'But Google is starting from behind.', 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None + } From 43258d6b0a3e0c265c873d6e7e41bb62ca331cf2 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:17:57 +0200 Subject: [PATCH 016/118] Update NER workflow --- website/docs/usage/entity-recognition.jade | 205 ++++++++++++--------- 1 file changed, 116 insertions(+), 89 deletions(-) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 2c3116b82..bcad07baa 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -9,14 +9,12 @@ p | locations, organizations and products. You can add arbitrary classes to | the entity recognition system, and update the model with new examples. -+aside-code("Example"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'London is a big city in the United Kingdom.') - for ent in doc.ents: - print(ent.label_, ent.text) - # GPE London - # GPE United Kingdom ++h(2, "101") Named Entity Recognition 101 + +tag-model("named entities") + +include _spacy-101/_named-entities + ++h(2, "accessing") Accessing entity annotations p | The standard way to access entity annotations is the @@ -26,56 +24,89 @@ p | #[code ent.label] and #[code ent.label_]. The #[code Span] object acts | as a sequence of tokens, so you can iterate over the entity or index into | it. 
You can also get the text form of the whole entity, as though it were - | a single token. See the #[+api("span") API reference] for more details. + | a single token. p - | You can access token entity annotations using the #[code token.ent_iob] - | and #[code token.ent_type] attributes. The #[code token.ent_iob] - | attribute indicates whether an entity starts, continues or ends on the - | tag (In, Begin, Out). + | You can also access token entity annotations using the + | #[+api("token#attributes") #[code token.ent_iob]] and + | #[+api("token#attributes") #[code token.ent_type]] attributes. + | #[code token.ent_iob] indicates whether an entity starts, continues or + | ends on the tag. If no entity type is set on a token, it will return an + | empty string. + ++aside("IOB Scheme") + | #[code I] – Token is inside an entity.#[br] + | #[code O] – Token is outside an entity.#[br] + | #[code B] – Token is the beginning of an entity.#[br] +code("Example"). - doc = nlp(u'London is a big city in the United Kingdom.') - print(doc[0].text, doc[0].ent_iob, doc[0].ent_type_) - # (u'London', 2, u'GPE') - print(doc[1].text, doc[1].ent_iob, doc[1].ent_type_) - # (u'is', 3, u'') + doc = nlp(u'San Francisco considers banning sidewalk delivery robots') + + # document level + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'San Francisco', 0, 13, u'GPE')] + + # token level + ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_] + ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_] + assert ent_san == [u'San', u'B', u'GPE'] + assert ent_francisco == [u'Francisco', u'I', u'GPE'] + ++table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"]) + - var style = [0, 1, 1, 1, 1, 0] + +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style) + +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style) + +annotation-row(["considers", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["banning", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["sidewalk", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["delivery", 2, "O", 0, '""', "outside an entity"], style) + +annotation-row(["robots", 2, "O", 0, '""', "outside an entity"], style) +h(2, "setting") Setting entity annotations p | To ensure that the sequence of token annotations remains consistent, you - | have to set entity annotations at the document level β€” you can't write - | directly to the #[code token.ent_iob] or #[code token.ent_type] - | attributes. The easiest way to set entities is to assign to the - | #[code doc.ents] attribute. + | have to set entity annotations #[strong at the document level]. However, + | you can't write directly to the #[code token.ent_iob] or + | #[code token.ent_type] attributes, so the easiest way to set entities is + | to assign to the #[+api("doc#ents") #[code doc.ents]] attribute + | and create the new entity as a #[+api("span") #[code Span]]. +code("Example"). 
- doc = nlp(u'London is a big city in the United Kingdom.') - doc.ents = [] - assert doc[0].ent_type_ == '' - doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])] - assert doc[0].ent_type_ == 'GPE' - doc.ents = [] - doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)] + from spacy.tokens import Span + + doc = nlp(u'Netflix is hiring a new VP of global policy') + # the model didn't recognise any entities :( + + ORG = doc.vocab.strings[u'ORG'] # get integer ID of entity label + netflix_ent = Span(doc, 0, 1, label=ORG) # create a Span for the new entity + doc.ents = [netflix_ent] + + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents = [(u'Netflix', 0, 7, u'ORG')] p - | The value you assign should be a sequence, the values of which - | can either be #[code Span] objects, or #[code (ent_id, ent_type, start, end)] - | tuples, where #[code start] and #[code end] are token offsets that - | describe the slice of the document that should be annotated. + | Keep in mind that you need to create a #[code Span] with the start and + | end index of the #[strong token], not the start and end index of the + | entity in the document. In this case, "Netflix" is token #[code (0, 1)] – + | but at the document level, the entity will have the start and end + | indices #[code (0, 7)]. + ++h(3, "setting-from-array") Setting entity annotations from array p - | You can also assign entity annotations using the #[code doc.from_array()] - | method. To do this, you should include both the #[code ENT_TYPE] and the - | #[code ENT_IOB] attributes in the array you're importing from. + | You can also assign entity annotations using the + | #[+api("doc#from_array") #[code doc.from_array()]] method. To do this, + | you should include both the #[code ENT_TYPE] and the #[code ENT_IOB] + | attributes in the array you're importing from. -+code("Example"). - from spacy.attrs import ENT_IOB, ENT_TYPE ++code. import numpy + from spacy.attrs import ENT_IOB, ENT_TYPE doc = nlp.make_doc(u'London is a big city in the United Kingdom.') assert list(doc.ents) == [] + header = [ENT_IOB, ENT_TYPE] attr_array = numpy.zeros((len(doc), len(header))) attr_array[0, 0] = 2 # B @@ -83,12 +114,14 @@ p doc.from_array(header, attr_array) assert list(doc.ents)[0].text == u'London' ++h(3, "setting-cython") Setting entity annotations in Cython + p | Finally, you can always write to the underlying struct, if you compile - | a Cython function. This is easy to do, and allows you to write efficient - | native code. + | a #[+a("http://cython.org/") Cython] function. This is easy to do, and + | allows you to write efficient native code. -+code("Example"). ++code. # cython: infer_types=True from spacy.tokens.doc cimport Doc @@ -104,67 +137,30 @@ p | you'll have responsibility for ensuring that the data is left in a | consistent state. - -+h(2, "displacy") Visualizing named entities - -p - | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] - | lets you explore an entity recognition model's behaviour interactively. - | If you're training a model, it's very useful to run the visualization - | yourself. To help you do that, spaCy v2.0+ comes with a visualization - | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to - | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to - | run the web server, or #[+api("displacy#render") #[code displacy.render]] - | to generate the raw markup. 
- -p - | For more details and examples, see the - | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. - -+code("Named Entity example"). - import spacy - from spacy import displacy - - text = """But Google is starting from behind. The company made a late push - into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa - software, which runs on its Echo and Dot devices, have clear leads in - consumer adoption.""" - - nlp = spacy.load('custom_ner_model') - doc = nlp(text) - displacy.serve(doc, style='ent') - -+codepen("a73f8b68f9af3157855962b283b364e4", 345) - +h(2, "entity-types") Built-in entity types -include ../api/_annotation/_named-entities ++aside("Tip: Understanding entity types") + | You can also use #[code spacy.explain()] to get the description for the + | string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". -+aside("Install") - | The #[+api("load") #[code spacy.load()]] function configures a pipeline that - | includes all of the available annotators for the given ID. In the example - | above, the #[code 'en'] ID tells spaCy to load the default English - | pipeline. If you have installed the data with - | #[code python -m spacy download en], this will include the entity - | recognition model. +include ../api/_annotation/_named-entities +h(2, "updating") Training and updating p | To provide training examples to the entity recogniser, you'll first need - | to create an instance of the #[code GoldParse] class. You can specify - | your annotations in a stand-off format or as token tags. + | to create an instance of the #[+api("goldparse") #[code GoldParse]] class. + | You can specify your annotations in a stand-off format or as token tags. +code. - import spacy import random + import spacy from spacy.gold import GoldParse - from spacy.language import EntityRecognizer + from spacy.pipeline import EntityRecognizer - train_data = [ - ('Who is Chaka Khan?', [(7, 17, 'PERSON')]), - ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]) - ] + train_data = [('Who is Chaka Khan?', [(7, 17, 'PERSON')]), + ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])] nlp = spacy.load('en', entity=False, parser=False) ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC']) @@ -237,3 +233,34 @@ p | loss, via the #[+a("http://www.aclweb.org/anthology/C12-1059") dynamic oracle] | imitation learning strategy. The transition system is equivalent to the | BILOU tagging scheme. + ++h(2, "displacy") Visualizing named entities + +p + | The #[+a(DEMOS_URL + "/displacy-ent/") displaCy #[sup ENT] visualizer] + | lets you explore an entity recognition model's behaviour interactively. + | If you're training a model, it's very useful to run the visualization + | yourself. To help you do that, spaCy v2.0+ comes with a visualization + | module. Simply pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. + +p + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. + ++code("Named Entity example"). + import spacy + from spacy import displacy + + text = """But Google is starting from behind. 
The company made a late push + into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa + software, which runs on its Echo and Dot devices, have clear leads in + consumer adoption.""" + + nlp = spacy.load('custom_ner_model') + doc = nlp(text) + displacy.serve(doc, style='ent') + ++codepen("a73f8b68f9af3157855962b283b364e4", 345) From b6209e24271bcc141c21168e4592a5063e8bc2f2 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:18:08 +0200 Subject: [PATCH 017/118] Update POS tagging workflow --- website/docs/usage/pos-tagging.jade | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index cded00b6c..245156b77 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -7,22 +7,12 @@ p | assigned to each token in the document. They're useful in rule-based | processes. They can also be useful features in some statistical models. -p - | To use spaCy's tagger, you need to have a data pack installed that - | includes a tagging model. Tagging models are included in the data - | downloads for English and German. After you load the model, the tagger - | is applied automatically, as part of the default pipeline. You can then - | access the tags using the #[+api("token") #[code Token.tag]] and - | #[+api("token") #[code token.pos]] attributes. For English, the tagger - | also triggers some simple rule-based morphological processing, which - | gives you the lemma as well. ++h(2, "101") Part-of-speech tagging 101 + +tag-model("dependency parse") -+code("Usage"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'They told us to duck.') - for word in doc: - print(word.text, word.lemma, word.lemma_, word.tag, word.tag_, word.pos, word.pos_) +include _spacy-101/_pos-deps + ++aside("Help – spaCy's output is wrong!") +h(2, "rule-based-morphology") Rule-based morphology @@ -63,7 +53,8 @@ p +list("numbers") +item - | The tokenizer consults a #[strong mapping table] + | The tokenizer consults a + | #[+a("/docs/usage/adding-languages#tokenizer-exceptions") mapping table] | #[code TOKENIZER_EXCEPTIONS], which allows sequences of characters | to be mapped to multiple tokens. Each token may be assigned a part | of speech and one or more morphological features. @@ -77,8 +68,9 @@ p +item | For words whose POS is not set by a prior process, a - | #[strong mapping table] #[code TAG_MAP] maps the tags to a - | part-of-speech and a set of morphological features. + | #[+a("/docs/usage/adding-languages#tag-map") mapping table] + | #[code TAG_MAP] maps the tags to a part-of-speech and a set of + | morphological features. 
+item | Finally, a #[strong rule-based deterministic lemmatizer] maps the From b6c62baab39e54c78b75104e0f2ec532ad3e69b8 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:18:53 +0200 Subject: [PATCH 018/118] Update What's new in v2 docs --- website/docs/usage/v2.jade | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 8faae9d32..d3941bba0 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -55,7 +55,23 @@ p | #[strong API:] #[+api("spacy#load") #[code spacy.load]] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] -+h(3, "features-language") Improved language data and processing pipelines ++h(3, "features-language") Improved language data and lazy loading + +p + | Language-specfic data now lives in its own submodule, #[code spacy.lang]. + | Languages are lazy-loaded, i.e. only loaded when you import a + | #[code Language] class, or load a model that initialises one. This allows + | languages to contain more custom data, e.g. lemmatizer lookup tables, or + | complex regular expressions. The language data has also been tidied up + | and simplified. It's now also possible to overwrite the functions that + | compute lexical attributes like #[code like_num], and supply + | language-specific syntax iterators, e.g. to determine noun chunks. + ++infobox + | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] + | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] + ++h(3, "features-pipelines") Improved processing pipelines +aside-code("Example"). from spacy.language import Language @@ -64,7 +80,7 @@ p +infobox | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] + | #[strong Usage:] #[+a("/docs/usage/processing-text") Processing text] +h(3, "features-lemmatizer") Simple lookup-based lemmatization @@ -95,7 +111,7 @@ p from spacy.matcher import Matcher from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', on_match=None, + matcher.add('HelloWorld', None, [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], [{LOWER: 'hello'}, {LOWER: 'world'}]) assert len(matcher) == 1 @@ -128,6 +144,18 @@ p +h(2, "incompat") Backwards incompatibilities +table(["Old", "New"]) + +row + +cell + | #[code spacy.en] + | #[code spacy.xx] + +cell + | #[code spacy.lang.en] + | #[code spacy.lang.xx] + + +row + +cell #[code spacy.orth] + +cell #[code spacy.lang.xx.lex_attrs] + +row +cell #[code Language.save_to_directory] +cell #[+api("language#to_disk") #[code Language.to_disk]] From af348025ecbe0229b016e341c1c9dc43625957f4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:19:09 +0200 Subject: [PATCH 019/118] Update word vectors & similarity workflow --- .../docs/usage/word-vectors-similarities.jade | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 3cc0a67a8..00e200f59 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -6,46 +6,40 @@ p | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. 
The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. - -+aside("Tip") - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. - -p - | spaCy makes using word vectors very easy. The - | #[+api("lexeme") #[code Lexeme]], #[+api("token") #[code Token]], - | #[+api("span") #[code Span]] and #[+api("doc") #[code Doc]] classes all - | have a #[code .vector] property, which is a 1-dimensional numpy array of - | 32-bit floats: - -+code. - import numpy - - apples, and_, oranges = nlp(u'apples and oranges') - print(apples.vector.shape) - # (1,) - apples.similarity(oranges) - -p - | By default, #[code Token.vector] returns the vector for its underlying - | lexeme, while #[code Doc.vector] and #[code Span.vector] return an - | average of the vectors of their tokens. You can customize these - | behaviours by modifying the #[code doc.user_hooks], - | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] - | dictionaries. - -+aside-code("Example"). - # TODO - -p - | The default English model installs vectors for one million vocabulary - | entries, using the 300-dimensional vectors trained on the Common Crawl + | family of algorithms. The default + | #[+a("/docs/usage/models#available") English model] installs + | 300-dimensional vectors trained on the Common Crawl | corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] | algorithm. The GloVe common crawl vectors have become a de facto | standard for practical NLP. -+aside-code("Example"). ++aside("Tip: Training a word2vec model") + | If you need to train a word2vec model, we recommend the implementation in + | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + ++h(2, "101") Similarity and word vectors 101 + +tag-model("vectors") + +include _spacy-101/_similarity +include _spacy-101/_word-vectors + + ++h(2, "custom") Customising word vectors + +p + | By default, #[+api("token#vector") #[code Token.vector]] returns the + | vector for its underlying #[+api("lexeme") #[code Lexeme]], while + | #[+api("doc#vector") #[code Doc.vector]] and + | #[+api("span#vector") #[code Span.vector]] return an average of the + | vectors of their tokens. + +p + | You can customize these + | behaviours by modifying the #[code doc.user_hooks], + | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] + | dictionaries. + ++code("Example"). # TODO p @@ -56,11 +50,14 @@ p | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a | path to a binary file written by #[code vocab.dump_vectors()]. -+aside-code("Example"). ++code("Example"). # TODO p - | You can also load vectors from memory, by writing to the #[code lexeme.vector] - | property. If the vectors you are writing are of different dimensionality + | You can also load vectors from memory by writing to the + | #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors + | you are writing are of different dimensionality | from the ones currently loaded, you should first call | #[code vocab.resize_vectors(new_size)]. 
+ ++h(2, "similarity") Similarity From fe24267948c75759f774130bb63c27fc3cf539ee Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:19:20 +0200 Subject: [PATCH 020/118] Update usage docs meta and navigation --- website/docs/usage/_data.json | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 8eca16a8c..45daa8381 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -3,13 +3,13 @@ "Get started": { "Installation": "./", "Models": "models", + "spaCy 101": "spacy-101", "Lightning tour": "lightning-tour", "Visualizers": "visualizers", "Troubleshooting": "troubleshooting", "What's new in v2.0": "v2" }, "Workflows": { - "spaCy 101": "spacy-101", "Loading the pipeline": "language-processing-pipeline", "Processing text": "processing-text", "spaCy's data model": "data-model", @@ -44,13 +44,18 @@ "models": { "title": "Models", - "next": "lightning-tour", + "next": "spacy-101", "quickstart": true }, + "spacy-101": { + "title": "spaCy 101", + "next": "lightning-tour" + }, + "lightning-tour": { "title": "Lightning tour", - "next": "spacy-101" + "next": "visualizers" }, "visualizers": { @@ -66,10 +71,6 @@ "title": "Resources" }, - "spacy-101": { - "title": "spaCy 101" - }, - "language-processing-pipeline": { "title": "Loading a language processing pipeline", "next": "processing-text" @@ -95,7 +96,7 @@ }, "entity-recognition": { - "title": "Entity recognition", + "title": "Named Entity Recognition", "next": "rule-based-matching" }, From 9ed6b48a49c289af307388e304f2a8ff2a25254a Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:34:39 +0200 Subject: [PATCH 021/118] Update dependency parse workflow --- website/docs/usage/dependency-parse.jade | 205 +++++++++++++++-------- 1 file changed, 132 insertions(+), 73 deletions(-) diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index 904522bd4..abfa1f825 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -8,55 +8,80 @@ p | boundary detection, and lets you iterate over base noun phrases, or | "chunks". -+aside-code("Example"). - import spacy - nlp = spacy.load('en') - doc = nlp(u'I like green eggs and ham.') - for np in doc.noun_chunks: - print(np.text, np.root.text, np.root.dep_, np.root.head.text) - # I I nsubj like - # green eggs eggs dobj like - # ham ham conj eggs - p | You can check whether a #[+api("doc") #[code Doc]] object has been | parsed with the #[code doc.is_parsed] attribute, which returns a boolean | value. If this attribute is #[code False], the default sentence iterator | will raise an exception. -+h(2, "displacy") The displaCy visualizer ++h(2, "noun-chunks") Noun chunks + +tag-model("dependency parse") -p - | The best way to understand spaCy's dependency parser is interactively, - | through the #[+a(DEMOS_URL + "/displacy", true) displaCy visualizer]. If - | you want to know how to write rules that hook into some type of syntactic - | construction, just plug the sentence into the visualizer and see how - | spaCy annotates it. +p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor. + ++code("Example"). 
+ nlp = spacy.load('en') + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for chunk in doc.noun_chunks: + print(chunk.text, chunk.root.text, chunk.root.dep_, + chunk.root.head.text) + ++aside + | #[strong Text:] The original noun chunk text.#[br] + | #[strong Root text:] ...#[br] + | #[strong Root dep:] ...#[br] + | #[strong Root head text:] ...#[br] + ++table(["Text", "root.text", "root.dep_", "root.head.text"]) + - var style = [0, 0, 1, 0] + +annotation-row(["Autonomous cars", "cars", "nsubj", "shift"], style) + +annotation-row(["insurance liability", "liability", "dobj", "shift"], style) + +annotation-row(["manufacturers", "manufacturers", "pobj", "toward"], style) +h(2, "navigating") Navigating the parse tree p - | spaCy uses the terms #[em head] and #[em child] to describe the words - | connected by a single arc in the dependency tree. The term #[em dep] is - | used for the arc label, which describes the type of syntactic relation - | that connects the child to the head. As with other attributes, the value - | of #[code token.dep] is an integer. You can get the string value with - | #[code token.dep_]. + | spaCy uses the terms #[strong head] and #[strong child] to describe the words + | #[strong connected by a single arc] in the dependency tree. The term + | #[strong dep] is used for the arc label, which describes the type of + | syntactic relation that connects the child to the head. As with other + | attributes, the value of #[code .dep] is an integer. You can get + | the string value with #[code .dep_]. -+aside-code("Example"). - from spacy.symbols import det - the, dog = nlp(u'the dog') - assert the.dep == det - assert the.dep_ == 'det' ++code("Example"). + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + for token in doc: + print(token.text, token.dep_, token.head.text, token.head.pos_, + [child for child in token.children]) + ++aside + | #[strong Text]: The original token text.#[br] + | #[strong Dep]: The syntactic relation connecting child to head.#[br] + | #[strong Head text]: The original text of the token head.#[br] + | #[strong Head POS]: The part-of-speech tag of the token head.#[br] + | #[strong Children]: ... + ++table(["Text", "Dep", "Head text", "Head POS", "Children"]) + - var style = [0, 1, 0, 1, 0] + +annotation-row(["Autonomous", "amod", "cars", "NOUN", ""], style) + +annotation-row(["cars", "nsubj", "shift", "VERB", "Autonomous"], style) + +annotation-row(["shift", "ROOT", "shift", "VERB", "cars, liability"], style) + +annotation-row(["insurance", "compound", "liability", "NOUN", ""], style) + +annotation-row(["liability", "dobj", "shift", "VERB", "insurance, toward"], style) + +annotation-row(["toward", "prep", "liability", "NOUN", "manufacturers"], style) + +annotation-row(["manufacturers", "pobj", "toward", "ADP", ""], style) + ++codepen("dcf8d293367ca185b935ed2ca11ebedd", 370) p - | Because the syntactic relations form a tree, every word has exactly one - | head. You can therefore iterate over the arcs in the tree by iterating - | over the words in the sentence. This is usually the best way to match an - | arc of interest β€” from below: + | Because the syntactic relations form a tree, every word has + | #[strong exactly one head]. You can therefore iterate over the arcs in + | the tree by iterating over the words in the sentence. This is usually + | the best way to match an arc of interest β€” from below: +code. 
from spacy.symbols import nsubj, VERB + # Finding a verb with a subject from below β€” good verbs = set() for possible_subject in doc: @@ -82,6 +107,8 @@ p | attribute, which provides a sequence of #[+api("token") #[code Token]] | objects. ++h(3, "navigating-around") Iterating around the local tree + p | A few more convenience attributes are provided for iterating around the | local tree from the token. The #[code .lefts] and #[code .rights] @@ -90,55 +117,89 @@ p | two integer-typed attributes, #[code .n_rights] and #[code .n_lefts], | that give the number of left and right children. -+aside-code("Examples"). - apples = nlp(u'bright red apples on the tree')[2] - print([w.text for w in apples.lefts]) - # ['bright', 'red'] - print([w.text for w in apples.rights]) - # ['on'] - assert apples.n_lefts == 2 - assert apples.n_rights == 1 - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests within 30 days.') - root = [w for w in doc if w.head is w][0] - subject = list(root.lefts)[0] - for descendant in subject.subtree: - assert subject.is_ancestor_of(descendant) - - from spacy.symbols import nsubj - doc = nlp(u'Credit and mortgage account holders must submit their requests.') - holders = doc[4] - span = doc[holders.left_edge.i : holders.right_edge.i + 1] - span.merge() - for word in doc: - print(word.text, word.pos_, word.dep_, word.head.text) - # Credit and mortgage account holders nsubj NOUN submit - # must VERB aux submit - # submit VERB ROOT submit - # their DET det requests - # requests NOUN dobj submit ++code. + doc = nlp(u'bright red apples on the tree') + assert [token.text for token in doc[2].lefts]) == [u'bright', u'red'] + assert [token.text for token in doc[2].rights]) == ['on'] + assert doc[2].n_lefts == 2 + assert doc[2].n_rights == 1 p | You can get a whole phrase by its syntactic head using the | #[code .subtree] attribute. This returns an ordered sequence of tokens. - | For the default English model, the parse tree is #[em projective], which - | means that there are no crossing brackets. The tokens returned by - | #[code .subtree] are therefore guaranteed to be contiguous. This is not - | true for the German model, which has many - | #[+a("https://explosion.ai/blog/german-model#word-order", true) non-projective dependencies]. | You can walk up the tree with the #[code .ancestors] attribute, and - | check dominance with the #[code .is_ancestor()] method. + | check dominance with the #[+api("token#is_ancestor") #[code .is_ancestor()]] + | method. + ++aside("Projective vs. non-projective") + | For the #[+a("/docs/usage/models#available") default English model], the + | parse tree is #[strong projective], which means that there are no crossing + | brackets. The tokens returned by #[code .subtree] are therefore guaranteed + | to be contiguous. This is not true for the German model, which has many + | #[+a(COMPANY_URL + "/blog/german-model#word-order", true) non-projective dependencies]. + ++code. 
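+    # Sketch of the attributes described above: walk the subtree of the
+    # sentence's subject and check dominance with .is_ancestor()
+    # (assumes the `nlp` object from the earlier examples)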
+ doc = nlp(u'Credit and mortgage account holders must submit their requests') + root = [token for token in doc if token.head is token][0] + subject = list(root.lefts)[0] + for descendant in subject.subtree: + assert subject.is_ancestor(descendant) + print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights, + [ancestor.text for ancestor in descendant.ancestors]) + ++table(["Text", "Dep", "n_lefts", "n_rights", "ancestors"]) + - var style = [0, 1, 1, 1, 0] + +annotation-row(["Credit", "nmod", 0, 2, "holders, submit"], style) + +annotation-row(["and", "cc", 0, 0, "Credit, holders, submit"], style) + +annotation-row(["mortgage", "compound", 0, 0, "account, Credit, holders, submit"], style) + +annotation-row(["account", "conj", 1, 0, "Credit, holders, submit"], style) + +annotation-row(["holders", "nsubj", 1, 0, "submit"], style) p - | Finally, I often find the #[code .left_edge] and #[code right_edge] - | attributes especially useful. They give you the first and last token + | Finally, the #[code .left_edge] and #[code .right_edge] attributes + | can be especially useful, because they give you the first and last token | of the subtree. This is the easiest way to create a #[code Span] object - | for a syntactic phrase β€” a useful operation. + | for a syntactic phrase. Note that #[code .right_edge] gives a token + | #[strong within] the subtree β€” so if you use it as the end-point of a + | range, don't forget to #[code +1]! + ++code. + doc = nlp(u'Credit and mortgage account holders must submit their requests') + span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1] + span.merge() + for token in doc: + print(token.text, token.pos_, token.dep_, token.head.text) + ++table(["Text", "POS", "Dep", "Head text"]) + - var style = [0, 1, 1, 0] + +annotation-row(["Credit and mortgage account holders", "NOUN", "nsubj", "submit"], style) + +annotation-row(["must", "VERB", "aux", "submit"], style) + +annotation-row(["submit", "VERB", "ROOT", "submit"], style) + +annotation-row(["their", "ADJ", "poss", "requests"], style) + +annotation-row(["requests", "NOUN", "dobj", "submit"], style) + ++h(2, "displacy") Visualizing dependencies p - | Note that #[code .right_edge] gives a token #[em within] the subtree β€” - | so if you use it as the end-point of a range, don't forget to #[code +1]! + | The best way to understand spaCy's dependency parser is interactively. + | To make this easier, spaCy v2.0+ comes with a visualization module. Simply + | pass a #[code Doc] or a list of #[code Doc] objects to + | displaCy and run #[+api("displacy#serve") #[code displacy.serve]] to + | run the web server, or #[+api("displacy#render") #[code displacy.render]] + | to generate the raw markup. If you want to know how to write rules that + | hook into some type of syntactic construction, just plug the sentence into + | the visualizer and see how spaCy annotates it. + ++code. + from spacy import displacy + + doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers') + displacy.serve(doc, style='dep') + ++infobox + | For more details and examples, see the + | #[+a("/docs/usage/visualizers") usage workflow on visualizing spaCy]. You + | can also test displaCy in our #[+a(DEMOS_URL + "/displacy", true) online demo]. +h(2, "disabling") Disabling the parser @@ -149,8 +210,6 @@ p | the parser from being loaded: +code. 
- import spacy - nlp = spacy.load('en', parser=False) p From 7ef7f0b42c98e395f9899bce5f0aef19b2ac1a17 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:37:51 +0200 Subject: [PATCH 022/118] Add linguistic annotations 101 content --- website/docs/usage/spacy-101.jade | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 06f88ace2..2507b9d94 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,6 +2,54 @@ include ../../_includes/_mixins ++h(2, "annotations") Linguistic annotations + +p + | spaCy provides a variety of linguistic annotations to give you insights + | into a text's grammatical structure. This includes the word types, + | i.e. the parts of speech, and how the words are related to each other. + | For example, if you're analysing text, it makes a #[em huge] difference + | whether a noun is the subject of a sentence, or the object – or whether + | "google" is used as a verb, or refers to the website or company in a + | specific context. + +p + | Once you've downloaded and installed a #[+a("/docs/usage/models") model], + | you can load it via #[+api("spacy#load") #[code spacy.load()]]. This will + | return a #[code Language] object contaning all components and data needed + | to process text. We usually call it #[code nlp]. Calling the #[code nlp] + | object on a string of text will return a processed #[code Doc]: + ++code. + import spacy + + nlp = spacy.load('en') + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + ++h(3, "annotations-token") Tokenization + +include _spacy-101/_tokenization + + ++h(3, "annotations-pos-deps") Part-of-speech tags and dependencies + +tag-model("dependency parse") + +include _spacy-101/_pos-deps + ++h(3, "annotations-ner") Named Entities + +tag-model("named entities") + +include _spacy-101/_named-entities + ++h(2, "vectors-similarity") Word vectors and similarity + +tag-model("vectors") + +include _spacy-101/_similarity + +include _spacy-101/_word-vectors + ++h(2, "pipelines") Pipelines + +h(2, "architecture") Architecture +image From e6d88dfe08a34aeef61c27c726a0b269257a2f0b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:38:33 +0200 Subject: [PATCH 023/118] Add features table to 101 --- website/docs/usage/spacy-101.jade | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 2507b9d94..4fb758bb4 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,6 +2,61 @@ include ../../_includes/_mixins ++h(2, "features") Features + ++aside + | If one of spaCy's functionalities #[strong needs a model], it means that + | you need to have one our the available + | #[+a("/docs/usage/models") statistical models] installed. Models are used + | to #[strong predict] linguistic annotations – for example, if a word is + | a verb or a noun. 
+ ++table(["Name", "Description", "Needs model"]) + +row + +cell #[strong Tokenization] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Part-of-speech Tagging] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Dependency Parsing] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Sentence Boundary Detection] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Named Entity Recongition] (NER) + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Rule-based Matching] + +cell + +cell #[+procon("con")] + + +row + +cell #[strong Similarity] + +cell + +cell #[+procon("pro")] + + +row + +cell #[strong Training] + +cell + +cell #[+procon("neutral")] + + +row + +cell #[strong Serialization] + +cell + +cell #[+procon("neutral")] + +h(2, "annotations") Linguistic annotations p From 4fb5fb7218dc81b78b0aa737d52bfba9b16b4297 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 23 May 2017 23:40:04 +0200 Subject: [PATCH 024/118] Update v2 docs --- website/docs/usage/v2.jade | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index d3941bba0..4a0e6ca2f 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -242,6 +242,79 @@ p +cell #[code Token.is_ancestor_of] +cell #[+api("token#is_ancestor") #[code Token.is_ancestor]] ++h(2, "migrating") Migrating from spaCy 1.x ++list + +item Saving, loading and serialization. + +item Processing pipelines and language data. + +item Adding patterns and callbacks to the matcher. + +item Models trained with spaCy 1.x. + ++infobox("Some tips") + | Before migrating, we strongly recommend writing a few + | #[strong simple tests] specific to how you're using spaCy in your + | application. This makes it easier to check whether your code requires + | changes, and if so, which parts are affected. + | (By the way, feel free contribute your tests to + | #[+src(gh("spaCy", "spacy/tests")) our test suite] – this will also ensure + | we never accidentally introduce a bug in a workflow that's + | important to you.) If you've trained your own models, keep in mind that + | your train and runtime inputs must match. This means you'll have to + | #[strong retrain your models] with spaCy v2.0 to make them compatible. + + ++h(3, "migrating-saving-loading") Saving, loading and serialization +h(2, "migrating") Migrating from spaCy 1.x +p + | Double-check all calls to #[code spacy.load()] and make sure they don't + | use the #[code path] keyword argument. + ++code-new nlp = spacy.load('/model') ++code-old nlp = spacy.load('en', path='/model') + +p + | Review all other code that writes state to disk or bytes. + | All containers, now share the same, consistent API for saving and + | loading. Replace saving with #[code to_disk()] or #[code to_bytes()], and + | loading with #[code from_disk()] and #[code from_bytes()]. + ++code-new. + nlp.to_disk('/model') + nlp.vocab.to_disk('/vocab') + ++code-old. + nlp.save_to_directory('/model') + nlp.vocab.dump('/vocab') + ++h(3, "migrating-languages") Processing pipelines and language data + +p + | If you're importing language data or #[code Language] classes, make sure + | to change your import statements to import from #[code spacy.lang]. If + | you've added your own custom language, it needs to be moved to + | #[code spacy/lang/xx]. + ++code-new from spacy.lang.en import English ++code-old from spacy.en import English + +p + | All components, e.g. 
tokenizer exceptions, are now responsible for + | compiling their data in the correct format. The language_data.py files + | have been removed + ++h(3, "migrating-matcher") Adding patterns and callbacks to the matcher + +p + | If you're using the matcher, you can now add patterns in one step. This + | should be easy to update – simply merge the ID, callback and patterns + | into one call to #[+api("matcher#add") #[code matcher.add]]. + ++code-new. + matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++code-old. + matcher.add_entity('GoogleNow', on_match=merge_phrases) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) + ++h(3, "migrating-models") Trained models From 697d3d7cb3e18c219d1bad037bcccf6dbea35fe3 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:36:38 +0200 Subject: [PATCH 025/118] Fix links to CLI docs --- website/docs/api/util.jade | 2 +- website/docs/usage/adding-languages.jade | 6 +++--- website/docs/usage/saving-loading.jade | 2 +- website/docs/usage/training-ner.jade | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index ed8b5d8e5..f14cdbb6d 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -225,7 +225,7 @@ p p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+a("/docs/usage/cli") CLI]. + | be used for interactive components like the #[+a("/docs/api/cli") CLI]. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index f77acdf24..7eadde4b6 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -535,7 +535,7 @@ p | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | script from the spaCy developer resources. Note that your corpus should | not be preprocessed (i.e. you need punctuation for example). The - | #[+a("/docs/usage/cli#model") #[code model] command] expects a + | #[+a("/docs/api/cli#model") #[code model]] command expects a | tab-separated word frequencies file with three columns: +list("numbers") @@ -651,13 +651,13 @@ p | If your corpus uses the | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | i.e. files with the extension #[code .conllu], you can use the - | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to + | #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. p | Once you have your UD corpus transformed into JSON, you can train your | model use the using spaCy's - | #[+a("/docs/usage/cli#train") #[code train] command]: + | #[+a("/docs/api/cli#train") #[code train]] command: +code(false, "bash"). python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index b11007683..3513e9505 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -28,7 +28,7 @@ p | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. 
For more info on this, see the - | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + | #[+a("/docs/api/cli#package") #[code package]] command documentation. +aside-code("meta.json", "json"). { diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 78eb4905e..4d864ac9d 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -77,8 +77,8 @@ p p | To make the model more convenient to deploy, we recommend wrapping it as | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] - | to create all required files and directories. + | module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] + | CLI command to create all required files and directories. +code(false, "bash"). python -m spacy package /home/me/data/en_technology /home/me/my_models From 990a70732a280f87dacd86c83d8cefbbe1e70a4b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:37:21 +0200 Subject: [PATCH 026/118] Move installation troubleshooting to installation docs --- website/docs/usage/index.jade | 130 ++++++++++++++++ website/docs/usage/models.jade | 2 +- website/docs/usage/troubleshooting.jade | 190 ------------------------ 3 files changed, 131 insertions(+), 191 deletions(-) delete mode 100644 website/docs/usage/troubleshooting.jade diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index da13f4d81..61398b431 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -175,6 +175,136 @@ p +cell Python 3.5+ +cell Visual Studio 2015 ++h(2, "troubleshooting") Troubleshooting guide + +p + | This section collects some of the most common errors you may come + | across when installing, loading and using spaCy, as well as their solutions. + ++aside("Help us improve this guide") + | Did you come across a problem like the ones listed here and want to + | share the solution? You can find the "Suggest edits" button at the + | bottom of this page that points you to the source. We always + | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! + ++h(3, "compatible-model") No compatible model found + ++code(false, "text"). + No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). + +p + | This usually means that the model you're trying to download does not + | exist, or isn't available for your version of spaCy. Check the + | #[+a(gh("spacy-models", "compatibility.json")) compatibility table] + | to see which models are available for your spaCy version. If you're using + | an old version, consider upgrading to the latest release. Note that while + | spaCy supports tokenization for + | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], + | not all of them come with statistical models. To only use the tokenizer, + | import the language's #[code Language] class instead, for example + | #[code from spacy.fr import French]. + ++h(3, "symlink-privilege") Symbolic link privilege not held + ++code(false, "text"). + OSError: symbolic link privilege not held + +p + | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you + | load models by name, spaCy creates a symbolic link in the + | #[code spacy/data] directory. This means your user needs permission to do + | this. The above error mostly occurs when doing a system-wide installation, + | which will create the symlinks in a system directory. 
Run the + | #[code download] or #[code link] command as administrator, or use a + | #[code virtualenv] to install spaCy in a user directory, instead + | of doing a system-wide installation. + ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. Run #[code pip install -U pip] to upgrade to + | the latest version of pip. To see which version you have installed, + | run #[code pip --version]. + ++h(3, "import-error") Import error + ++code(false, "text"). + Import Error: No module named spacy + +p + | This error means that the spaCy module can't be located on your system, or in + | your environment. Make sure you have spaCy installed. If you're using a + | #[code virtualenv], make sure it's activated and check that spaCy is + | installed in that environment – otherwise, you're trying to load a system + | installation. You can also run #[code which python] to find out where + | your Python executable is located. + ++h(3, "import-error-models") Import error: models + ++code(false, "text"). + ImportError: No module named 'en_core_web_sm' + +p + | As of spaCy v1.7, all models can be installed as Python packages. This means + | that they'll become importable modules of your application. When creating + | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try + | to import the model to load its meta data. If this fails, it's usually a + | sign that the package is not installed in the current environment. + | Run #[code pip list] or #[code pip freeze] to check which model packages + | you have installed, and install the + | #[+a("/docs/usage/models#available") correct models] if necessary. If you're + | importing a model manually at the top of a file, make sure to use the name + | of the package, not the shortcut link you've created. + ++h(3, "vocab-strings") File not found: vocab/strings.json + ++code(false, "text"). + FileNotFoundError: No such file or directory: [...]/vocab/strings.json + +p + | This error may occur when using #[code spacy.load()] to load + | a language model – either because you haven't set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it + | doesn't actually exist. Set up a + | #[+a("/docs/usage/models/#usage") shortcut link] for the model + | you want to load. This can either be an installed model package, or a + | local directory containing the model data. If you want to use one of the + | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for + | languages that don't yet have a statistical model, you should import its + | #[code Language] class instead, for example + | #[code from spacy.lang.bn import Bengali]. + ++h(3, "command-not-found") Command not found + ++code(false, "text"). + command not found: spacy + +p + | This error may occur when running the #[code spacy] command from the + | command line. spaCy does not currently add an entry to our #[code PATH] + | environment variable, as this can lead to unexpected results, especially + | when using #[code virtualenv]. Run the command with #[code python -m], + | for example #[code python -m spacy download en]. For more info on this, + | see the #[+a("/docs/api/cli#download") CLI documentation]. 
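p
  | For example – shown here only to illustrate the #[code python -m]
  | prefix – downloading a model and setting up a shortcut link would look
  | like this:

+code(false, "bash").
    python -m spacy download en
    python -m spacy link en_core_web_sm en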
+ ++h(3, "module-load") 'module' object has no attribute 'load' + ++code(false, "text"). + AttributeError: 'module' object has no attribute 'load' + +p + | While this could technically have many causes, including spaCy being + | broken, the most likely one is that your script's file or directory name + | is "shadowing" the module – e.g. your file is called #[code spacy.py], + | or a directory you're importing from is called #[code spacy]. So, when + | using spaCy, never call anything else #[code spacy]. + +h(2, "tests") Run tests p diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 2dec5197e..832ad8211 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -195,7 +195,7 @@ p | privileges, the #[code spacy link] command may fail. The easiest solution | is to re-run the command as admin, or use a #[code virtualenv]. For more | info on this, see the - | #[+a("/docs/usage/troubleshooting#symlink-privilege") troubleshooting guide]. + | #[+a("/docs/usage/#symlink-privilege") troubleshooting guide]. +h(3, "usage-import") Importing models as modules diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade deleted file mode 100644 index 501a250c8..000000000 --- a/website/docs/usage/troubleshooting.jade +++ /dev/null @@ -1,190 +0,0 @@ -//- πŸ’« DOCS > USAGE > TROUBLESHOOTING - -include ../../_includes/_mixins - -p - | This section collects some of the most common errors you may come - | across when installing, loading and using spaCy, as well as their solutions. - -+aside("Help us improve this guide") - | Did you come across a problem like the ones listed here and want to - | share the solution? You can find the "Suggest edits" button at the - | bottom of this page that points you to the source. We always - | appreciate #[+a(gh("spaCy") + "/pulls") pull requests]! - -+h(2, "install-loading") Installation and loading - -+h(3, "compatible-model") No compatible model found - -+code(false, "text"). - No compatible model found for [lang] (spaCy v#{SPACY_VERSION}). - -p - | This usually means that the model you're trying to download does not - | exist, or isn't available for your version of spaCy. - -+infobox("Solutions") - | Check the #[+a(gh("spacy-models", "compatibility.json")) compatibility table] - | to see which models are available for your spaCy version. If you're using - | an old version, consider upgrading to the latest release. Note that while - | spaCy supports tokenization for - | #[+a("/docs/api/language-models/#alpha-support") a variety of languages], - | not all of them come with statistical models. To only use the tokenizer, - | import the language's #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "symlink-privilege") Symbolic link privilege not held - -+code(false, "text"). - OSError: symbolic link privilege not held - -p - | To create #[+a("/docs/usage/models/#usage") shortcut links] that let you - | load models by name, spaCy creates a symbolic link in the - | #[code spacy/data] directory. This means your user needs permission to do - | this. The above error mostly occurs when doing a system-wide installation, - | which will create the symlinks in a system directory. - -+infobox("Solutions") - | Run the #[code download] or #[code link] command as administrator, - | or use a #[code virtualenv] to install spaCy in a user directory, instead - | of doing a system-wide installation. 
- -+h(3, "no-cache-dir") No such option: --no-cache-dir - -+code(false, "text"). - no such option: --no-cache-dir - -p - | The #[code download] command uses pip to install the models and sets the - | #[code --no-cache-dir] flag to prevent it from requiring too much memory. - | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] - | requires pip v6.0 or newer. - -+infobox("Solution") - | Run #[code pip install -U pip] to upgrade to the latest version of pip. - | To see which version you have installed, run #[code pip --version]. - -+h(3, "import-error") Import error - -+code(false, "text"). - Import Error: No module named spacy - -p - | This error means that the spaCy module can't be located on your system, or in - | your environment. - -+infobox("Solutions") - | Make sure you have spaCy installed. If you're using a #[code virtualenv], - | make sure it's activated and check that spaCy is installed in that - | environment – otherwise, you're trying to load a system installation. You - | can also run #[code which python] to find out where your Python - | executable is located. - -+h(3, "import-error-models") Import error: models - -+code(false, "text"). - ImportError: No module named 'en_core_web_sm' - -p - | As of spaCy v1.7, all models can be installed as Python packages. This means - | that they'll become importable modules of your application. When creating - | #[+a("/docs/usage/models/#usage") shortcut links], spaCy will also try - | to import the model to load its meta data. If this fails, it's usually a - | sign that the package is not installed in the current environment. - -+infobox("Solutions") - | Run #[code pip list] or #[code pip freeze] to check which model packages - | you have installed, and install the - | #[+a("/docs/usage/models#available") correct models] if necessary. If you're - | importing a model manually at the top of a file, make sure to use the name - | of the package, not the shortcut link you've created. - -+h(3, "vocab-strings") File not found: vocab/strings.json - -+code(false, "text"). - FileNotFoundError: No such file or directory: [...]/vocab/strings.json - -p - | This error may occur when using #[code spacy.load()] to load - | a language model – either because you haven't set up a - | #[+a("/docs/usage/models/#usage") shortcut link] for it, or because it - | doesn't actually exist. - -+infobox("Solutions") - | Set up a #[+a("/docs/usage/models/#usage") shortcut link] for the model - | you want to load. This can either be an installed model package, or a - | local directory containing the model data. If you want to use one of the - | #[+a("/docs/api/language-models/#alpha-support") alpha tokenizers] for - | languages that don't yet have a statistical model, you should import its - | #[code Language] class instead, for example - | #[code from spacy.fr import French]. - -+h(3, "command-not-found") Command not found - -+code(false, "text"). - command not found: spacy - -p - | This error may occur when running the #[code spacy] command from the - | command line. spaCy does not currently add an entry to our #[code PATH] - | environment variable, as this can lead to unexpected results, especially - | when using #[code virtualenv]. Instead, commands need to be prefixed with - | #[code python -m]. - -+infobox("Solution") - | Run the command with #[code python -m], for example - | #[code python -m spacy download en]. For more info on this, see the - | #[+a("/docs/usage/cli") CLI documentation]. 
- -+h(3, "module-load") 'module' object has no attribute 'load' - -+code(false, "text"). - AttributeError: 'module' object has no attribute 'load' - -p - | While this could technically have many causes, including spaCy being - | broken, the most likely one is that your script's file or directory name - | is "shadowing" the module – e.g. your file is called #[code spacy.py], - | or a directory you're importing from is called #[code spacy]. - -+infobox("Solution") - | When using spaCy, never call anything else #[code spacy]. - -+h(2, "usage") Using spaCy - -+h(3, "pos-lemma-number") POS tag or lemma is returned as number - -+code. - doc = nlp(u'This is text.') - print([word.pos for word in doc]) - # [88, 98, 90, 95] - -p - | Like many NLP libraries, spaCy encodes all strings to integers. This - | reduces memory usage and improves efficiency. The integer mapping also - | makes it easy to interoperate with numpy. To access the string - | representation instead of the integer ID, add an underscore #[code _] - | after the attribute. - -+infobox("Solutions") - | Use #[code pos_] or #[code lemma_] instead. See the - | #[+api("token#attributes") #[code Token] attributes] for a list of available - | attributes and their string representations. - - -+h(3, "pron-lemma") Pronoun lemma is returned as #[code -PRON-] - -+code. - doc = nlp(u'They are') - print(doc[0].lemma_) - # -PRON- - -p - | This is in fact expected behaviour and not a bug. - | Unlike verbs and common nouns, there's no clear base form of a personal - | pronoun. Should the lemma of "me" be "I", or should we normalize person - | as well, giving "it" β€” or maybe "he"? spaCy's solution is to introduce a - | novel symbol, #[code -PRON-], which is used as the lemma for - | all personal pronouns. For more info on this, see the - | #[+api("annotation#lemmatization") annotation specs] on lemmatization. From 10afb3c796cb9739bd969294a7ed973b4e519164 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 00:37:47 +0200 Subject: [PATCH 027/118] Tidy up and merge usage pages --- website/docs/api/philosophy.jade | 14 --- website/docs/usage/_data.json | 91 ++++++++----------- website/docs/usage/adding-languages.jade | 3 + website/docs/usage/customizing-tokenizer.jade | 90 ++++++++++++------ .../usage/language-processing-pipeline.jade | 37 ++++++++ 5 files changed, 140 insertions(+), 95 deletions(-) delete mode 100644 website/docs/api/philosophy.jade diff --git a/website/docs/api/philosophy.jade b/website/docs/api/philosophy.jade deleted file mode 100644 index eda911045..000000000 --- a/website/docs/api/philosophy.jade +++ /dev/null @@ -1,14 +0,0 @@ -//- πŸ’« DOCS > API > PHILOSOPHY - -include ../../_includes/_mixins - -p Every product needs to know why it exists. Here's what we're trying to with spaCy and why it's different from other NLP libraries. - -+h(2) 1. No job too big. -p Most programs get cheaper to run over time, but NLP programs often get more expensive. The data often grows faster than the hardware improves. For web-scale tasks, Moore's law can't save us β€” so if we want to read the web, we have to sweat performance. - -+h(2) 2. Take a stand. -p Most NLP toolkits position themselves as platforms, rather than libraries. They offer a pluggable architecture, and leave it to the user to arrange the components they offer into a useful system. This is fine for researchers, but for production users, this does too little. Components go out of date quickly, and configuring a good system takes very detailed knowledge. 
Compatibility problems can be extremely subtle. spaCy is therefore extremely opinionated. The API does not expose any algorithmic details. You're free to configure another pipeline, but the core library eliminates redundancy, and only offers one choice of each component. - -+h(2) 3. Stay current. -p There's often significant improvement in NLP models year-on-year. This has been especially true recently, given the success of deep learning models. With spaCy, you should be able to build things you couldn't build yesterday. To deliver on that promise, we need to be giving you the latest stuff. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 45daa8381..f903c7c1e 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -5,26 +5,23 @@ "Models": "models", "spaCy 101": "spacy-101", "Lightning tour": "lightning-tour", - "Visualizers": "visualizers", - "Troubleshooting": "troubleshooting", "What's new in v2.0": "v2" }, "Workflows": { - "Loading the pipeline": "language-processing-pipeline", - "Processing text": "processing-text", - "spaCy's data model": "data-model", "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", - "Custom pipelines": "customizing-pipeline", - "Rule-based matching": "rule-based-matching", "Word vectors": "word-vectors-similarities", - "Deep learning": "deep-learning", "Custom tokenization": "customizing-tokenizer", + "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", + "Processing text": "processing-text", + "NLP pipelines": "language-processing-pipeline", + "Deep learning": "deep-learning", "Training": "training", "Training NER": "training-ner", - "Saving & loading": "saving-loading" + "Saving & loading": "saving-loading", + "Visualizers": "visualizers" }, "Examples": { "Tutorials": "tutorials", @@ -38,10 +35,6 @@ "quickstart": true }, - "v2": { - "title": "What's new in v2.0" - }, - "models": { "title": "Models", "next": "spacy-101", @@ -67,27 +60,13 @@ "next": "resources" }, - "resources": { - "title": "Resources" + "v2": { + "title": "What's new in v2.0" }, - "language-processing-pipeline": { - "title": "Loading a language processing pipeline", - "next": "processing-text" - }, - - "customizing-pipeline": { - "title": "Customizing the pipeline", - "next": "customizing-tokenizer" - }, - - "processing-text": { - "title": "Processing text", - "next": "data-model" - }, - - "data-model": { - "title": "Understanding spaCy's data model" + "pos-tagging": { + "title": "Part-of-speech tagging", + "next": "dependency-parse" }, "dependency-parse": { @@ -97,26 +76,44 @@ "entity-recognition": { "title": "Named Entity Recognition", - "next": "rule-based-matching" - }, - - "rule-based-matching": { - "title": "Rule-based matching" + "next": "training-ner" }, "word-vectors-similarities": { - "title": "Using word vectors and semantic similarities" - }, - - "deep-learning": { - "title": "Hooking a deep learning model into spaCy" + "title": "Using word vectors and semantic similarities", + "next": "customizing-tokenizer" }, "customizing-tokenizer": { "title": "Customizing the tokenizer", + "next": "rule-based-matching" + }, + + "rule-based-matching": { + "title": "Rule-based matching", "next": "adding-languages" }, + "adding-languages": { + "title": "Adding languages", + "next": "training" + }, + + "processing-text": { + "title": "Processing text", + "next": "language-processing-pipeline" + }, + + "language-processing-pipeline": { + "title": 
"Natural language processing pipelines", + "next": "deep-learning" + }, + + "deep-learning": { + "title": "Hooking a deep learning model into spaCy", + "next": "training" + }, + "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" @@ -131,16 +128,6 @@ "title": "Saving and loading models" }, - "pos-tagging": { - "title": "Part-of-speech tagging", - "next": "dependency-parse" - }, - - "adding-languages": { - "title": "Adding languages", - "next": "training" - }, - "showcase": { "title": "Showcase", diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 7eadde4b6..f3648b885 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -104,6 +104,9 @@ p +image include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic + +table(["File name", "Variables", "Description"]) +row diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index d43fb438f..5871e1655 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -11,18 +11,56 @@ p | #[code spaces] booleans, which allow you to maintain alignment of the | tokens into the original string. -+aside("See Also") - | If you haven't read up on spaCy's #[+a("data-model") data model] yet, - | you should probably have a look. The main point to keep in mind is that - | spaCy's #[code Doc] doesn't copy or refer to the original string. The - | string is reconstructed from the tokens when required. ++aside("spaCy's data model") + | The main point to keep in mind is that spaCy's #[code Doc] doesn't + | copy or refer to the original string. The string is reconstructed from + | the tokens when required. ++h(2, "101") Tokenizer 101 + +include _spacy-101/_tokenization + + ++h(3, "101-data") Tokenizer data + +p + | #[strong Global] and #[strong language-specific] tokenizer data is + | supplied via the language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang]. + | The tokenizer exceptions define special cases like "don't" in English, + | which needs to be split into two tokens: #[code {ORTH: "do"}] and + | #[code {ORTH: "n't", LEMMA: "not"}]. The prefixes, suffixes and infixes + | mosty define punctuation rules – for example, when to split off periods + | (at the end of a sentence), and when to leave token containing periods + | intact (abbreviations like "U.S."). + ++image + include ../../assets/img/docs/language_data.svg + .u-text-right + +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic + ++infobox + | For more details on the language-specific data, see the + | usage workflow on #[+a("/docs/usage/adding-languages") adding languages]. +h(2, "special-cases") Adding special case tokenization rules p | Most domains have at least some idiosyncracies that require custom - | tokenization rules. Here's how to add a special case rule to an existing + | tokenization rules. This could be very certain expressions, or + | abbreviations only used in this specific field. + ++aside("Language data vs. custom tokenization") + | Tokenization rules that are specific to one language, but can be + | #[strong generalised across that language] should ideally live in the + | language data in #[+src(gh("spaCy", "spacy/lang")) spacy/lang] – we + | always appreciate pull requests! 
Anything that's specific to a domain or + | text type – like financial trading abbreviations, or Bavarian youth slang + | – should be added as a special case rule to your tokenizer instance. If + | you're dealing with a lot of customisations, it might make sense to create + | an entirely custom subclass. + +p + | Here's how to add a special case rule to an existing | #[+api("tokenizer") #[code Tokenizer]] instance: +code. @@ -30,15 +68,12 @@ p from spacy.symbols import ORTH, LEMMA, POS nlp = spacy.load('en') - assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] - nlp.tokenizer.add_special_case(u'gimme', - [ - { - ORTH: u'gim', - LEMMA: u'give', - POS: u'VERB'}, - { - ORTH: u'me'}]) + doc = nlp(u'gimme that') # phrase to tokenize + assert [w.text for w in doc] == [u'gimme', u'that'] # current tokenization + + # add special case rule + special_case = [{ORTH: u'gim', LEMMA: u'give', POS: u'VERB'}, {ORTH: u'me'}] + nlp.tokenizer.add_special_case(u'gimme', special_case) assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] @@ -55,9 +90,8 @@ p | The special case rules have precedence over the punctuation splitting: +code. - nlp.tokenizer.add_special_case(u'...gimme...?', - [{ - ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) + special_case = [{ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}] + nlp.tokenizer.add_special_case(u'...gimme...?', special_case) assert len(nlp(u'...gimme...?')) == 1 p @@ -137,8 +171,8 @@ p +h(2, "native-tokenizers") Customizing spaCy's Tokenizer class p - | Let's imagine you wanted to create a tokenizer for a new language. There - | are four things you would need to define: + | Let's imagine you wanted to create a tokenizer for a new language or + | specific domain. There are four things you would need to define: +list("numbers") +item @@ -170,14 +204,14 @@ p import re from spacy.tokenizer import Tokenizer - prefix_re = re.compile(r'''[\[\("']''') - suffix_re = re.compile(r'''[\]\)"']''') - def create_tokenizer(nlp): - return Tokenizer(nlp.vocab, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search) + prefix_re = re.compile(r'''[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']''') - nlp = spacy.load('en', tokenizer=create_make_doc) + def create_tokenizer(nlp): + return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, + suffix_search=suffix_re.search) + + nlp = spacy.load('en', tokenizer=create_tokenizer) p | If you need to subclass the tokenizer instead, the relevant methods to @@ -191,8 +225,6 @@ p | you're creating the pipeline: +code. - import spacy - nlp = spacy.load('en', make_doc=my_tokenizer) p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index c372dfbf4..0ea2609d2 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -126,3 +126,40 @@ p +row +cell #[code matcher] +cell Supply a pre-built matcher, instead of creating one. + ++h(2, "customizing") Customizing the pipeline + +p + | spaCy provides several linguistic annotation functions by default. Each + | function takes a Doc object, and modifies it in-place. The default + | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 + | introduced the ability to customise this pipeline with arbitrary + | functions. + ++code. 
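+    # Each component below is just a callable that takes a Doc and modifies it
+    # in place; create_pipeline returns the sequence of callables to run.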
+ def arbitrary_fixup_rules(doc): + for token in doc: + if token.text == u'bill' and token.tag_ == u'NNP': + token.tag_ = u'NN' + + def custom_pipeline(nlp): + return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) + + nlp = spacy.load('en', create_pipeline=custom_pipeline) + +p + | The easiest way to customise the pipeline is to pass a + | #[code create_pipeline] callback to the #[code spacy.load()] function. + +p + | The callback you pass to #[code create_pipeline] should take a single + | argument, and return a sequence of callables. Each callable in the + | sequence should accept a #[code Doc] object and modify it in place. + +p + | Instead of passing a callback, you can also write to the + | #[code .pipeline] attribute directly. + ++code. + nlp = spacy.load('en') + nlp.pipeline = [nlp.tagger] From 66088851dcd4fe72056c0d7534d80e28400aad15 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 11:58:17 +0200 Subject: [PATCH 028/118] Add Doc.to_disk() and Doc.from_disk() methods --- spacy/tokens/doc.pyx | 18 ++++++++++++++++++ website/docs/api/doc.jade | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0e4faafbe..611a68186 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -598,6 +598,24 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + def to_disk(self, path): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. Paths may be either strings or `Path`-like objects. + """ + raise NotImplementedError() + + def from_disk(self, path): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. Paths may be either + strings or `Path`-like objects. + RETURNS (Doc): The modified `Doc` object. + """ + raise NotImplementedError() + def to_bytes(self): """Serialize, i.e. export the document contents to a binary string. diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 6a9faf4b4..62b1a2a76 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -253,6 +253,44 @@ p +cell #[code Doc] +cell Itself. ++h(2, "to_disk") Doc.to_disk + +tag method + +p Save the current state to a directory. + ++aside-code("Example"). + doc.to_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory, which will be created if it doesn't exist. + | Paths may be either strings or #[code Path]-like objects. + ++h(2, "from_disk") Doc.from_disk + +tag method + +p Loads state from a directory. Modifies the object in place and returns it. + ++aside-code("Example"). + from spacy.tokens import Doc + doc = Doc().from_disk('/path/to/doc') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell unicode or #[code Path] + +cell + | A path to a directory. Paths may be either strings or + | #[code Path]-like objects. + + +footrow + +cell returns + +cell #[code Doc] + +cell The modified #[code Doc] object. 
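p
  | Taken together, the two methods are intended to support a simple save
  | and load round trip. A sketch of that usage – assuming an existing
  | #[code doc], and noting that constructing an empty #[code Doc] normally
  | requires a #[code Vocab], which the example above doesn't show:

+code.
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    doc.to_disk('/path/to/doc')                       # save the current state
    new_doc = Doc(Vocab()).from_disk('/path/to/doc')  # load it back into a fresh Doc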
+ +h(2, "to_bytes") Doc.to_bytes +tag method From 8b86b08bedf8143dad696bc6077f4c10a12782b9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 11:59:08 +0200 Subject: [PATCH 029/118] Update usage workflows --- website/docs/api/util.jade | 2 +- website/docs/usage/_data.json | 2 +- website/docs/usage/adding-languages.jade | 11 ++- website/docs/usage/customizing-pipeline.jade | 38 ----------- website/docs/usage/index.jade | 2 +- website/docs/usage/processing-text.jade | 9 ++- website/docs/usage/saving-loading.jade | 70 +++++++++++--------- website/docs/usage/training-ner.jade | 2 +- 8 files changed, 55 insertions(+), 81 deletions(-) delete mode 100644 website/docs/usage/customizing-pipeline.jade diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index f14cdbb6d..bf81a4f61 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -225,7 +225,7 @@ p p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+a("/docs/api/cli") CLI]. + | be used for interactive components like the #[+api("cli") cli]. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index f903c7c1e..acd973aa1 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -125,7 +125,7 @@ }, "saving-loading": { - "title": "Saving and loading models" + "title": "Saving, loading and data serialization" }, "showcase": { diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index f3648b885..ae04aad57 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -538,8 +538,8 @@ p | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | script from the spaCy developer resources. Note that your corpus should | not be preprocessed (i.e. you need punctuation for example). The - | #[+a("/docs/api/cli#model") #[code model]] command expects a - | tab-separated word frequencies file with three columns: + | #[+api("cli#model") #[code model]] command expects a tab-separated word + | frequencies file with three columns: +list("numbers") +item The number of times the word occurred in your language sample. @@ -654,13 +654,12 @@ p | If your corpus uses the | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | i.e. files with the extension #[code .conllu], you can use the - | #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to - | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. + | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's + | #[+a("/docs/api/annotation#json-input") JSON format] for training. p | Once you have your UD corpus transformed into JSON, you can train your - | model use the using spaCy's - | #[+a("/docs/api/cli#train") #[code train]] command: + | model use the using spaCy's #[+api("cli#train") #[code train]] command: +code(false, "bash"). 
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] diff --git a/website/docs/usage/customizing-pipeline.jade b/website/docs/usage/customizing-pipeline.jade deleted file mode 100644 index a4846d02e..000000000 --- a/website/docs/usage/customizing-pipeline.jade +++ /dev/null @@ -1,38 +0,0 @@ -//- πŸ’« DOCS > USAGE > CUSTOMIZING THE PIPELINE - -include ../../_includes/_mixins - -p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. - def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. - -+code. - nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 61398b431..cb1ab5754 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -291,7 +291,7 @@ p | environment variable, as this can lead to unexpected results, especially | when using #[code virtualenv]. Run the command with #[code python -m], | for example #[code python -m spacy download en]. For more info on this, - | see the #[+a("/docs/api/cli#download") CLI documentation]. + | see #[+api("cli#download") download]. +h(3, "module-load") 'module' object has no attribute 'load' diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..2562d9fc4 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -10,14 +10,19 @@ p doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') p - | The library should perform equally well with short or long documents. + | The library should perform equally well with #[strong short or long documents]. | All algorithms are linear-time in the length of the string, and once the | data is loaded, there's no significant start-up cost to consider. This | means that you don't have to strategically merge or split your text β€” | you should feel free to feed in either single tweets or whole novels. p - | If you run #[code nlp = spacy.load('en')], the #[code nlp] object will + | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will + | load the #[+a("/docs/usage/models") model] associated with the name + | #[code 'en']. Each model is a Python package containing an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py] + +the #[code nlp] object will | be an instance of #[code spacy.en.English]. 
This means that when you run | #[code doc = nlp(text)], you're executing | #[code spacy.en.English.__call__], which is implemented on its parent diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 3513e9505..63c951d40 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,8 @@ include ../../_includes/_mixins + ++h(2, "models") Saving models + p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the @@ -14,28 +17,28 @@ p | will be written out. To make the model more convenient to deploy, we | recommend wrapping it as a Python package. -+h(2, "generating") Generating a model package ++h(3, "models-generating") Generating a model package +infobox("Important note") | The model packages are #[strong not suitable] for the public | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | designed for binary data and files over 50 MB. However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. + | is running an #[strong internal installation] of PyPi, publishing your + | models on there can be a convenient way to share them with your team. p | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a - | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/api/cli#package") #[code package]] command documentation. + | path to it using the #[code --meta] flag. For more info on this, see + | the #[+api("cli#package") #[code package]] docs. +aside-code("meta.json", "json"). { "name": "example_model", "lang": "en", "version": "1.0.0", - "spacy_version": ">=1.7.0,<2.0.0", + "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", @@ -58,7 +61,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The @@ -66,44 +69,49 @@ p | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. -+h(2, "building") Building a model package - p - | To build the package, run the following command from within the + | To #[strong build the package], run the following command from within the | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. + | #[code /dist]. For more information on building Python packages, see the + | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + +code(false, "bash"). python setup.py sdist -p - | For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. 
- - -+h(2, "loading") Loading a model package ++h(2, "loading") Loading a custom model package p - | Model packages can be installed by pointing pip to the model's - | #[code .tar.gz] archive: + | To load a model from a data directory, you can use + | #[+api("spacy#load") #[code spacy.load()]] with the local path: + ++code. + nlp = spacy.load('/path/to/model') + +p + | If you have generated a model package, you can also install it by + | pointing pip to the model's #[code .tar.gz] archive – this is pretty + | much exactly what spaCy's #[+api("cli#download") #[code download]] + | command does under the hood. +code(false, "bash"). pip install /path/to/en_example_model-1.0.0.tar.gz -p You'll then be able to load the model as follows: ++aside-code("Custom model names", "bash"). + # optional: assign custom name to model + python -m spacy link en_example_model my_cool_model + +p + | You'll then be able to load the model via spaCy's loader, or by importing + | it as a module. For larger code bases, we usually recommend native + | imports, as this will make it easier to integrate models with your + | existing build process, continuous integration workflow and testing + | framework. +code. + # option 1: import model as module import en_example_model nlp = en_example_model.load() -p - | To load the model via #[code spacy.load()], you can also - | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the - | package name to a custom model name of your choice: - -+code(false, "bash"). - python -m spacy link en_example_model example - -+code. - import spacy - nlp = spacy.load('example') + # option 2: use spacy.load() + nlp = spacy.load('en_example_model') diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 4d864ac9d..8b8789485 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -77,7 +77,7 @@ p p | To make the model more convenient to deploy, we recommend wrapping it as | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] + | module. spaCy comes with a handy #[+api("cli#package") #[code package]] | CLI command to create all required files and directories. +code(false, "bash"). 
From 823d22100b0335687e4ef4e9ba7734ecaa4211bb Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:21:12 +0200 Subject: [PATCH 030/118] Tidy up architecture.svg --- website/assets/img/docs/architecture.svg | 124 +++++++++++------------ 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index d62d08f88..1025fbaaf 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -3,126 +3,126 @@ .text-large { fill: #1a1e23; font: 20px "Source Sans Pro" } .text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" } .text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" } - .text-code { fill: #1a1e23; font: bold 12px "Source Code Pro" } + .text-code { fill: #1a1e23; font: 600 12px "Source Code Pro" } - + Language - - + + MAKES - - + + nlp.vocab.morphology - + Vocab - - + + nlp.vocab - + StringStore - - + + nlp.vocab.strings - - + + nlp.tokenizer.vocab - + Tokenizer - - + + nlp.make_doc() - - + + nlp.pipeline - - + + nlp.pipeline[i].vocab - + pt - + en - + de - + fr - + es - + it - + nl - + sv - + fi - + nb - + hu - + he - + bn - + ja - + zh - - - - + + + + doc.vocab - - + + MAKES - + Doc - - + + MAKES - - + + token.doc - + Token - + Span - - + + lexeme.vocab - + Lexeme - - + + MAKES - - + + span.doc - + Dependency Parser - + Entity Recognizer - + Tagger - + Matcher - + Lemmatizer - + Morphology From b546bcb05f0b47fb2ff40906123525c5193813a1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:21:18 +0200 Subject: [PATCH 031/118] Add pipeline illustration --- website/assets/img/docs/pipeline.svg | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 website/assets/img/docs/pipeline.svg diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg new file mode 100644 index 000000000..ddd1171ef --- /dev/null +++ b/website/assets/img/docs/pipeline.svg @@ -0,0 +1,30 @@ + + + + + Doc + + + + Text + + + + nlp + + tokenizer + + vectorizer + + + + tagger + + parser + + ner + From 54885b5e8812b0e400934d06ace8cede8657fea6 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:24:40 +0200 Subject: [PATCH 032/118] Add serialization 101 --- .../docs/usage/_spacy-101/_serialization.jade | 35 +++++++++++++++++++ website/docs/usage/saving-loading.jade | 10 ++++++ website/docs/usage/spacy-101.jade | 4 +++ 3 files changed, 49 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_serialization.jade diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade new file mode 100644 index 000000000..b6a889014 --- /dev/null +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -0,0 +1,35 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > SERIALIZATION + +p + | If you've been modifying the pipeline, vocabulary vectors and entities, or made + | updates to the model, you'll eventually want + | to #[strong save your progress] – for example, everything that's in your #[code nlp] + | object. This means you'll have to translate its contents and structure + | into a format that can be saved, like a file or a byte string. This + | process is called serialization. spaCy comes with + | #[strong built-in serialization methods] and supports the + | #[+a("http://www.diveintopython3.net/serializing.html#dump") Pickle protocol]. + ++aside("What's pickle?") + | Pickle is Python's built-in object persistance system. 
It lets you + | transfer arbitrary Python objects between processes. This is usually used + | to load an object to and from disk, but it's also used for distributed + | computing, e.g. with + | #[+a("https://spark.apache.org/docs/0.9.0/python-programming-guide.html") PySpark] + | or #[+a("http://dask.pydata.org/en/latest/") Dask]. When you unpickle an + | object, you're agreeing to execute whatever code it contains. It's like + | calling #[code eval()] on a string – so don't unpickle objects from + | untrusted sources. + +p + | All container classes and pipeline components, i.e. + for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"] + | #[+api(cls.toLowerCase()) #[code=cls]], + | have the following methods available: + ++table(["Method", "Returns", "Example"]) + - style = [1, 0, 1] + +annotation-row(["to_bytes", "bytes", "nlp.to_bytes()"], style) + +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) + +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) + +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 63c951d40..e580bca25 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,15 @@ include ../../_includes/_mixins ++h(2, "101") Serialization 101 + +include _spacy-101/_serialization + ++infobox("Important note") + | In spaCy v2.0, the API for saving and loading has changed to only use the + | four methods listed above consistently across objects and classes. For an + | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] + | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + +h(2, "models") Saving models diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 4fb758bb4..958200637 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -105,6 +105,10 @@ include _spacy-101/_word-vectors +h(2, "pipelines") Pipelines ++h(2, "serialization") Serialization + +include _spacy-101/_serialization + +h(2, "architecture") Architecture +image From 8aaed8bea79c9df11fd6c799ddfd31bae2c81318 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:25:13 +0200 Subject: [PATCH 033/118] Add pipelines 101 and rewrite pipelines workflow --- website/docs/usage/_data.json | 2 +- website/docs/usage/_spacy-101/_pipelines.jade | 44 ++ .../usage/language-processing-pipeline.jade | 452 ++++++++++++------ website/docs/usage/spacy-101.jade | 2 + 4 files changed, 349 insertions(+), 151 deletions(-) create mode 100644 website/docs/usage/_spacy-101/_pipelines.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index acd973aa1..4d065522b 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -105,7 +105,7 @@ }, "language-processing-pipeline": { - "title": "Natural language processing pipelines", + "title": "Language processing pipelines", "next": "deep-learning" }, diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade new file mode 100644 index 000000000..fe6c149f6 --- /dev/null +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -0,0 +1,44 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > PIPELINES + +p + | When you call #[code nlp] on a text, spaCy first tokenizes the text to + | produce a #[code Doc] object. 
The #[code Doc] is the processed in several + | different steps – this is also referred to as the + | #[strong processing pipeline]. The pipeline used by our + | #[+a("/docs/usage/models") default models] consists of a + | vectorizer, a tagger, a parser and an entity recognizer. Each pipeline + | component returns the processed #[code Doc], which is then passed on to + | the next component. + ++image + include ../../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic + ++table(["Name", "Component", "Creates"]) + +row + +cell tokenizer + +cell #[+api("tokenizer") #[code Tokenizer]] + +cell #[code Doc] + + +row("divider") + +cell vectorizer + +cell #[code Vectorizer] + +cell #[code Doc.tensor] + + +row + +cell tagger + +cell #[+api("tagger") #[code Tagger]] + +cell #[code Doc[i].tag] + + +row + +cell parser + +cell #[+api("dependencyparser") #[code DependencyParser]] + +cell + | #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents], + | #[code Doc.noun_chunks] + + +row + +cell ner + +cell #[+api("entityrecognizer") #[code EntityRecognizer]] + +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 0ea2609d2..3b41ad5de 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -2,164 +2,316 @@ include ../../_includes/_mixins -p - | The standard entry point into spaCy is the #[code spacy.load()] - | function, which constructs a language processing pipeline. The standard - | variable name for the language processing pipeline is #[code nlp], for - | Natural Language Processing. The #[code nlp] variable is usually an - | instance of class #[code spacy.language.Language]. For English, the - | #[code spacy.en.English] class is the default. ++h(2, "101") Pipelines 101 + +include _spacy-101/_pipelines + ++h(2, "pipelines") How pipelines work p - | You'll use the nlp instance to produce #[+api("doc") #[code Doc]] - | objects. You'll then use the #[code Doc] object to access linguistic - | annotations to help you with whatever text processing task you're - | trying to do. - -+code. - import spacy # See "Installing spaCy" - nlp = spacy.load('en') # You are here. - doc = nlp(u'Hello, spacy!') # See "Using the pipeline" - print((w.text, w.pos_) for w in doc) # See "Doc, Span and Token" - -+aside("Why do we have to preload?") - | Loading the models takes ~200x longer than - | processing a document. We therefore want to amortize the start-up cost - | across multiple invocations. It's often best to wrap the pipeline as a - | singleton. The library avoids doing that for you, because it's a - | difficult design to back out of. - -p The #[code load] function takes the following positional arguments: - -+table([ "Name", "Description" ]) - +row - +cell #[code lang_id] - +cell - | An ID that is resolved to a class or factory function by - | #[code spacy.util.get_lang_class()]. Common values are - | #[code 'en'] for the English pipeline, or #[code 'de'] for the - | German pipeline. You can register your own factory function or - | class with #[code spacy.util.set_lang_class()]. + | spaCy makes it very easy to create your own pipelines consisting of + | reusable components – this includes spaCy's default vectorizer, tagger, + | parser and entity regcognizer, but also your own custom processing + | functions. 
A pipeline component can be added to an already existing + | #[code nlp] object, specified when initialising a #[code Language] class, + | or defined within a + | #[+a("/docs/usage/saving-loading#models-generating") model package]. p - | All keyword arguments are passed forward to the pipeline factory. No - | keyword arguments are required. The built-in factories (e.g. - | #[code spacy.en.English], #[code spacy.de.German]), which are subclasses - | of #[+api("language") #[code Language]], respond to the following - | keyword arguments: + | When you load a model, spaCy first consults the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its + | #[code setup] details. This typically includes the ID of a language class, + | and an optional list of pipeline components. spaCy then does the + | following: -+table([ "Name", "Description"]) - +row - +cell #[code path] - +cell - | Where to load the data from. If None, the default data path is - | fetched via #[code spacy.util.get_data_path()]. You can - | configure this default using #[code spacy.util.set_data_path()]. - | The data path is expected to be either a string, or an object - | responding to the #[code pathlib.Path] interface. If the path is - | a string, it will be immediately transformed into a - | #[code pathlib.Path] object. spaCy promises to never manipulate - | or open file-system paths as strings. All access to the - | file-system is done via the #[code pathlib.Path] interface. - | spaCy also promises to never check the type of path objects. - | This allows you to customize the loading behaviours in arbitrary - | ways, by creating your own object that implements the - | #[code pathlib.Path] interface. ++aside-code("meta.json (excerpt)", "json"). + { + "name": "example_model", + "description": "Example model for spaCy", + "setup": { + "lang": "en", + "pipeline": ["token_vectors", "tagger"] + } + } - +row - +cell #[code pipeline] - +cell - | A sequence of functions that take the Doc object and modify it - | in-place. See - | #[+a("customizing-pipeline") Customizing the pipeline]. - - +row - +cell #[code create_pipeline] - +cell - | Callback to construct the pipeline sequence. It should accept - | the #[code nlp] instance as its only argument, and return a - | sequence of functions that take the #[code Doc] object and - | modify it in-place. - | See #[+a("customizing-pipeline") Customizing the pipeline]. If - | a value is supplied to the pipeline keyword argument, the - | #[code create_pipeline] keyword argument is ignored. - - +row - +cell #[code make_doc] - +cell A function that takes the input and returns a document object. - - +row - +cell #[code create_make_doc] - +cell - | Callback to construct the #[code make_doc] function. It should - | accept the #[code nlp] instance as its only argument. To use the - | built-in annotation processes, it should return an object of - | type #[code Doc]. If a value is supplied to the #[code make_doc] - | keyword argument, the #[code create_make_doc] keyword argument - | is ignored. - - +row - +cell #[code vocab] - +cell Supply a pre-built Vocab instance, instead of constructing one. - - +row - +cell #[code add_vectors] - +cell - | Callback that installs word vectors into the Vocab instance. The - | #[code add_vectors] callback should take a - | #[+api("vocab") #[code Vocab]] instance as its only argument, - | and set the word vectors and #[code vectors_length] in-place. See - | #[+a("word-vectors-similarities") Word Vectors and Similarities]. 
- - +row - +cell #[code tagger] - +cell Supply a pre-built tagger, instead of creating one. - - +row - +cell #[code parser] - +cell Supply a pre-built parser, instead of creating one. - - +row - +cell #[code entity] - +cell Supply a pre-built entity recognizer, instead of creating one. - - +row - +cell #[code matcher] - +cell Supply a pre-built matcher, instead of creating one. - -+h(2, "customizing") Customizing the pipeline ++list("numbers") + +item + | Look up #[strong pipeline IDs] in the available + | #[strong pipeline factories]. + +item + | Initialise the #[strong pipeline components] by calling their + | factories with the #[code Vocab] as an argument. This gives each + | factory and component access to the pipeline's shared data, like + | strings, morphology and annotation scheme. + +item + | Load the #[strong language class and data] for the given ID via + | #[+api("util.get_lang_class") #[code get_lang_class]]. + +item + | Pass the path to the #[strong model data] to the #[code Language] + | class and return it. p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. - def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. + | So when you call this... +code. nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] + +p + | ... the model tells spaCy to use the pipeline + | #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look + | up each string in its internal factories registry and initialise the + | individual components. It'll then load #[code spacy.lang.en.English], + | pass it the path to the model's data directory, and return it for you + | to use as the #[code nlp] object. + +p + | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and + | then #[strong call each component] on the #[code Doc], in order. + | Components all return the modified document, which is then processed by + | the component next in the pipeline. + ++code("The pipeline under the hood"). + doc = nlp.make_doc(u'This is a sentence') + for proc in nlp.pipeline: + doc = proc(doc) + ++h(2, "creating") Creating pipeline components and factories + +p + | spaCy lets you customise the pipeline with your own components. Components + | are functions that receive a #[code Doc] object, modify and return it. + | If your component is stateful, you'll want to create a new one for each + | pipeline. You can do that by defining and registering a factory which + | receives the shared #[code Vocab] object and returns a component. 
+ ++h(3, "creating-component") Creating a component + +p + | A component receives a #[code Doc] object and + | #[strong performs the actual processing] – for example, using the current + | weights to make a prediction and set some annotation on the document. By + | adding a component to the pipeline, you'll get access to the #[code Doc] + | at any point #[strong during] processing – instead of only being able to + | modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. + + +footrow + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. + +p + | When creating a new #[code Language] class, you can pass it a list of + | pipeline component functions to execute in that order. You can also + | add it to an existing pipeline by modifying #[code nlp.pipeline] – just + | be careful not to overwrite a pipeline or its components by accident! + ++code. + # Create a new Language object with a pipeline + from spacy.language import Language + nlp = Language(pipeline=[my_component]) + + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) + ++h(3, "creating-factory") Creating a factory + +p + | A factory is a #[strong function that returns a pipeline component]. + | It's called with the #[code Vocab] object, to give it access to the + | shared data between components – for example, the strings, morphology, + | vectors or annotation scheme. Factories are useful for creating + | #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++aside-code("Example"). + def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code vocab] + +cell #[coce Vocab] + +cell + | Shared data between components, including strings, morphology, + | vectors etc. + + +footrow + +cell returns + +cell callable + +cell The pipeline component. + +p + | By creating a factory, you're essentially telling spaCy how to get the + | pipeline component #[strong once the vocab is available]. Factories need to + | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and + | by assigning them a unique ID. This ID can be added to the pipeline as a + | string. When creating a pipeline, you're free to mix strings and + | callable components: + ++code. + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', my_other_component]) + +p + | If spaCy comes across a string in the pipeline, it will try to resolve it + | by looking it up in the available factories. The factory will then be + | initialised with the #[code Vocab]. Providing factory names instead of + | callables also makes it easy to specify them in the model's + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. If you're + | training your own model and want to use one of spaCy's default components, + | you won't have to worry about finding and implementing it either – to use + | the default tagger, simply add #[code "tagger"] to the pipeline, and + | #[strong spaCy will know what to do]. 
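+
+p
+    | As a quick illustration, a built-in factory ID and a toy component
+    | function – shown here for example purposes only – can sit side by side
+    | in the same pipeline:
+
++code.
+    from spacy.language import Language
+
+    def print_length(doc):
+        print(len(doc)) # toy component: only inspects the Doc
+        return doc
+
+    nlp = Language(pipeline=['tagger', print_length])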
+ + ++infobox("Important note") + | Because factories are #[strong resolved on initialisation] of the + | #[code Language] class, it's #[strong not possible] to add them to the + | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only + | works with individual component functions. To use factories, you need to + | create a new #[code Language] object, or generate a + | #[+a("/docs/usage/saving-loading#models-generating") model package] with + | a custom pipeline. + ++h(2, "example1") Example: Custom sentence segmentation logic + ++aside("Real-world examples") + | To see real-world examples of pipeline factories and components in action, + | you can have a look at the source of spaCy's built-in components, e.g. + | the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or + | #[+src(gh("spacy")) entity recognizer]. + +p + | Let's say you want to implement custom logic to improve spaCy's sentence + | boundary detection. Currently, sentence segmentation is based on the + | dependency parse, which doesn't always produce ideal results. The custom + | logic should therefore be applied #[strong after] tokenization, but + | #[strong before] the dependency parsing – this way, the parser can also + | take advantage of the sentence boundaries. + ++code. + def sbd_component(doc): + for i, token in enumerate(doc[:-2]): + # define sentence start if period + titlecase token + if token.text == '.' and doc[i+1].is_title: + doc[i+1].sent_start = True + return doc + +p + | In this case, we simply want to add the component to the existing + | pipeline of the English model. We can do this by inserting it at index 0 + | of #[code nlp.pipeline]: + ++code. + nlp = spacy.load('en') + nlp.pipeline.insert(0, sbd_component) + +p + | When you call #[code nlp] on some text, spaCy will tokenize it to create + | a #[code Doc] object, and first call #[code sbd_component] on it, followed + | by the model's default pipeline. + ++h(2, "example2") Example: Sentiment model + +p + | Let's say you have trained your own document sentiment model on English + | text. After tokenization, you want spaCy to first execute the + | #[strong default vectorizer], followed by a custom + | #[strong sentiment component] that adds a #[code .sentiment] + | property to the #[code Doc], containing your model's sentiment precition. + +p + | Your component class will have a #[code from_disk()] method that spaCy + | calls to load the model data. When called, the component will compute + | the sentiment score, add it to the #[code Doc] and return the modified + | document. Optionally, the component can include an #[code update()] method + | to allow training the model. + ++code. + import pickle + from pathlib import Path + + class SentimentComponent(object): + def __init__(self, vocab): + self.weights = None + + def __call__(self, doc): + doc.sentiment = sum(self.weights*doc.vector) # set sentiment property + return doc + + def from_disk(self, path): # path = model path + factory ID ('sentiment') + self.weights = pickle.load(Path(path) / 'weights.bin') # load weights + return self + + def update(self, doc, gold): # update weights – allows training! + prediction = sum(self.weights*doc.vector) + self.weights -= 0.001*doc.vector*(prediction-gold.sentiment) + +p + | The factory will initialise the component with the #[code Vocab] object. + | To be able to add it to your model's pipeline as #[code 'sentiment'], + | it also needs to be registered via + | #[+api("spacy#set_factory") #[code set_factory()]]. + ++code. 
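+    # A factory receives the shared Vocab and returns the initialised
+    # component; registering it under an ID makes it available as a string
+    # in the pipeline.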
+ def sentiment_factory(vocab): + component = SentimentComponent(vocab) # initialise component + return component + + spacy.set_factory('sentiment', sentiment_factory) + +p + | The above code should be #[strong shipped with your model]. You can use + | the #[+api("cli#package") #[code package]] command to create all required + | files and directories. The model package will include an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py] + | with a #[code load()] method, that will initialise the language class with + | the model's pipeline and call the #[code from_disk()] method to load + | the model data. + +p + | In the model package's meta.json, specify the language class and pipeline + | IDs in #[code setup]: + ++code("meta.json (excerpt)", "json"). + { + "name": "my_sentiment_model", + "version": "1.0.0", + "spacy_version": ">=2.0.0,<3.0.0", + "setup": { + "lang": "en", + "pipeline": ["vectorizer", "sentiment"] + } + } + +p + | When you load your new model, spaCy will call the model's #[code load()] + | method. This will return a #[code Language] object with a pipeline + | containing the default vectorizer, and the sentiment component returned + | by your custom #[code "sentiment"] factory. + ++code. + nlp = spacy.load('my_sentiment_model') + doc = nlp(u'I love pizza') + assert doc.sentiment + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 958200637..f8779b52f 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -105,6 +105,8 @@ include _spacy-101/_word-vectors +h(2, "pipelines") Pipelines +include _spacy-101/_pipelines + +h(2, "serialization") Serialization include _spacy-101/_serialization From 4f396236f66ff56a168846bdd682d8c8bbaa5c79 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 19:25:49 +0200 Subject: [PATCH 034/118] Update saving and loading docs --- website/docs/usage/models.jade | 2 +- website/docs/usage/saving-loading.jade | 32 ++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 832ad8211..a837b4d29 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -233,4 +233,4 @@ p +infobox("Saving and loading models") | For more information and a detailed guide on how to package your model, | see the documentation on - | #[+a("/docs/usage/saving-loading") saving and loading models]. + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index e580bca25..74370bbb1 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -10,6 +10,27 @@ include _spacy-101/_serialization | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. + | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]], + | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]]. + | This will overwrite the existing object and return it. + ++code. 
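+    # Add an entity to the Doc by hand, save the Doc to disk, then restore
+    # it into a freshly processed Doc – the manually added entity survives
+    # the round trip.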
+ import spacy + from spacy.tokens import Span + + text = u'Netflix is hiring a new VP of global policy' + + nlp = spacy.load('en') + doc = nlp(text) + assert len(doc.ents) == 0 # Doc has no entities + doc.ents += ((Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])) # add entity + doc.to_disk('/path/to/doc') # save Doc to disk + + new_doc = nlp(text) + assert len(new_doc.ents) == 0 # new Doc has no entities + new_doc = new_doc.from_disk('path/to/doc') # load from disk and overwrite + assert len(new_doc.ents) == 1 # entity is now recognised! + assert [(ent.text, ent.label_) for ent in new_doc.ents] == [(u'Netflix', u'ORG')] +h(2, "models") Saving models @@ -46,13 +67,16 @@ p +aside-code("meta.json", "json"). { "name": "example_model", - "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", - "license": "CC BY-SA 3.0" + "license": "CC BY-SA 3.0", + "setup": { + "lang": "en", + "pipeline": ["token_vectors", "tagger"] + } } +code(false, "bash"). @@ -71,10 +95,10 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+src(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of - | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The + | #[code lang_name] and #[code lang_name-version]. | #[code lang] setting in the meta.json is also used to create the | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. From 764bfa3239f4edb2cd73708643c9cb10102c675d Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:53:43 +0200 Subject: [PATCH 035/118] Add section on using displaCy in a web app --- website/docs/usage/visualizers.jade | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index fe779add9..385fa0fd0 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -315,3 +315,61 @@ p 'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}], 'title': None } + ++h(2, "webapp") Using displaCy in a web application + +p + | If you want to use the visualizers as part of a web application, for + | example to create something like our + | #[+a(DEMOS_URL + "/displacy") online demo], it's not recommended to + | simply wrap and serve the displaCy renderer. Instead, you should only + | rely on the server to perform spaCy's processing capabilities, and use + | #[+a(gh("displacy")) displaCy.js] to render the JSON-formatted output. + ++aside("Why not return the HTML by the server?") + | It's certainly possible to just have your server return the markup. + | But outputting raw, unsanitised HTML is risky and makes your app vulnerable to + | #[+a("https://en.wikipedia.org/wiki/Cross-site_scripting") cross-site scripting] + | (XSS). All your user needs to do is find a way to make spaCy return one + | token #[code <script src="malicious-code.js"><script>]. + | Instead of relying on the server to render and sanitize HTML, you + | can do this on the client in JavaScript. displaCy.js creates + | the SVG markup as DOM nodes and will never insert raw HTML. 
+ +p + | The #[code parse_deps] function takes a #[code Doc] object and returns + | a dictionary in a format that can be rendered by displaCy. + ++code("Example"). + import spacy + from spacy import displacy + + nlp = spacy.load('en') + + def displacy_service(text): + doc = nlp(text) + return displacy.parse_deps(doc) + +p + | Using a library like #[+a("https://falconframework.org/") Falcon] or + | #[+a("http://www.hug.rest/") Hug], you can easily turn the above code + | into a simple REST API that receives a text and returns a JSON-formatted + | parse. In your front-end, include #[+a(gh("displacy")) displacy.js] and + | initialise it with the API URL and the ID or query selector of the + | container to render the visualisation in, e.g. #[code '#displacy'] for + | #[code <div id="displacy">]. + ++code("script.js", "javascript"). + var displacy = new displaCy('http://localhost:8080', { + container: '#displacy' + }) + + function parse(text) { + displacy.parse(text); + } + +p + | When you call #[code parse()], it will make a request to your API, + | receive the JSON-formatted parse and render it in your container. To + | create an interactive experience, you could trigger this function by + | a button and read the text from an #[code <input>] field. From f4658ff0539f36560bf1776a2ef6a1090713bf99 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:54:02 +0200 Subject: [PATCH 036/118] Rewrite usage workflow on saving and loading --- website/docs/usage/saving-loading.jade | 124 ++++++++++++++++++------- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 74370bbb1..413b86477 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -10,6 +10,13 @@ include _spacy-101/_serialization | overview of the changes, see #[+a("/docs/usage/v2#incompat") this table] | and the notes on #[+a("/docs/usage/v2#migrating-saving-loading") migrating]. ++h(3, "example-doc") Example: Saving and loading a document + +p + | For simplicity, let's assume you've + | #[+a("/docs/usage/entity-recognition#setting") added custom entities] to + | a #[code Doc], either manually, or by using a + | #[+a("/docs/usage/rule-based-matching#on_match") match pattern]. You can | save it locally by calling #[+api("doc#to_disk") #[code Doc.to_disk()]], | and load it again via #[+api("doc#from_disk") #[code Doc.from_disk()]]. | This will overwrite the existing object and return it. @@ -99,53 +106,108 @@ p | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code lang_name] and #[code lang_name-version]. - | #[code lang] setting in the meta.json is also used to create the - | respective #[code Language] class in spaCy, which will later be returned - | by the model's #[code load()] method. + ++h(3, "models-custom") Customising the model setup p - | To #[strong build the package], run the following command from within the - | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + | The meta.json includes a #[code setup] key that lets you customise how + | the model should be initialised and loaded. 
You can define the language + | data to be loaded and the + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to + | execute. ++table(["Setting", "Type", "Description"]) + +row + +cell #[code lang] + +cell unicode + +cell ID of the language class to initialise. + + +row + +cell #[code pipeline] + +cell list + +cell + | A list of strings mapping to the IDs of pipeline factories to + | apply in that order. If not set, spaCy's + | #[+a("/docs/usage/language-processing/pipelines") default pipeline] + | will be used. + +p + | The #[code load()] method that comes with our model package + | templates will take care of putting all this together and returning a + | #[code Language] object with the loaded pipeline and data. If your model + | requires custom pipeline components, you should + | #[strong ship then with your model] and register their + | #[+a("/docs/usage/language-processing-pipeline#creating-factory") factories] + | via #[+api("spacy#set_factory") #[code set_factory()]]. + ++aside-code("Factory example"). + def my_factory(vocab): + # load some state + def my_component(doc): + # process the doc + return doc + return my_component + ++code. + spacy.set_factory('custom_component', custom_component_factory) + ++infobox("Custom models with pipeline components") + | For more details and an example of how to package a sentiment model + | with a custom pipeline component, see the usage workflow on + | #[+a("/docs/usage/language-processing-pipeline#example2") language processing pipelines]. + ++h(3, "models-building") Building the model package + +p + | To build the package, run the following command from within the + | directory. For more information on building Python packages, see the + | docs on Python's + | #[+a("https://setuptools.readthedocs.io/en/latest/") Setuptools]. +code(false, "bash"). python setup.py sdist +p + | This will create a #[code .tar.gz] archive in a directory #[code /dist]. + | The model can be installed by pointing pip to the path of the archive: + ++code(false, "bash"). + pip install /path/to/en_example_model-1.0.0.tar.gz + +p + | You can then load the model via its name, #[code en_example_model], or + | import it directly as a module and then call its #[code load()] method. + +h(2, "loading") Loading a custom model package p | To load a model from a data directory, you can use - | #[+api("spacy#load") #[code spacy.load()]] with the local path: + | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will + | look for a meta.json in the directory and use the #[code setup] details + | to initialise a #[code Language] class with a processing pipeline and + | load in the model data. +code. nlp = spacy.load('/path/to/model') p - | If you have generated a model package, you can also install it by - | pointing pip to the model's #[code .tar.gz] archive – this is pretty - | much exactly what spaCy's #[+api("cli#download") #[code download]] - | command does under the hood. - -+code(false, "bash"). - pip install /path/to/en_example_model-1.0.0.tar.gz - -+aside-code("Custom model names", "bash"). - # optional: assign custom name to model - python -m spacy link en_example_model my_cool_model - -p - | You'll then be able to load the model via spaCy's loader, or by importing - | it as a module. For larger code bases, we usually recommend native - | imports, as this will make it easier to integrate models with your - | existing build process, continuous integration workflow and testing - | framework. 
+ | If you want to #[strong load only the binary data], you'll have to create + | a #[code Language] class and call + | #[+api("language#from_disk") #[code from_disk]] instead. +code. - # option 1: import model as module - import en_example_model - nlp = en_example_model.load() + from spacy.lang.en import English + nlp = English().from_disk('/path/to/data') - # option 2: use spacy.load() - nlp = spacy.load('en_example_model') ++infobox("Important note: Loading data in v2.x") + .o-block + | In spaCy 1.x, the distinction between #[code spacy.load()] and the + | #[code Language] class constructor was quite unclear. You could call + | #[code spacy.load()] when no model was present, and it would silently + | return an empty object. Likewise, you could pass a path to + | #[code English], even if the mode required a different language. + | spaCy v2.0 solves this with a clear distinction between setting up + | the instance and loading the data. + + +code-new nlp = English.from_disk('/path/to/data') + +code-old nlp = spacy.load('en', path='/path/to/data') From c25f3133ca6ce1147b84860cd820d945fe45e322 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 20:54:37 +0200 Subject: [PATCH 037/118] Update section on new v2.0 features --- website/docs/usage/v2.jade | 131 ++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 68 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 4a0e6ca2f..a058c5c13 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -8,6 +8,65 @@ p +h(2, "features") New features ++h(3, "features-pipelines") Improved processing pipelines + ++aside-code("Example"). + # Modify an existing pipeline + nlp = spacy.load('en') + nlp.pipeline.append(my_component) + + # Register a factory to create a component + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory', mycomponent]) + +p + | It's now much easier to customise the pipeline with your own components. + | Components are functions that receive a #[code Doc] object, modify and + | return it. If your component is stateful, you'll want to create a new one + | for each pipeline. You can do that by defining and registering a factory + | which receives the shared #[code Vocab] object and returns a component. + +p + | spaCy's default components – the vectorizer, tagger, parser and entity + | recognizer, can be added to your pipeline by using their string IDs. + | This way, you won't have to worry about finding and implementing them – + | to use the default tagger, simply add #[code "tagger"] to the pipeline, + | and spaCy will know what to do. + ++infobox + | #[strong API:] #[+api("language") #[code Language]] + | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] + ++h(3, "features-serializer") Saving, loading and serialization + ++aside-code("Example"). + nlp = spacy.load('en') # shortcut link + nlp = spacy.load('en_core_web_sm') # package + nlp = spacy.load('/path/to/en') # unicode path + nlp = spacy.load(Path('/path/to/en')) # pathlib Path + + nlp.to_disk('/path/to/nlp') + nlp = English().from_disk('/path/to/nlp') + +p + | spay's serialization API has been made consistent across classes and + | objects. All container classes and pipeline components now have a + | #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and + | #[code from_disk()] method that supports the Pickle protocol. + +p + | The improved #[code spacy.load] makes loading models easier and more + | transparent. 
You can load a model by supplying its + | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed + | #[+a("/docs/usage/saving-loading#generating") model package] or a path. + | The #[code Language] class to initialise will be determined based on the + | model's settings. For a blank language, you can import the class directly, + | e.g. #[code from spacy.lang.en import English]. + ++infobox + | #[strong API:] #[+api("spacy#load") #[code spacy.load]], #[+api("binder") #[code Binder]] + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] + +h(3, "features-displacy") displaCy visualizer with Jupyter support +aside-code("Example"). @@ -28,33 +87,6 @@ p | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy] -+h(3, "features-loading") Loading - -+aside-code("Example"). - nlp = spacy.load('en') # shortcut link - nlp = spacy.load('en_core_web_sm') # package - nlp = spacy.load('/path/to/en') # unicode path - nlp = spacy.load(Path('/path/to/en')) # pathlib Path - -p - | The improved #[code spacy.load] makes loading models easier and more - | transparent. You can load a model by supplying its - | #[+a("/docs/usage/models#usage") shortcut link], the name of an installed - | #[+a("/docs/usage/saving-loading#generating") model package], a unicode - | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code path] keyword argument is now deprecated. - -p - | The #[code Language] class to initialise will be determined based on the - | model's settings. If no model is found, spaCy will let you know and won't - | just return an empty #[code Language] object anymore. If you want a blank - | language, you can always import the class directly, e.g. - | #[code from spacy.lang.en import English]. - -+infobox - | #[strong API:] #[+api("spacy#load") #[code spacy.load]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - +h(3, "features-language") Improved language data and lazy loading p @@ -65,46 +97,15 @@ p | complex regular expressions. The language data has also been tidied up | and simplified. It's now also possible to overwrite the functions that | compute lexical attributes like #[code like_num], and supply - | language-specific syntax iterators, e.g. to determine noun chunks. + | language-specific syntax iterators, e.g. to determine noun chunks. spaCy + | now also supports simple lookup-based lemmatization. The data is stored + | in a dictionary mapping a string to its lemma. +infobox + | #[strong API:] #[+api("language") #[code Language]] | #[strong Code:] #[+src(gh("spaCy", "spacy/lang")) spacy/lang] | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] -+h(3, "features-pipelines") Improved processing pipelines - -+aside-code("Example"). - from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) - -+infobox - | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/processing-text") Processing text] - -+h(3, "features-lemmatizer") Simple lookup-based lemmatization - -+aside-code("Example"). - LOOKUP = { - "aba": "abar", - "ababa": "abar", - "ababais": "abar", - "ababan": "abar", - "ababanes": "ababΓ‘n" - } - -p - | spaCy now supports simple lookup-based lemmatization. The data is stored - | in a dictionary mapping a string to its lemma. To determine a token's - | lemma, spaCy simply looks it up in the table. 
The lookup lemmatizer can - | be imported from #[code spacy.lemmatizerlookup]. It's initialised with - | the lookup table, and should be returned by the #[code create_lemmatizer] - | classmethod of the language's defaults. - -+infobox - | #[strong API:] #[+api("language") #[code Language]] - | #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages] - +h(3, "features-matcher") Revised matcher API +aside-code("Example"). @@ -129,12 +130,6 @@ p | #[strong API:] #[+api("matcher") #[code Matcher]] | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] -+h(3, "features-serializer") Serialization - -+infobox - | #[strong API:] #[+api("serializer") #[code Serializer]] - | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] - +h(3, "features-models") Neural network models for English, German, French and Spanish +infobox From 9337866dae5915f7b1a385b9d903c1310c8884d9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 22:46:18 +0200 Subject: [PATCH 038/118] Add aside to pipeline 101 table --- website/docs/usage/_spacy-101/_pipelines.jade | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index fe6c149f6..d984a4708 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -15,6 +15,12 @@ p .u-text-right +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic ++aside + | #[strong Name:] ID of the pipeline component.#[br] + | #[strong Component:] spaCy's implementation of the component.#[br] + | #[strong Creates:] Objects, attributes and properties modified and set by + | the component. + +table(["Name", "Component", "Creates"]) +row +cell tokenizer From 9efa662345e89b93ce2cf1c569c30cd7abd4ba19 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:09:51 +0200 Subject: [PATCH 039/118] Update dependency parse docs and add note on disabling parser --- website/docs/usage/dependency-parse.jade | 66 ++++++++++++++---------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/website/docs/usage/dependency-parse.jade b/website/docs/usage/dependency-parse.jade index abfa1f825..dfb37f786 100644 --- a/website/docs/usage/dependency-parse.jade +++ b/website/docs/usage/dependency-parse.jade @@ -6,18 +6,20 @@ p | spaCy features a fast and accurate syntactic dependency parser, and has | a rich API for navigating the tree. The parser also powers the sentence | boundary detection, and lets you iterate over base noun phrases, or - | "chunks". - -p - | You can check whether a #[+api("doc") #[code Doc]] object has been - | parsed with the #[code doc.is_parsed] attribute, which returns a boolean - | value. If this attribute is #[code False], the default sentence iterator - | will raise an exception. + | "chunks". You can check whether a #[+api("doc") #[code Doc]] object has + | been parsed with the #[code doc.is_parsed] attribute, which returns a + | boolean value. If this attribute is #[code False], the default sentence + | iterator will raise an exception. +h(2, "noun-chunks") Noun chunks +tag-model("dependency parse") -p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pretium a orci eget, varius dignissim augue. Nam eu dictum mauris, id tincidunt nisi. Integer commodo pellentesque tincidunt. Nam at turpis finibus tortor gravida sodales tincidunt sit amet est. Nullam euismod arcu in tortor auctor. 
+p + | Noun chunks are "base noun phrases" – flat phrases that have a noun as + | their head. You can think of noun chunks as a noun plus the words describing + | the noun – for example, "the lavish green grass" or "the world’s largest + | tech fund". To get the noun chunks in a document, simply iterate over + | #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]. +code("Example"). nlp = spacy.load('en') @@ -28,9 +30,10 @@ p Lorem ipsum dolor sit amet, consectetur adipiscing elit. Quisque enim ante, pr +aside | #[strong Text:] The original noun chunk text.#[br] - | #[strong Root text:] ...#[br] - | #[strong Root dep:] ...#[br] - | #[strong Root head text:] ...#[br] + | #[strong Root text:] The original text of the word connecting the noun + | chunk to the rest of the parse.#[br] + | #[strong Root dep:] Dependcy relation connecting the root to its head.#[br] + | #[strong Root head text:] The text of the root token's head.#[br] +table(["Text", "root.text", "root.dep_", "root.head.text"]) - var style = [0, 0, 1, 0] @@ -59,7 +62,7 @@ p | #[strong Dep]: The syntactic relation connecting child to head.#[br] | #[strong Head text]: The original text of the token head.#[br] | #[strong Head POS]: The part-of-speech tag of the token head.#[br] - | #[strong Children]: ... + | #[strong Children]: The immediate syntactic dependents of the token. +table(["Text", "Dep", "Head text", "Head POS", "Children"]) - var style = [0, 1, 0, 1, 0] @@ -204,20 +207,31 @@ p +h(2, "disabling") Disabling the parser p - | The parser is loaded and enabled by default. If you don't need any of - | the syntactic information, you should disable the parser. Disabling the - | parser will make spaCy load and run much faster. Here's how to prevent - | the parser from being loaded: + | In the #[+a("/docs/usage/models/available") default models], the parser + | is loaded and enabled as part of the + | #[+a("docs/usage/language-processing-pipelines") standard processing pipeline]. + | If you don't need any of the syntactic information, you should disable + | the parser. Disabling the parser will make spaCy load and run much faster. + | If you want to load the parser, but need to disable it for specific + | documents, you can also control its use on the #[code nlp] object. +code. - nlp = spacy.load('en', parser=False) + nlp = spacy.load('en', disable=['parser']) + nlp = English().from_disk('/model', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) -p - | If you need to load the parser, but need to disable it for specific - | documents, you can control its use with the #[code parse] keyword - | argument: - -+code. - nlp = spacy.load('en') - doc1 = nlp(u'Text I do want parsed.') - doc2 = nlp(u"Text I don't want parsed", parse=False) ++infobox("Important note: disabling pipeline components") + .o-block + | Since spaCy v2.0 comes with better support for customising the + | processing pipeline components, the #[code parser] keyword argument + | has been replaced with #[code disable], which takes a list of + | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | This lets you disable both default and custom components when loading + | a model, or initialising a Language class via + | #[+api("language-from_disk") #[code from_disk]]. + +code-new. + nlp = spacy.load('en', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old. 
+ nlp = spacy.load('en', parser=False) + doc = nlp(u"I don't want parsed", parse=False) From 419d265ff047370e025797395cef5543efce9773 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:10:06 +0200 Subject: [PATCH 040/118] Add section on disabling pipeline components --- .../usage/language-processing-pipeline.jade | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 3b41ad5de..7124bdadc 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -315,3 +315,43 @@ p | For more information and a detailed guide on how to package your model, | see the documentation on | #[+a("/docs/usage/saving-loading#models") saving and loading models]. + ++h(2, "disabling") Disabling pipeline components + +p + | If you don't need a particular component of the pipeline – for + | example, the tagger or the parser, you can disable loading it. This can + | sometimes make a big difference and improve loading speed. Disabled + | component names can be provided to #[code spacy.load], #[code from_disk] + | or the #[code nlp] object itself as a list: + ++code. + nlp = spacy.load('en', disable['parser', 'tagger']) + nlp = English().from_disk('/model', disable=['vectorizer', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +p + | Note that you can't write directly to #[code nlp.pipeline], as this list + | holds the #[em actual components], not the IDs. However, if you know the + | order of the components, you can still slice the list: + ++code. + nlp = spacy.load('en') + nlp.pipeline = nlp.pipeline[:2] # only use the first two components + ++infobox("Important note: disabling pipeline components") + .o-block + | Since spaCy v2.0 comes with better support for customising the + | processing pipeline components, the #[code parser], #[code tagger] + | and #[code entity] keyword arguments have been replaced with + | #[code disable], which takes a list of + | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | This lets you disable both default and custom components when loading + | a model, or initialising a Language class via + | #[+api("language-from_disk") #[code from_disk]]. + +code-new. + nlp = spacy.load('en', disable=['parser']) + doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old. 
+ nlp = spacy.load('en', parser=False) + doc = nlp(u"I don't want parsed", parse=False) From 0f48fb1f9702f702715cddc95a2b3e57fb4e1cfb Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:10:33 +0200 Subject: [PATCH 041/118] Rename processing text to production use and remove linear feature scheme --- website/docs/api/_data.json | 7 +- website/docs/api/features.jade | 138 ------------------ website/docs/usage/_data.json | 13 +- ...ocessing-text.jade => production-use.jade} | 63 -------- 4 files changed, 8 insertions(+), 213 deletions(-) delete mode 100644 website/docs/api/features.jade rename website/docs/usage/{processing-text.jade => production-use.jade} (58%) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 443ee9a67..f3f996846 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -27,8 +27,7 @@ "GoldCorpus": "goldcorpus" }, "Other": { - "Annotation Specs": "annotation", - "Feature Scheme": "features" + "Annotation Specs": "annotation" } }, @@ -143,9 +142,5 @@ "annotation": { "title": "Annotation Specifications" - }, - - "features": { - "title": "Linear Model Feature Scheme" } } diff --git a/website/docs/api/features.jade b/website/docs/api/features.jade deleted file mode 100644 index 018790145..000000000 --- a/website/docs/api/features.jade +++ /dev/null @@ -1,138 +0,0 @@ -//- πŸ’« DOCS > API > LINEAR MOEL FEATURES - -include ../../_includes/_mixins - -p - | There are two popular strategies for putting together machine learning - | models for NLP: sparse linear models, and neural networks. To solve NLP - | problems with linear models, feature templates need to be assembled that - | combine multiple atomic predictors. This page documents the atomic - | predictors used in the spaCy 1.0 #[+api("parser") #[code Parser]], - | #[+api("tagger") #[code Tagger]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]. - -p - | To understand the scheme, recall that spaCy's #[code Parser] and - | #[code EntityRecognizer] are implemented as push-down automata. They - | maintain a "stack" that holds the current entity, and a "buffer" - | consisting of the words to be processed. - -p - | Each state consists of the words on the stack (if any), which consistute - | the current entity being constructed. We also have the current word, and - | the two subsequent words. Finally, we also have the entities previously - | built. - -p - | This gives us a number of tokens to ask questions about, to make the - | features. About each of these tokens, we can ask about a number of - | different properties. Each feature identifier asks about a specific - | property of a specific token of the context. - -+h(2, "tokens") Context tokens - -+table([ "ID", "Description" ]) - +row - +cell #[code S0] - +cell - | The first word on the stack, i.e. the token most recently added - | to the current entity. - - +row - +cell #[code S1] - +cell The second word on the stack, i.e. the second most recently added. - - +row - +cell #[code S2] - +cell The third word on the stack, i.e. the third most recently added. - - +row - +cell #[code N0] - +cell The first word of the buffer, i.e. the current word being tagged. - - +row - +cell #[code N1] - +cell The second word of the buffer. - - +row - +cell #[code N2] - +cell The third word of the buffer. - - +row - +cell #[code P1] - +cell The word immediately before #[code N0]. - - +row - +cell #[code P2] - +cell The second word before #[code N0]. 
- - +row - +cell #[code E0] - +cell The first word of the previously constructed entity. - - +row - +cell #[code E1] - +cell The first word of the second previously constructed entity. - -p About each of these tokens, we can ask: - -+table([ "ID", "Attribute", "Description" ]) - +row - +cell #[code N0w] - +cell #[code token.orth] - +cell The word form. - - +row - +cell #[code N0W] - +cell #[code token.lemma] - +cell The word's lemma. - - +row - +cell #[code N0p] - +cell #[code token.tag] - +cell The word's (full) POS tag. - - +row - +cell #[code N0c] - +cell #[code token.cluster] - +cell The word's (full) Brown cluster. - - +row - +cell #[code N0c4] - +cell - - +cell First four digit prefix of the word's Brown cluster. - - +row - +cell #[code N0c6] - +cell - - +cell First six digit prefix of the word's Brown cluster. - - +row - +cell #[code N0L] - +cell - - +cell The word's dependency label. Not used as a feature in the NER. - - +row - +cell #[code N0_prefix] - +cell #[code token.prefix] - +cell The first three characters of the word. - - +row - +cell #[code N0_suffix] - +cell #[code token.suffix] - +cell The last three characters of the word. - - +row - +cell #[code N0_shape] - +cell #[code token.shape] - +cell The word's shape, i.e. is it alphabetic, numeric, etc. - - +row - +cell #[code N0_ne_iob] - +cell #[code token.ent_iob] - +cell The Inside/Outside/Begin code of the word's NER tag. - - +row - +cell #[code N0_ne_type] - +cell #[code token.ent_type] - +cell The word's NER type. diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 4d065522b..3a24a38df 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -15,9 +15,9 @@ "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", - "Processing text": "processing-text", "NLP pipelines": "language-processing-pipeline", "Deep learning": "deep-learning", + "Production use": "production-use", "Training": "training", "Training NER": "training-ner", "Saving & loading": "saving-loading", @@ -99,11 +99,6 @@ "next": "training" }, - "processing-text": { - "title": "Processing text", - "next": "language-processing-pipeline" - }, - "language-processing-pipeline": { "title": "Language processing pipelines", "next": "deep-learning" @@ -111,9 +106,15 @@ "deep-learning": { "title": "Hooking a deep learning model into spaCy", + "next": "production use" + }, + + "production-use": { + "title": "Production use", "next": "training" }, + "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/production-use.jade similarity index 58% rename from website/docs/usage/processing-text.jade rename to website/docs/usage/production-use.jade index 2562d9fc4..68a313d8a 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/production-use.jade @@ -6,69 +6,6 @@ p | Once you have loaded the #[code nlp] object, you can call it as though | it were a function. This allows you to process a single unicode string. -+code. - doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') - -p - | The library should perform equally well with #[strong short or long documents]. - | All algorithms are linear-time in the length of the string, and once the - | data is loaded, there's no significant start-up cost to consider. 
This - | means that you don't have to strategically merge or split your text β€” - | you should feel free to feed in either single tweets or whole novels. - -p - | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will - | load the #[+a("/docs/usage/models") model] associated with the name - | #[code 'en']. Each model is a Python package containing an - | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py"))__init__.py] - -the #[code nlp] object will - | be an instance of #[code spacy.en.English]. This means that when you run - | #[code doc = nlp(text)], you're executing - | #[code spacy.en.English.__call__], which is implemented on its parent - | class, #[+api("language") #[code Language]]. - -+code. - doc = nlp.make_doc(text) - for proc in nlp.pipeline: - proc(doc) - -p - | I've tried to make sure that the #[code Language.__call__] function - | doesn't do any "heavy lifting", so that you won't have complicated logic - | to replicate if you need to make your own pipeline class. This is all it - | does. - -p - | The #[code .make_doc()] method and #[code .pipeline] attribute make it - | easier to customise spaCy's behaviour. If you're using the default - | pipeline, we can desugar one more time. - -+code. - doc = nlp.tokenizer(text) - nlp.tagger(doc) - nlp.parser(doc) - nlp.entity(doc) - -p Finally, here's where you can find out about each of those components: - -+table(["Name", "Source"]) - +row - +cell #[code tokenizer] - +cell #[+src(gh("spacy", "spacy/tokenizer.pyx")) spacy.tokenizer.Tokenizer] - - +row - +cell #[code tagger] - +cell #[+src(gh("spacy", "spacy/tagger.pyx")) spacy.pipeline.Tagger] - - +row - +cell #[code parser] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.DependencyParser] - - +row - +cell #[code entity] - +cell #[+src(gh("spacy", "spacy/syntax/parser.pyx")) spacy.pipeline.EntityRecognizer] - +h(2, "multithreading") Multi-threading with #[code .pipe()] p From d122bbc9084adcb9aa0e6af57f5df828d0753ffb Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:30:21 +0200 Subject: [PATCH 042/118] Rewrite custom tokenizer docs --- website/docs/usage/customizing-tokenizer.jade | 101 +++++++++++------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade index 5871e1655..86040a4eb 100644 --- a/website/docs/usage/customizing-tokenizer.jade +++ b/website/docs/usage/customizing-tokenizer.jade @@ -11,16 +11,10 @@ p | #[code spaces] booleans, which allow you to maintain alignment of the | tokens into the original string. -+aside("spaCy's data model") - | The main point to keep in mind is that spaCy's #[code Doc] doesn't - | copy or refer to the original string. The string is reconstructed from - | the tokens when required. - +h(2, "101") Tokenizer 101 include _spacy-101/_tokenization - +h(3, "101-data") Tokenizer data p @@ -221,27 +215,68 @@ p +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline p - | You can pass a custom tokenizer using the #[code make_doc] keyword, when - | you're creating the pipeline: + | The tokenizer is the first component of the processing pipeline and the + | only one that can't be replaced by writing to #[code nlp.pipeline]. This + | is because it has a different signature from all the other components: + | it takes a text and returns a #[code Doc], whereas all other components + | expect to already receive a tokenized #[code Doc]. 
+ ++image + include ../../assets/img/docs/pipeline.svg + .u-text-right + +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic -+code. - nlp = spacy.load('en', make_doc=my_tokenizer) p - | However, this approach often leaves us with a chicken-and-egg problem. - | To construct the tokenizer, we usually want attributes of the #[code nlp] - | pipeline. Specifically, we want the tokenizer to hold a reference to the - | pipeline's vocabulary object. Let's say we have the following class as - | our tokenizer: - + | To overwrite the existing tokenizer, you need to replace + | #[code nlp.tokenizer] with a custom function that takes a text, and + | returns a #[code Doc]. + ++code. + nlp = spacy.load('en') + nlp.tokenizer = my_tokenizer + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code text] + +cell unicode + +cell The raw text to tokenize. + + +footrow + +cell returns + +cell #[code Doc] + +cell The tokenized document. + ++infobox("Important note: using a custom tokenizer") + .o-block + | In spaCy v1.x, you had to add a custom tokenizer by passing it to the + | #[code make_doc] keyword argument, or by passing a tokenizer "factory" + | to #[code create_make_doc]. This was unnecessarily complicated. Since + | spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your + | tokenizer needs the vocab, you can write a function and use + | #[code nlp.vocab]. + + +code-new. + nlp.tokenizer = my_tokenizer + nlp.tokenizer = my_tokenizer_factory(nlp.vocab) + +code-old. + nlp = spacy.load('en', make_doc=my_tokenizer) + nlp = spacy.load('en', create_make_doc=my_tokenizer_factory) + ++h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer + +p + | To construct the tokenizer, we usually want attributes of the #[code nlp] + | pipeline. Specifically, we want the tokenizer to hold a reference to the + | vocabulary object. Let's say we have the following class as + | our tokenizer: +code. - import spacy from spacy.tokens import Doc class WhitespaceTokenizer(object): - def __init__(self, nlp): - self.vocab = nlp.vocab + def __init__(self, vocab): + self.vocab = vocab def __call__(self, text): words = text.split(' ') @@ -250,28 +285,12 @@ p return Doc(self.vocab, words=words, spaces=spaces) p - | As you can see, we need a #[code vocab] instance to construct this β€” but - | we won't get the #[code vocab] instance until we get back the #[code nlp] - | object from #[code spacy.load()]. The simplest solution is to build the - | object in two steps: + | As you can see, we need a #[code Vocab] instance to construct this β€” but + | we won't have it until we get back the loaded #[code nlp] object. The + | simplest solution is to build the tokenizer in two steps. This also means + | that you can reuse the "tokenizer factory" and initialise it with + | different instances of #[code Vocab]. +code. nlp = spacy.load('en') - nlp.make_doc = WhitespaceTokenizer(nlp) - -p - | You can instead pass the class to the #[code create_make_doc] keyword, - | which is invoked as callback once the #[code nlp] object is ready: - -+code. - nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer) - -p - | Finally, you can of course create your own subclasses, and create a bound - | #[code make_doc] method. The disadvantage of this approach is that spaCy - | uses inheritance to give each language-specific pipeline its own class. 
- | If you're working with multiple languages, a naive solution will - | therefore require one custom class per language you're working with. - | This might be at least annoying. You may be able to do something more - | generic by doing some clever magic with metaclasses or mixins, if that's - | the sort of thing you're into. + nlp.tokenizer = WhitespaceTokenizer(nlp.vocab) From 709ea589909bf1b290ad4d4a1fb7545961bcf683 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:56:16 +0200 Subject: [PATCH 043/118] Tidy up workflows --- website/docs/usage/_data.json | 10 +- website/docs/usage/data-model.jade | 264 ------------------ .../usage/language-processing-pipeline.jade | 4 +- website/docs/usage/resources.jade | 118 -------- 4 files changed, 4 insertions(+), 392 deletions(-) delete mode 100644 website/docs/usage/data-model.jade delete mode 100644 website/docs/usage/resources.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 3a24a38df..9f51df5c4 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -15,7 +15,7 @@ "Custom tokenization": "customizing-tokenizer", "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", - "NLP pipelines": "language-processing-pipeline", + "Processing pipelines": "language-processing-pipeline", "Deep learning": "deep-learning", "Production use": "production-use", "Training": "training", @@ -48,18 +48,13 @@ "lightning-tour": { "title": "Lightning tour", - "next": "visualizers" + "next": "v2" }, "visualizers": { "title": "Visualizers" }, - "troubleshooting": { - "title": "Troubleshooting", - "next": "resources" - }, - "v2": { "title": "What's new in v2.0" }, @@ -114,7 +109,6 @@ "next": "training" }, - "training": { "title": "Training spaCy's statistical models", "next": "saving-loading" diff --git a/website/docs/usage/data-model.jade b/website/docs/usage/data-model.jade deleted file mode 100644 index 6be205178..000000000 --- a/website/docs/usage/data-model.jade +++ /dev/null @@ -1,264 +0,0 @@ -//- πŸ’« DOCS > USAGE > SPACY'S DATA MODEL - -include ../../_includes/_mixins - -p After reading this page, you should be able to: - -+list - +item Understand how spaCy's Doc, Span, Token and Lexeme object work - +item Start using spaCy's Cython API - +item Use spaCy more efficiently - -+h(2, "architecture") Architecture - -+image - include ../../assets/img/docs/architecture.svg - -+h(2, "design-considerations") Design considerations - -+h(3, "no-job-too-big") No job too big - -p - | When writing spaCy, one of my mottos was #[em no job too big]. I wanted - | to make sure that if Google or Facebook were founded tomorrow, spaCy - | would be the obvious choice for them. I wanted spaCy to be the obvious - | choice for web-scale NLP. This meant sweating about performance, because - | for web-scale tasks, Moore's law can't save you. - -p - | Most computational work gets less expensive over time. If you wrote a - | program to solve fluid dynamics in 2008, and you ran it again in 2014, - | you would expect it to be cheaper. For NLP, it often doesn't work out - | that way. The problem is that we're writing programs where the task is - | something like "Process all articles in the English Wikipedia". Sure, - | compute prices dropped from $0.80 per hour to $0.20 per hour on AWS in - | 2008-2014. But the size of Wikipedia grew from 3GB to 11GB. Maybe the - | job is a #[em little] cheaper in 2014 β€” but not by much. 
- -+h(3, "annotation-layers") Multiple layers of annotation - -p - | When I tell a certain sort of person that I'm a computational linguist, - | this comic is often the first thing that comes to their mind: - -+image("http://i.imgur.com/n3DTzqx.png", 450) - +image-caption © #[+a("http://xkcd.com") xkcd] - -p - | I've thought a lot about what this comic is really trying to say. It's - | probably not talking about #[em data models] β€” but in that sense at - | least, it really rings true. - -p - | You'll often need to model a document as a sequence of sentences. Other - | times you'll need to model it as a sequence of words. Sometimes you'll - | care about paragraphs, other times you won't. Sometimes you'll care - | about extracting quotes, which can cross paragraph boundaries. A quote - | can also occur within a sentence. When we consider sentence structure, - | things get even more complicated and contradictory. We have syntactic - | trees, sequences of entities, sequences of phrases, sub-word units, - | multi-word units... - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. They're - | often going to need to query them jointly. You need to be able to get - | the syntactic head of a named entity, or the sentiment of a paragraph. - -+h(2, "solutions") Solutions - -+h(3) Fat types, thin tokens - -+h(3) Static model, dynamic views - -p - | Different applications are going to need to query different, - | overlapping, and often contradictory views of the document. For this - | reason, I think it's a bad idea to have too much of the document - | structure reflected in the data model. If you structure the data - | according to the needs of one layer of annotation, you're going to need - | to copy the data and transform it in order to use a different layer of - | annotation. You'll soon have lots of copies, and no single source of - | truth. - -+h(3) Never go full stand-off - -+h(3) Implementation - -+h(3) Cython 101 - -+h(3) #[code cdef class Doc] - -p - | Let's start at the top. Here's the memory layout of the - | #[+api("doc") #[code Doc]] class, minus irrelevant details: - -+code. - from cymem.cymem cimport Pool - from ..vocab cimport Vocab - from ..structs cimport TokenC - - cdef class Doc: - cdef Pool mem - cdef Vocab vocab - - cdef TokenC* c - - cdef int length - cdef int max_length - -p - | So, our #[code Doc] class is a wrapper around a TokenC* array β€” that's - | where the actual document content is stored. Here's the #[code TokenC] - | struct, in its entirety: - -+h(3) #[code cdef struct TokenC] - -+code. - cdef struct TokenC: - const LexemeC* lex - uint64_t morph - univ_pos_t pos - bint spacy - int tag - int idx - int lemma - int sense - int head - int dep - bint sent_start - - uint32_t l_kids - uint32_t r_kids - uint32_t l_edge - uint32_t r_edge - - int ent_iob - int ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. - hash_t ent_id - -p - | The token owns all of its linguistic annotations, and holds a const - | pointer to a #[code LexemeC] struct. The #[code LexemeC] struct owns all - | of the #[em vocabulary] data about the word β€” all the dictionary - | definition stuff that we want to be shared by all instances of the type. - | Here's the #[code LexemeC] struct, in its entirety: - -+h(3) #[code cdef struct LexemeC] - -+code. 
- cdef struct LexemeC: - - int32_t id - - int32_t orth # Allows the string to be retrieved - int32_t length # Length of the string - - uint64_t flags # These are the most useful parts. - int32_t cluster # Distributional similarity cluster - float prob # Probability - float sentiment # Slot for sentiment - - int32_t lang - - int32_t lower # These string views made sense - int32_t norm # when NLP meant linear models. - int32_t shape # Now they're less relevant, and - int32_t prefix # will probably be revised. - int32_t suffix - - float* vector # <-- This was a design mistake, and will change. - -+h(2, "dynamic-views") Dynamic views - -+h(3) Text - -p - | You might have noticed that in all of the structs above, there's not a - | string to be found. The strings are all stored separately, in the - | #[+api("stringstore") #[code StringStore]] class. The lexemes don't know - | the strings β€” they only know their integer IDs. The document string is - | never stored anywhere, either. Instead, it's reconstructed by iterating - | over the tokens, which look up the #[code orth] attribute of their - | underlying lexeme. Once we have the orth ID, we can fetch the string - | from the vocabulary. Finally, each token knows whether a single - | whitespace character (#[code ' ']) should be used to separate it from - | the subsequent tokens. This allows us to preserve whitespace. - -+code. - cdef print_text(Vocab vocab, const TokenC* tokens, int length): - for i in range(length): - word_string = vocab.strings[tokens.lex.orth] - if tokens.lex.spacy: - word_string += ' ' - print(word_string) - -p - | This is why you get whitespace tokens in spaCy β€” we need those tokens, - | so that we can reconstruct the document string. I also think you should - | have those tokens anyway. Most NLP libraries strip them, making it very - | difficult to recover the paragraph information once you're at the token - | level. You'll never have that sort of problem with spaCy β€” because - | there's a single source of truth. - -+h(3) #[code cdef class Token] - -p When you do... - -+code. - doc[i] - -p - | ...you get back an instance of class #[code spacy.tokens.token.Token]. - | This instance owns no data. Instead, it holds the information - | #[code (doc, i)], and uses these to retrieve all information via the - | parent container. - -+h(3) #[code cdef class Span] - -p When you do... - -+code. - doc[i : j] - -p - | ...you get back an instance of class #[code spacy.tokens.span.Span]. - | #[code Span] instances are also returned by the #[code .sents], - | #[code .ents] and #[code .noun_chunks] iterators of the #[code Doc] - | object. A #[code Span] is a slice of tokens, with an optional label - | attached. Its data model is: - -+code. - cdef class Span: - cdef readonly Doc doc - cdef int start - cdef int end - cdef int start_char - cdef int end_char - cdef int label - -p - | Once again, the #[code Span] owns almost no data. Instead, it refers - | back to the parent #[code Doc] container. - -p - | The #[code start] and #[code end] attributes refer to token positions, - | while #[code start_char] and #[code end_char] record the character - | positions of the span. By recording the character offsets, we can still - | use the #[code Span] object if the tokenization of the document changes. - -+h(3) #[code cdef class Lexeme] - -p When you do... - -+code. - vocab[u'the'] - -p - | ...you get back an instance of class #[code spacy.lexeme.Lexeme]. The - | #[code Lexeme]'s data model is: - -+code. 
- cdef class Lexeme: - cdef LexemeC* c - cdef readonly Vocab vocab diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 7124bdadc..8bb92caae 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -350,8 +350,8 @@ p | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. +code-new. - nlp = spacy.load('en', disable=['parser']) + nlp = spacy.load('en', disable=['tagger', 'ner']) doc = nlp(u"I don't want parsed", disable=['parser']) +code-old. - nlp = spacy.load('en', parser=False) + nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False) diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade deleted file mode 100644 index 56e92a1e7..000000000 --- a/website/docs/usage/resources.jade +++ /dev/null @@ -1,118 +0,0 @@ -//- πŸ’« DOCS > USAGE > RESOURCES - -include ../../_includes/_mixins - -p Many of the associated tools and resources that we're developing alongside spaCy can be found in their own repositories. - -+h(2, "developer") Developer tools - -+table(["Name", "Description"]) - +row - +cell - +src(gh("spacy-models")) spaCy Models - - +cell - | Model releases for spaCy. - - +row - +cell - +src(gh("spacy-dev-resources")) spaCy Dev Resources - - +cell - | Scripts, tools and resources for developing spaCy, adding new - | languages and training new models. - - +row - +cell - +src("spacy-benchmarks") spaCy Benchmarks - - +cell - | Runtime performance comparison of spaCy against other NLP - | libraries. - - +row - +cell - +src(gh("spacy-services")) spaCy Services - - +cell - | REST microservices for spaCy demos and visualisers. - - +row - +cell - +src(gh("spacy-notebooks")) spaCy Notebooks - - +cell - | Jupyter notebooks for spaCy examples and tutorials. - -+h(2, "libraries") Libraries and projects -+table(["Name", "Description"]) - +row - +cell - +src(gh("sense2vec")) sense2vec - - +cell - | Use spaCy to go beyond vanilla - | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec]. - -+h(2, "utility") Utility libraries and dependencies - -+table(["Name", "Description"]) - +row - +cell - +src(gh("thinc")) Thinc - - +cell - | spaCy's Machine Learning library for NLP in Python. - - +row - +cell - +src(gh("cymem")) Cymem - - +cell - | Gate Cython calls to malloc/free behind Python ref-counted - | objects. - - +row - +cell - +src(gh("preshed")) Preshed - - +cell - | Cython hash tables that assume keys are pre-hashed - - +row - +cell - +src(gh("murmurhash")) MurmurHash - - +cell - | Cython bindings for - | #[+a("https://en.wikipedia.org/wiki/MurmurHash") MurmurHash2]. - -+h(2, "visualizers") Visualisers and demos - -+table(["Name", "Description"]) - +row - +cell - +src(gh("displacy")) displaCy.js - - +cell - | A lightweight dependency visualisation library for the modern - | web, built with JavaScript, CSS and SVG. - | #[+a(DEMOS_URL + "/displacy") Demo here]. - - +row - +cell - +src(gh("displacy-ent")) displaCy#[sup ENT] - - +cell - | A lightweight and modern named entity visualisation library - | built with JavaScript and CSS. - | #[+a(DEMOS_URL + "/displacy-ent") Demo here]. - - +row - +cell - +src(gh("sense2vec-demo")) sense2vec Demo - - +cell - | Source of our Semantic Analysis of the Reddit Hivemind - | #[+a(DEMOS_URL + "/sense2vec") demo] using - | #[+a(gh("sense2vec")) sense2vec]. 
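A short sketch tying together the two usage patterns documented in the docs patches above: disabling pipeline components with the new disable keyword, and replacing the tokenizer by assigning to nlp.tokenizer. It assumes a v2-style 'en' model is installed; the example text and assertions are illustrative and not taken from the docs themselves.

    import spacy
    from spacy.tokens import Doc

    # Load with the parser disabled, as described in the dependency-parse and
    # pipeline docs: the component is skipped, so no dependency parse is set.
    nlp = spacy.load('en', disable=['parser'])
    doc = nlp(u'This text is tagged but not parsed.')
    assert not doc.is_parsed

    # Swap in a whitespace tokenizer by writing to nlp.tokenizer, using the
    # two-step construction from the customizing-tokenizer docs: the class is
    # initialised with the vocab of the already-loaded pipeline.
    class WhitespaceTokenizer(object):
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(' ')
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)

    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    doc = nlp(u"don't split punctuation")
    assert doc[0].text == u"don't"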
From fe2b0b8b8ded38fa6ba59f951f2ca437d64d8521 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 00:56:35 +0200 Subject: [PATCH 044/118] Update migrating docs --- website/docs/usage/v2.jade | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index a058c5c13..9bf32bf96 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -260,12 +260,16 @@ p +h(3, "migrating-saving-loading") Saving, loading and serialization -+h(2, "migrating") Migrating from spaCy 1.x p | Double-check all calls to #[code spacy.load()] and make sure they don't - | use the #[code path] keyword argument. + | use the #[code path] keyword argument. If you're only loading in binary + | data and not a model package that can construct its own #[code Language] + | class and pipeline, you should now use the + | #[+api("language#from_disk") #[code Language.from_disk()]] method. -+code-new nlp = spacy.load('/model') ++code-new. + nlp = spacy.load('/model') + nlp = English().from_disk('/model/data') +code-old nlp = spacy.load('en', path='/model') p @@ -288,15 +292,26 @@ p | If you're importing language data or #[code Language] classes, make sure | to change your import statements to import from #[code spacy.lang]. If | you've added your own custom language, it needs to be moved to - | #[code spacy/lang/xx]. + | #[code spacy/lang/xx] and adjusted accordingly. +code-new from spacy.lang.en import English +code-old from spacy.en import English p - | All components, e.g. tokenizer exceptions, are now responsible for - | compiling their data in the correct format. The language_data.py files - | have been removed + | If you've been using custom pipeline components, check out the new + | guide on #[+a("/docs/usage/language-processing-pipelines") processing pipelines]. + | Appending functions to the pipeline still works – but you might be able + | to make this more convenient by registering "component factories". + | Components of the processing pipeline can now be disabled by passing a + | list of their names to the #[code disable] keyword argument on loading + | or processing. + ++code-new. + nlp = spacy.load('en', disable=['tagger', 'ner']) + doc = nlp(u"I don't want parsed", disable=['parser']) ++code-old. + nlp = spacy.load('en', tagger=False, entity=False) + doc = nlp(u"I don't want parsed", parse=False) +h(3, "migrating-matcher") Adding patterns and callbacks to the matcher From 87c976e04c15ff9c440d875a93f7937398cdf8a5 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 01:58:22 +0200 Subject: [PATCH 045/118] Update model tag --- website/docs/usage/pos-tagging.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/pos-tagging.jade b/website/docs/usage/pos-tagging.jade index 245156b77..dd72efeba 100644 --- a/website/docs/usage/pos-tagging.jade +++ b/website/docs/usage/pos-tagging.jade @@ -8,7 +8,7 @@ p | processes. They can also be useful features in some statistical models. 
+h(2, "101") Part-of-speech tagging 101 - +tag-model("dependency parse") + +tag-model("tagger", "dependency parse") include _spacy-101/_pos-deps From 4b5540cc63a611812d98477901b3fae60fff6700 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 01:58:33 +0200 Subject: [PATCH 046/118] Rewrite examples in lightning tour --- website/docs/usage/lightning-tour.jade | 260 +++++++++++++------------ 1 file changed, 134 insertions(+), 126 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 24654b853..a946beb55 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -6,40 +6,138 @@ p | The following examples and code snippets give you an overview of spaCy's | functionality and its usage. -+h(2, "models") Install and load models ++h(2, "models") Install models and process text +code(false, "bash"). python -m spacy download en + python -m spacy download de +code. import spacy nlp = spacy.load('en') + doc = nlp(u'Hello, world. Here are two sentences.') -+h(2, "examples-resources") Load resources and process text + nlp_de = spacy.load('de') + doc_de = nlp_de(u'Ich bin ein Berliner.') + ++infobox + | #[strong API:] #[+api("spacy#load") #[code spacy.load()]] + | #[strong Usage:] #[+a("/docs/usage/models") Models], + | #[+a("/docs/usage/spacy-101") spaCy 101] + ++h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences + +tag-model("dependency parse") + ++code. + doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " + u"emoji. It's outranking eggplant πŸ‘ ") + + assert doc[0].text == u'Peach' + assert doc[1].text == u'emoji' + assert doc[-1].text == u'πŸ‘' + assert doc[17:19] == u'outranking eggplant' + assert doc.noun_chunks[0].text == u'Peach emoji' + + sentences = list(doc.sents) + assert len(sentences) == 3 + assert sentences[0].text == u'Peach is the superior emoji.' + ++infobox + | #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101] + ++h(2, "examples-pos-tags") Get part-of-speech tags and flags + +tag-model("tagger") + ++code. + doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') + apple = doc[0] + assert [apple.pos_, apple.pos] == [u'PROPN', 94] + assert [apple.tag_, apple.tag] == [u'NNP', 475] + assert [apple.shape_, apple.shape] == [u'Xxxxx', 684] + assert apple.is_alpha == True + assert apple.is_punct == False + + billion = doc[10] + assert billion.is_digit == False + assert billion.like_num == True + assert billion.like_email == False + ++infobox + | #[strong API:] #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] + ++h(2, "examples-integer-ids") Use integer IDs for any string + ++code. + hello_id = nlp.vocab.strings['Hello'] + hello_str = nlp.vocab.strings[hello_id] + assert token.text == hello_id == 3125 + assert token.text == hello_str == 'Hello' + ++h(2, "examples-entities") Recongnise and update named entities + +tag-model("NER") + ++code. 
+ doc = nlp(u'San Francisco considers banning sidewalk delivery robots') + ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] + assert ents == [(u'San Francisco', 0, 13, u'GPE')] + + from spacy.tokens import Span + doc = nlp(u'Netflix is hiring a new VP of global policy') + doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] + ents = [(e.start_char, e.end_char, e.label_) for ent in doc.ents] + assert ents == [(0, 7, u'ORG')] + ++infobox + | #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition] + ++h(2, "displacy") Visualize a dependency parse and named entities in your browser + +tag-model("dependency parse", "NER") + ++code. + from spacy import displacy + + doc_dep = nlp(u'This is a sentence.') + displacy.serve(doc_dep, style='dep') + + doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at ' + u'Google in 2007, few people outside of the company took him seriously.') + displacy.serve(doc_ent, style='ent') + ++infobox + | #[strong API:] #[+api("displacy") #[code displacy]] + | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] + ++h(2, "examples-word-vectors") Word vectors + +tag-model("word vectors") + ++code. + doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") + apple = doc[0] + banana = doc[2] + pasta = doc[6] + hippo = doc[8] + assert apple.similarity(banana) > pasta.similarity(hippo) + ++infobox + | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] + ++h(2, "examples-serialization") Simple and efficient serialization +code. import spacy - en_nlp = spacy.load('en') - de_nlp = spacy.load('de') - en_doc = en_nlp(u'Hello, world. Here are two sentences.') - de_doc = de_nlp(u'ich bin ein Berliner.') + from spacy.tokens.doc import Doc -+h(2, "displacy-dep") Visualize a dependency parse in your browser + nlp = spacy.load('en') + moby_dick = open('moby_dick.txt', 'r') + doc = nlp(moby_dick) + doc.to_disk('/moby_dick.bin') -+code. - from spacy import displacy + new_doc = Doc().from_disk('/moby_dick.bin') - doc = nlp(u'This is a sentence.') - displacy.serve(doc, style='dep') - -+h(2, "displacy-ent") Visualize named entities in your browser - -+code. - from spacy import displacy - - doc = nlp(u'When Sebastian Thrun started working on self-driving cars at ' - u'Google in 2007, few people outside of the company took him seriously.') - displacy.serve(doc, style='ent') ++infobox + | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "multi-threaded") Multi-threaded generator @@ -52,37 +150,25 @@ p if i == 100: break -+h(2, "examples-tokens-sentences") Get tokens and sentences ++infobox + | #[strong API:] #[+api("doc") #[code Doc]] + | #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage] + ++h(2, "examples-dependencies") Get syntactic dependencies + +tag-model("dependency parse") +code. - token = doc[0] - sentence = next(doc.sents) - assert token is sentence[0] - assert sentence.text == 'Hello, world.' + def dependency_labels_to_root(token): + """Walk up the syntactic tree, collecting the arc labels.""" + dep_labels = [] + while token.head is not token: + dep_labels.append(token.dep) + token = token.head + return dep_labels -+h(2, "examples-integer-ids") Use integer IDs for any string - -+code. 
- hello_id = nlp.vocab.strings['Hello'] - hello_str = nlp.vocab.strings[hello_id] - - assert token.orth == hello_id == 3125 - assert token.orth_ == hello_str == 'Hello' - -+h(2, "examples-string-views-flags") Get and set string views and flags - -+code. - assert token.shape_ == 'Xxxxx' - for lexeme in nlp.vocab: - if lexeme.is_alpha: - lexeme.shape_ = 'W' - elif lexeme.is_digit: - lexeme.shape_ = 'D' - elif lexeme.is_punct: - lexeme.shape_ = 'P' - else: - lexeme.shape_ = 'M' - assert token.shape_ == 'W' ++infobox + | #[strong API:] #[+api("token") #[code Token]] + | #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse] +h(2, "examples-numpy-arrays") Export to numpy arrays @@ -97,70 +183,6 @@ p assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] -+h(2, "examples-word-vectors") Word vectors - -+code. - doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - - apples = doc[0] - oranges = doc[2] - boots = doc[6] - hippos = doc[8] - - assert apples.similarity(oranges) > boots.similarity(hippos) - -+h(2, "examples-pos-tags") Part-of-speech tags - -+code. - from spacy.parts_of_speech import ADV - - def is_adverb(token): - return token.pos == spacy.parts_of_speech.ADV - - # These are data-specific, so no constants are provided. You have to look - # up the IDs from the StringStore. - NNS = nlp.vocab.strings['NNS'] - NNPS = nlp.vocab.strings['NNPS'] - def is_plural_noun(token): - return token.tag == NNS or token.tag == NNPS - - def print_coarse_pos(token): - print(token.pos_) - - def print_fine_pos(token): - print(token.tag_) - -+h(2, "examples-dependencies") Syntactic dependencies - -+code. - def dependency_labels_to_root(token): - '''Walk up the syntactic tree, collecting the arc labels.''' - dep_labels = [] - while token.head is not token: - dep_labels.append(token.dep) - token = token.head - return dep_labels - -+h(2, "examples-entities") Named entities - -+code. - def iter_products(docs): - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PRODUCT': - yield ent - - def word_is_in_entity(word): - return word.ent_type != 0 - - def count_parent_verb_by_person(docs): - counts = defaultdict(lambda: defaultdict(int)) - for doc in docs: - for ent in doc.ents: - if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - counts[ent.orth_][ent.root.head.lemma_] += 1 - return counts - +h(2, "examples-inline") Calculate inline mark-up on original string +code. @@ -187,17 +209,3 @@ p string = string.replace('\n', '') string = string.replace('\t', ' ') return string - -+h(2, "examples-binary") Efficient binary serialization - -+code. 
- import spacy - from spacy.tokens.doc import Doc - - byte_string = doc.to_bytes() - open('moby_dick.bin', 'wb').write(byte_string) - - nlp = spacy.load('en') - for byte_string in Doc.read_bytes(open('moby_dick.bin', 'rb')): - doc = Doc(nlp.vocab) - doc.from_bytes(byte_string) From 467bbeadb8db8f1874f3b4f175624784aab7c570 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 May 2017 20:09:51 -0500 Subject: [PATCH 047/118] Add hidden layers for tagger --- spacy/pipeline.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 7ca2ed99d..98b79d709 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -119,7 +119,7 @@ class TokenVectorEncoder(object): assert tokvecs.shape[0] == len(doc) doc.tensor = tokvecs - def update(self, docs, golds, state=None, drop=0., sgd=None): + def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. @@ -199,7 +199,7 @@ class NeuralTagger(object): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) idx += 1 - def update(self, docs_tokvecs, golds, drop=0., sgd=None): + def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): docs, tokvecs = docs_tokvecs if self.model.nI is None: @@ -248,7 +248,8 @@ class NeuralTagger(object): vocab.morphology.lemmatizer) token_vector_width = pipeline[0].model.nO self.model = with_flatten( - Softmax(self.vocab.morphology.n_tags, token_vector_width)) + chain(Maxout(token_vector_width, token_vector_width), + Softmax(self.vocab.morphology.n_tags, token_vector_width))) def use_params(self, params): with self.model.use_params(params): @@ -274,7 +275,8 @@ class NeuralLabeller(NeuralTagger): self.labels[dep] = len(self.labels) token_vector_width = pipeline[0].model.nO self.model = with_flatten( - Softmax(len(self.labels), token_vector_width)) + chain(Maxout(token_vector_width, token_vector_width), + Softmax(len(self.labels), token_vector_width))) def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) From 135a13790c68296fd120f108107ba33ca0afc33a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 May 2017 20:10:20 -0500 Subject: [PATCH 048/118] Disable gold preprocessing --- spacy/cli/train.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 07e97fe1e..bba972df1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -68,14 +68,16 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, print("Itn.\tDep. 
Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") for i in range(n_iter): with tqdm.tqdm(total=n_train_docs) as pbar: - train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True) + train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True, + gold_preproc=False) + losses = {} idx = 0 while idx < n_train_docs: batch = list(cytoolz.take(int(batch_size), train_docs)) if not batch: break docs, golds = zip(*batch) - nlp.update(docs, golds, drop=dropout, sgd=optimizer) + nlp.update(docs, golds, drop=dropout, sgd=optimizer, losses=losses) pbar.update(len(docs)) idx += len(docs) batch_size *= batch_accel @@ -83,12 +85,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx) with nlp.use_params(optimizer.averages): start = timer() - scorer = nlp.evaluate(corpus.dev_docs(nlp)) + scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) end = timer() n_words = scorer.tokens.tp + scorer.tokens.fn assert n_words != 0 wps = n_words / (end-start) - print_progress(i, {}, scorer.scores, wps=wps) + print_progress(i, losses, scorer.scores, wps=wps) with (output_path / 'model.bin').open('wb') as file_: with nlp.use_params(optimizer.averages): dill.dump(nlp, file_, -1) @@ -109,9 +111,10 @@ def print_progress(itn, losses, dev_scores, wps=0.0): for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_p', 'ents_r', 'ents_f', 'wps']: scores[col] = 0.0 - scores.update(losses) + scores['dep_loss'] = losses.get('parser', 0.0) + scores['tag_loss'] = losses.get('tagger', 0.0) scores.update(dev_scores) - scores[wps] = wps + scores['wps'] = wps tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', From e6cc927ab17e052f09f62c7c57b10e9d0abdb41c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 May 2017 20:10:54 -0500 Subject: [PATCH 049/118] Rearrange multi-task learning --- spacy/language.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 23bbe1719..d48fec048 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -6,7 +6,8 @@ import dill import numpy from thinc.neural import Model from thinc.neural.ops import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam +from thinc.neural.optimizers import Adam, SGD +import random from .tokenizer import Tokenizer from .vocab import Vocab @@ -194,7 +195,7 @@ class Language(object): proc(doc) return doc - def update(self, docs, golds, drop=0., sgd=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -211,12 +212,20 @@ class Language(object): """ tok2vec = self.pipeline[0] feats = tok2vec.doc2feats(docs) - for proc in self.pipeline[1:]: + procs = list(self.pipeline[1:]) + random.shuffle(procs) + grads = {} + def get_grads(W, dW, key=None): + grads[key] = (W, dW) + for proc in procs: if not hasattr(proc, 'update'): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) - d_tokvecses = proc.update((docs, tokvecses), golds, sgd=sgd, drop=drop) + d_tokvecses = proc.update((docs, tokvecses), golds, + drop=drop, sgd=sgd, losses=losses) bp_tokvecses(d_tokvecses, sgd=sgd) + for key, (W, dW) in grads.items(): + sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. # If we don't do this, the memory leak gets pretty # bad, because we may be holding part of a batch. 
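Patches 048 and 049 thread a shared losses dict through nlp.update() and shuffle the per-component updates behind the scenes. A rough sketch of the training loop this API yields, assuming a pipeline that has already been set up with begin_training() and an iterator of (doc, gold) pairs; the function name and the fixed batch size and dropout are illustrative – the CLI above instead accelerates the batch size and decays the dropout as training progresses.

    import cytoolz

    def train_epoch(nlp, optimizer, train_docs, batch_size=32, dropout=0.2):
        # Mirrors the loop in spacy/cli/train.py: draw fixed-size batches and
        # pass one losses dict that components can write their losses into.
        losses = {}
        train_docs = iter(train_docs)
        while True:
            batch = list(cytoolz.take(batch_size, train_docs))
            if not batch:
                break
            docs, golds = zip(*batch)
            # nlp.update() runs the shared tok2vec model once per component
            # and shuffles the order in which the components are updated.
            nlp.update(docs, golds, drop=dropout, sgd=optimizer, losses=losses)
        return losses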
From e1cb5be0c7a5d370d1329d38fdcb17dc7d09d3ee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 24 May 2017 20:11:41 -0500 Subject: [PATCH 050/118] Adjust dropout, depth and multi-task in parser --- spacy/syntax/nn_parser.pyx | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 6f23a08b5..645e5d9e6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -416,7 +416,9 @@ cdef class Parser: free(scores) free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None): + def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if losses is not None and self.name not in losses: + losses[self.name] = 0. docs, tokvec_lists = docs_tokvecs tokvecs = self.model[0].ops.flatten(tokvec_lists) if isinstance(docs, Doc) and isinstance(golds, GoldParse): @@ -436,18 +438,20 @@ cdef class Parser: backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. - while len(todo) >= 3: + while len(todo) >= 2: states, golds = zip(*todo) token_ids = self.get_token_ids(states) vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0) - mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) - vector *= mask + if drop != 0: + mask = vec2scores.ops.get_dropout_mask(vector.shape, drop) + vector *= mask scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) d_vector = bp_scores(d_scores, sgd=sgd) - d_vector *= mask + if drop != 0: + d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): @@ -461,10 +465,12 @@ cdef class Parser: backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) todo = [st for st in todo if not st[0].is_final()] - if len(backprops) >= 50: + if len(backprops) >= 20: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) backprops = [] + if losses is not None: + losses[self.name] += (d_scores**2).sum() if backprops: self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) From dcb10da61596aa2249882e7d7ca8a404fb33c6ea Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:15:56 +0200 Subject: [PATCH 051/118] Update and fix lightning tour examples --- website/docs/usage/lightning-tour.jade | 50 ++++++++++++++++---------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index a946beb55..473f10c5e 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -101,15 +101,15 @@ p doc_dep = nlp(u'This is a sentence.') displacy.serve(doc_dep, style='dep') - doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at ' - u'Google in 2007, few people outside of the company took him seriously.') + doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google ' + u'in 2007, few people outside of the company took him seriously.') displacy.serve(doc_ent, style='ent') +infobox | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] -+h(2, "examples-word-vectors") Word vectors ++h(2, "examples-word-vectors") Get word vectors and similarity +tag-model("word vectors") +code. 
@@ -119,6 +119,7 @@ p pasta = doc[6] hippo = doc[8] assert apple.similarity(banana) > pasta.similarity(hippo) + assert apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector +infobox | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] @@ -139,6 +140,23 @@ p +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] ++h(2, "rule-matcher") Match text with token rules + ++code. + import spacy + from spacy.matcher import Matcher + + nlp = spacy.load('en') + matcher = Matcher(nlp.vocab) + # match "Google I/O" or "Google i/o" + pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + matcher.add('GoogleIO', None, pattern) + matches = nlp(LOTS_OF TEXT) + ++infobox + | #[strong API:] #[+api("matcher") #[code Matcher]] + | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] + +h(2, "multi-threaded") Multi-threaded generator +code. @@ -183,28 +201,24 @@ p assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] -+h(2, "examples-inline") Calculate inline mark-up on original string ++h(2, "examples-inline") Calculate inline markup on original string +code. def put_spans_around_tokens(doc, get_classes): - '''Given some function to compute class names, put each token in a - span element, with the appropriate classes computed. - - All whitespace is preserved, outside of the spans. (Yes, I know HTML - won't display it. But the point is no information is lost, so you can - calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) - ''' + """Given some function to compute class names, put each token in a + span element, with the appropriate classes computed. All whitespace is + preserved, outside of the spans. (Of course, HTML won't display more than + one whitespace character it – but the point is, no information is lost + and you can calculate what you need, e.g. <br />, <p> etc.) + """ output = [] - template = '{word}{space}' + html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: - output.append(token.orth_) + output.append(token.text) else: - output.append( - template.format( - classes=' '.join(get_classes(token)), - word=token.orth_, - space=token.whitespace_)) + classes = ' '.join(get_classes(token)) + output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') From b2324be3e90d40f9442d326763d8dd9622603562 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:17:21 +0200 Subject: [PATCH 052/118] Fix typos, text, examples and formatting --- website/docs/usage/_data.json | 2 +- website/docs/usage/_spacy-101/_pipelines.jade | 4 +- website/docs/usage/_spacy-101/_pos-deps.jade | 2 +- .../docs/usage/_spacy-101/_serialization.jade | 5 ++ .../docs/usage/_spacy-101/_tokenization.jade | 10 ++-- .../docs/usage/_spacy-101/_word-vectors.jade | 2 +- website/docs/usage/entity-recognition.jade | 2 +- .../usage/language-processing-pipeline.jade | 3 +- website/docs/usage/production-use.jade | 8 +-- website/docs/usage/saving-loading.jade | 2 +- website/docs/usage/spacy-101.jade | 6 +++ website/docs/usage/visualizers.jade | 50 +++++++++---------- 12 files changed, 51 insertions(+), 45 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 9f51df5c4..a611151b3 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -7,7 +7,7 @@ "Lightning tour": "lightning-tour", "What's new in v2.0": "v2" }, - "Workflows": { + "Guides": { "POS tagging": "pos-tagging", "Using the parse": "dependency-parse", "Entity recognition": "entity-recognition", diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index d984a4708..db095ef04 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -2,9 +2,9 @@ p | When you call #[code nlp] on a text, spaCy first tokenizes the text to - | produce a #[code Doc] object. The #[code Doc] is the processed in several + | produce a #[code Doc] object. The #[code Doc] is then processed in several | different steps – this is also referred to as the - | #[strong processing pipeline]. The pipeline used by our + | #[strong processing pipeline]. The pipeline used by the | #[+a("/docs/usage/models") default models] consists of a | vectorizer, a tagger, a parser and an entity recognizer. 
Each pipeline | component returns the processed #[code Doc], which is then passed on to diff --git a/website/docs/usage/_spacy-101/_pos-deps.jade b/website/docs/usage/_spacy-101/_pos-deps.jade index 5aa719c23..b42847aee 100644 --- a/website/docs/usage/_spacy-101/_pos-deps.jade +++ b/website/docs/usage/_spacy-101/_pos-deps.jade @@ -28,7 +28,7 @@ p | #[strong Text:] The original word text.#[br] | #[strong Lemma:] The base form of the word.#[br] | #[strong POS:] The simple part-of-speech tag.#[br] - | #[strong Tag:] ...#[br] + | #[strong Tag:] The detailed part-of-speech tag.#[br] | #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br] | #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br] | #[strong is alpha:] Is the token an alpha character?#[br] diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade index b6a889014..f3926dd9c 100644 --- a/website/docs/usage/_spacy-101/_serialization.jade +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -33,3 +33,8 @@ p +annotation-row(["from_bytes", "object", "nlp.from_bytes(bytes)"], style) +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) + ++code. + moby_dick = open('moby_dick.txt', 'r') # open a large document + doc = nlp(moby_dick) # process it + doc.to_disk('/moby_dick.bin') # save the processed Doc diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 28fd448b4..64e3f5881 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -2,11 +2,11 @@ p | During processing, spaCy first #[strong tokenizes] the text, i.e. - | segments it into words, punctuation and so on. For example, punctuation - | at the end of a sentence should be split off – whereas "U.K." should - | remain one token. This is done by applying rules specific to each - | language. Each #[code Doc] consists of individual tokens, and we can - | simply iterate over them: + | segments it into words, punctuation and so on. This is done by applying + | rules specific to each language. For example, punctuation at the end of a + | sentence should be split off – whereas "U.K." should remain one token. + | Each #[code Doc] consists of individual tokens, and we can simply iterate + | over them: +code. for token in doc: diff --git a/website/docs/usage/_spacy-101/_word-vectors.jade b/website/docs/usage/_spacy-101/_word-vectors.jade index 4ed8e4c78..cbb9d06f2 100644 --- a/website/docs/usage/_spacy-101/_word-vectors.jade +++ b/website/docs/usage/_spacy-101/_word-vectors.jade @@ -6,7 +6,7 @@ p | vectors can be generated using an algorithm like | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's | #[+a("/docs/usage/models") default models] come with - | #[strong 300-dimensional vectors], that look like this: + | #[strong 300-dimensional vectors] that look like this: +code("banana.vector", false, false, 250). 
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index bcad07baa..527c14dde 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -52,7 +52,7 @@ p assert ent_san == [u'San', u'B', u'GPE'] assert ent_francisco == [u'Francisco', u'I', u'GPE'] -+table(["Text", "ent_iob", "ent.iob_", "ent_type", "ent_type_", "Description"]) ++table(["Text", "ent_iob", "ent_iob_", "ent_type", "ent_type_", "Description"]) - var style = [0, 1, 1, 1, 1, 0] +annotation-row(["San", 3, "B", 381, "GPE", "beginning of an entity"], style) +annotation-row(["Francisco", 1, "I", 381, "GPE", "inside an entity"], style) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 8bb92caae..948212d82 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -344,8 +344,7 @@ p | Since spaCy v2.0 comes with better support for customising the | processing pipeline components, the #[code parser], #[code tagger] | and #[code entity] keyword arguments have been replaced with - | #[code disable], which takes a list of - | #[+a("/docs/usage/language-processing-pipeline") pipeline component names]. + | #[code disable], which takes a list of pipeline component names. | This lets you disable both default and custom components when loading | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade index 68a313d8a..c7f872c6d 100644 --- a/website/docs/usage/production-use.jade +++ b/website/docs/usage/production-use.jade @@ -2,16 +2,12 @@ include ../../_includes/_mixins -p - | Once you have loaded the #[code nlp] object, you can call it as though - | it were a function. This allows you to process a single unicode string. - +h(2, "multithreading") Multi-threading with #[code .pipe()] p | If you have a sequence of documents to process, you should use the - | #[+api("language#pipe") #[code .pipe()]] method. The #[code .pipe()] - | method takes an iterator of texts, and accumulates an internal buffer, + | #[+api("language#pipe") #[code .pipe()]] method. The method takes an + | iterator of texts, and accumulates an internal buffer, | which it works on in parallel. It then yields the documents in order, | one-by-one. After a long and bitter struggle, the global interpreter | lock was freed around spaCy's main parsing loop in v0.100.3. This means diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 413b86477..477db925c 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -209,5 +209,5 @@ p | spaCy v2.0 solves this with a clear distinction between setting up | the instance and loading the data. - +code-new nlp = English.from_disk('/path/to/data') + +code-new nlp = English().from_disk('/path/to/data') +code-old nlp = spacy.load('en', path='/path/to/data') diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index f8779b52f..47d49ad40 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -81,6 +81,12 @@ p nlp = spacy.load('en') doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion') +p + | Even though a #[code Doc] is processed – e.g. 
split into individual words + | and annotated – it still holds #[strong all information of the original text], + | like whitespace characters. This way, you'll never lose any information + | when processing text with spaCy. + +h(3, "annotations-token") Tokenization include _spacy-101/_tokenization diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 385fa0fd0..90a343700 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -180,8 +180,8 @@ p p | If you don't need the web server and just want to generate the markup | – for example, to export it to a file or serve it in a custom - | way – you can use #[+api("displacy#render") #[code displacy.render]] - | instead. It works the same, but returns a string containing the markup. + | way – you can use #[+api("displacy#render") #[code displacy.render]]. + | It works the same way, but returns a string containing the markup. +code("Example"). import spacy @@ -220,10 +220,32 @@ p | a standalone graphic.) So instead of rendering all #[code Doc]s at one, | loop over them and export them separately. + ++h(3, "examples-export-svg") Example: Export SVG graphics of dependency parses + ++code("Example"). + import spacy + from spacy import displacy + from pathlib import Path + + nlp = spacy.load('en') + sentences = ["This is an example.", "This is another one."] + for sent in sentences: + doc = nlp(sentence) + svg = displacy.render(doc, style='dep') + file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' + output_path = Path('/images/' + file_name) + output_path.open('w', encoding='utf-8').write(svg) + +p + | The above code will generate the dependency visualizations and them to + | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. + + +h(2, "jupyter") Using displaCy in Jupyter notebooks p - | displaCy is able to detect whether you're within a + | displaCy is able to detect whether you're working in a | #[+a("https://jupyter.org") Jupyter] notebook, and will return markup | that can be rendered in a cell straight away. When you export your | notebook, the visualizations will be included as HTML. @@ -257,28 +279,6 @@ p html = displacy.render(doc, style='dep') return display(HTML(html)) -+h(2, "examples") Usage examples - -+h(3, "examples-export-svg") Export SVG graphics of dependency parses - -+code("Example"). - import spacy - from spacy import displacy - from pathlib import Path - - nlp = spacy.load('en') - sentences = ["This is an example.", "This is another one."] - for sent in sentences: - doc = nlp(sentence) - svg = displacy.render(doc, style='dep') - file_name = '-'.join([w.text for w in doc if not w.is_punct]) + '.svg' - output_path = Path('/images/' + file_name) - output_path.open('w', encoding='utf-8').write(svg) - -p - | The above code will generate the dependency visualizations and them to - | two files, #[code This-is-an-example.svg] and #[code This-is-another-one.svg]. 
- +h(2, "manual-usage") Rendering data manually p From 9063654a1ad2dd2b9b04f39b34ccf5395953f4b9 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 25 May 2017 11:18:02 +0200 Subject: [PATCH 053/118] Add Training 101 stub --- website/docs/usage/_spacy-101/_training.jade | 3 +++ website/docs/usage/spacy-101.jade | 4 ++++ website/docs/usage/training.jade | 4 ++++ 3 files changed, 11 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_training.jade diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade new file mode 100644 index 000000000..59861434c --- /dev/null +++ b/website/docs/usage/_spacy-101/_training.jade @@ -0,0 +1,3 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > TRAINING + +p diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 47d49ad40..9373f182a 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -117,6 +117,10 @@ include _spacy-101/_pipelines include _spacy-101/_serialization ++h(2, "training") Training + +include _spacy-101/_training + +h(2, "architecture") Architecture +image diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 8a5c111bd..9df71851a 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -6,6 +6,10 @@ p | Once the model is trained, you can then | #[+a("/docs/usage/saving-loading") save and load] it. ++h(2, "101") Training 101 + +include _spacy-101/_training + +h(2, "train-pos-tagger") Training the part-of-speech tagger +code. From b27c5878005fddb749bf36eabfb4497135b91bdf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 06:46:59 -0500 Subject: [PATCH 054/118] Fix pieces argument to PrecomputedMaxout --- spacy/_ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4667798b2..f589704a6 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -86,10 +86,10 @@ class PrecomputableAffine(Model): d_b=Gradient("b") ) class PrecomputableMaxouts(Model): - def __init__(self, nO=None, nI=None, nF=None, pieces=3, **kwargs): + def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs): Model.__init__(self, **kwargs) self.nO = nO - self.nP = pieces + self.nP = nP self.nI = nI self.nF = nF From 8500d9b1da9f0c4badabcc377c340283f66c0a17 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 06:47:42 -0500 Subject: [PATCH 055/118] Only train one task per iter, holding grads --- spacy/language.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d48fec048..65416f208 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -222,8 +222,9 @@ class Language(object): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, - drop=drop, sgd=sgd, losses=losses) - bp_tokvecses(d_tokvecses, sgd=sgd) + drop=drop, sgd=get_grads, losses=losses) + bp_tokvecses(d_tokvecses, sgd=get_grads) + break for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. 
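
The update logic in the patch above passes `get_grads`, a small callback that only records gradients, in place of the optimizer, and then applies the real optimizer once per parameter key afterwards. A minimal sketch of that accumulate-then-apply pattern (`apply_accumulated` and `real_optimizer` are illustrative names, not part of the patch):

    grads = {}

    def get_grads(W, dW, key=None):
        # Record the gradient for this parameter instead of updating it now.
        grads[key] = (W, dW)

    def apply_accumulated(real_optimizer):
        # Apply the real optimizer once per parameter, after every component
        # has contributed its gradients.
        for key, (W, dW) in grads.items():
            real_optimizer(W, dW, key=key)

Holding the gradients like this lets several pipeline components contribute to shared weights, such as the token-to-vector layer, before a single combined update is made.
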
From 679efe79c8f1dc2615d0f1534ca70f24d93cf86e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 06:49:00 -0500 Subject: [PATCH 056/118] Make parser update less hacky --- spacy/syntax/nn_parser.pyx | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 645e5d9e6..cc76d5e7f 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -438,7 +438,7 @@ cdef class Parser: backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. - while len(todo) >= 2: + while todo: states, golds = zip(*todo) token_ids = self.get_token_ids(states) @@ -465,15 +465,10 @@ cdef class Parser: backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) todo = [st for st in todo if not st[0].is_final()] - if len(backprops) >= 20: - self._make_updates(d_tokvecs, - backprops, sgd, cuda_stream) - backprops = [] if losses is not None: losses[self.name] += (d_scores**2).sum() - if backprops: - self._make_updates(d_tokvecs, - backprops, sgd, cuda_stream) + self._make_updates(d_tokvecs, + backprops, sgd, cuda_stream) return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): From c245ff6b27a62ba64437294027341fb1c329a6fd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 11:18:59 -0500 Subject: [PATCH 057/118] Rebatch parser inputs, with mid-sentence states --- spacy/syntax/nn_parser.pyx | 51 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index cc76d5e7f..e1f7871de 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -426,9 +426,11 @@ cdef class Parser: golds = [golds] cuda_stream = get_cuda_stream() - golds = [self.moves.preprocess_gold(g) for g in golds] - states = self.moves.init_batch(docs) + states, golds = self._init_gold_batch(docs, golds) + max_length = min([len(doc) for doc in docs]) + #golds = [self.moves.preprocess_gold(g) for g in golds] + #states = self.moves.init_batch(docs) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) @@ -438,6 +440,7 @@ cdef class Parser: backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. + #while len(todo and len(todo) >= len(states): while todo: states, golds = zip(*todo) @@ -467,10 +470,54 @@ cdef class Parser: todo = [st for st in todo if not st[0].is_final()] if losses is not None: losses[self.name] += (d_scores**2).sum() + if len(backprops) >= (max_length * 2): + break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + def _init_gold_batch(self, docs, golds): + """Make a square batch, of length equal to the shortest doc. A long + doc will get multiple states. Let's say we have a doc of length 2*N, + where N is the shortest doc. 
We'll make two states, one representing + long_doc[:N], and another representing long_doc[N:].""" + cdef StateClass state + lengths = [len(doc) for doc in docs] + # Cap to min length + min_length = min(lengths) + offset = 0 + states = [] + extra_golds = [] + cdef np.ndarray py_costs = numpy.zeros((self.moves.n_moves,), dtype='f') + cdef np.ndarray py_is_valid = numpy.zeros((self.moves.n_moves,), dtype='i') + costs = py_costs.data + is_valid = py_is_valid.data + for doc, gold in zip(docs, golds): + gold = self.moves.preprocess_gold(gold) + state = StateClass(doc, offset=offset) + self.moves.initialize_state(state.c) + states.append(state) + extra_golds.append(gold) + start = min(min_length, len(doc)) + while start < len(doc): + length = min(min_length, len(doc)-start) + state = StateClass(doc, offset=offset) + self.moves.initialize_state(state.c) + while state.B(0) < start and not state.is_final(): + py_is_valid.fill(0) + py_costs.fill(0) + self.moves.set_costs(is_valid, costs, state, gold) + for i in range(self.moves.n_moves): + if is_valid[i] and costs[i] <= 0: + self.moves.c[i].do(state.c, self.moves.c[i].label) + break + start += length + if not state.is_final(): + states.append(state) + extra_golds.append(gold) + offset += len(doc) + return states, extra_golds + def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): # Tells CUDA to block, so our async copies complete. if cuda_stream is not None: From f403c2cd5f62a3213a9348597b4f779ac558416e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 11:19:26 -0500 Subject: [PATCH 058/118] Add env opts for optimizer --- spacy/language.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 65416f208..18fdfccc2 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -276,7 +276,15 @@ class Language(object): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) contexts.append(context) - optimizer = Adam(Model.ops, 0.001) + learn_rate = util.env_opt('learn_rate', 0.001) + beta1 = util.env_opt('optimizer_B1', 0.9) + beta2 = util.env_opt('optimizer_B2', 0.999) + eps = util.env_opt('optimizer_eps', 1e-08) + L2 = util.env_opt('L2_penalty', 1e-6) + max_grad_norm = util.env_opt('grad_norm_clip', 1.) + optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, + beta2=beta2, eps=eps) + optimizer.max_grad_norm = max_grad_norm return optimizer def evaluate(self, docs_golds): From 2cb7cc2db772e93fddb3dd84b0c34f3a956aa574 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 14:55:09 -0500 Subject: [PATCH 059/118] Remove commented code from parser --- spacy/syntax/nn_parser.pyx | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index e1f7871de..341b8c041 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -429,18 +429,14 @@ cdef class Parser: states, golds = self._init_gold_batch(docs, golds) max_length = min([len(doc) for doc in docs]) - #golds = [self.moves.preprocess_gold(g) for g in golds] - #states = self.moves.init_batch(docs) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) - todo = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. 
- #while len(todo and len(todo) >= len(states): while todo: states, golds = zip(*todo) @@ -483,34 +479,33 @@ cdef class Parser: long_doc[:N], and another representing long_doc[N:].""" cdef StateClass state lengths = [len(doc) for doc in docs] - # Cap to min length min_length = min(lengths) offset = 0 states = [] extra_golds = [] - cdef np.ndarray py_costs = numpy.zeros((self.moves.n_moves,), dtype='f') - cdef np.ndarray py_is_valid = numpy.zeros((self.moves.n_moves,), dtype='i') - costs = py_costs.data - is_valid = py_is_valid.data + cdef Pool mem = Pool() + costs = mem.alloc(self.moves.n_moves, sizeof(float)) + is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) for doc, gold in zip(docs, golds): gold = self.moves.preprocess_gold(gold) state = StateClass(doc, offset=offset) self.moves.initialize_state(state.c) - states.append(state) - extra_golds.append(gold) + if not state.is_final(): + states.append(state) + extra_golds.append(gold) start = min(min_length, len(doc)) while start < len(doc): length = min(min_length, len(doc)-start) state = StateClass(doc, offset=offset) self.moves.initialize_state(state.c) while state.B(0) < start and not state.is_final(): - py_is_valid.fill(0) - py_costs.fill(0) self.moves.set_costs(is_valid, costs, state, gold) for i in range(self.moves.n_moves): if is_valid[i] and costs[i] <= 0: self.moves.c[i].do(state.c, self.moves.c[i].label) break + else: + raise ValueError("Could not find gold move") start += length if not state.is_final(): states.append(state) From b9cea9cd93bd6359b7450463cc798aeb1a9bb6d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 16:16:10 -0500 Subject: [PATCH 060/118] Add compounding and decaying functions --- spacy/util.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index f27df54a8..54a6d17b5 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -313,6 +313,36 @@ def normalize_slice(length, start, stop, step=None): return start, stop +def compounding(start, stop, compound): + '''Yield an infinite series of compounding values. Each time the + generator is called, a value is produced by multiplying the previous + value by the compound rate. + + EXAMPLE + + >>> sizes = compounding(1., 10., 1.5) + >>> assert next(sizes) == 1. + >>> assert next(sizes) == 1 * 1.5 + >>> assert next(sizes) == 1.5 * 1.5 + ''' + def clip(value): + return max(value, stop) if (start>stop) else min(value, start) + curr = float(start) + while True: + yield clip(curr) + curr *= compound + + +def decaying(start, stop, decay): + '''Yield an infinite series of linearly decaying values.''' + def clip(value): + return max(value, stop) if (start>stop) else min(value, start) + nr_upd = 1. + while True: + yield clip(start * 1./(1. 
+ decay * nr_upd)) + nr_upd += 1 + + def check_renamed_kwargs(renamed, kwargs): for old, new in renamed.items(): if old in kwargs: From 702fe74a4dd3d757ed315b246d2954a40f1f5bd1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 16:16:30 -0500 Subject: [PATCH 061/118] Clean up spacy.cli.train --- spacy/cli/train.py | 51 ++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index bba972df1..8a90b8b7d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -14,7 +14,7 @@ from timeit import default_timer as timer from ..tokens.doc import Doc from ..scorer import Scorer from ..gold import GoldParse, merge_sents -from ..gold import GoldCorpus +from ..gold import GoldCorpus, minibatch from ..util import prints from .. import util from .. import displacy @@ -53,44 +53,38 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') if no_entities and 'entities' in pipeline: pipeline.remove('entities') + # Take dropout and batch size as generators of values -- dropout + # starts high and decays sharply, to force the optimizer to explore. + # Batch size starts at 1 and grows, so that we make updates quickly + # at the beginning of training. + dropout_rates = util.decaying(util.env_opt('dropout_from', 0.0), + util.env_opt('dropout_to', 0.0), + util.env_opt('dropout_decay', 0.0)) + batch_sizes = util.compounding(util.env_opt('batch_from', 1), + util.env_opt('batch_to', 64), + util.env_opt('batch_compound', 1.001)) + nlp = lang_class(pipeline=pipeline) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) - - dropout = util.env_opt('dropout', 0.0) - dropout_decay = util.env_opt('dropout_decay', 0.0) - orig_dropout = dropout + n_train_docs = corpus.count_train() optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) - n_train_docs = corpus.count_train() - batch_size = float(util.env_opt('min_batch_size', 4)) - max_batch_size = util.env_opt('max_batch_size', 64) - batch_accel = util.env_opt('batch_accel', 1.001) + print("Itn.\tDep. 
Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") for i in range(n_iter): - with tqdm.tqdm(total=n_train_docs) as pbar: - train_docs = corpus.train_docs(nlp, shuffle=i, projectivize=True, - gold_preproc=False) + with tqdm.tqdm(total=corpus.count_train()) as pbar: + train_docs = corpus.train_docs(nlp, projectivize=True, + gold_preproc=False, shuffle=i) losses = {} - idx = 0 - while idx < n_train_docs: - batch = list(cytoolz.take(int(batch_size), train_docs)) - if not batch: - break + for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) - nlp.update(docs, golds, drop=dropout, sgd=optimizer, losses=losses) + nlp.update(docs, golds, sgd=optimizer, + drop=next(dropout_rates), losses=losses) pbar.update(len(docs)) - idx += len(docs) - batch_size *= batch_accel - batch_size = min(batch_size, max_batch_size) - dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx) + with nlp.use_params(optimizer.averages): - start = timer() scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) - end = timer() - n_words = scorer.tokens.tp + scorer.tokens.fn - assert n_words != 0 - wps = n_words / (end-start) - print_progress(i, losses, scorer.scores, wps=wps) + print_progress(i, losses, scorer.scores) with (output_path / 'model.bin').open('wb') as file_: with nlp.use_params(optimizer.averages): dill.dump(nlp, file_, -1) @@ -118,7 +112,6 @@ def print_progress(itn, losses, dev_scores, wps=0.0): tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', - '{tag_loss:.3f}', '{uas:.3f}', '{ents_p:.3f}', '{ents_r:.3f}', From 3a6e59cc53fd49293336ced657050022aedb1df5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 17:15:09 -0500 Subject: [PATCH 062/118] Add minibatch function in spacy.gold --- spacy/gold.pyx | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 53bd25890..579010e6d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -6,6 +6,7 @@ import io import re import ujson import random +import cytoolz from .syntax import nonproj from .util import ensure_path @@ -141,6 +142,19 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] +def minibatch(items, size=8): + '''Iterate over batches of items. `size` may be an iterator, + so that batch-size can vary on each step. + ''' + items = iter(items) + while True: + batch_size = next(size) #if hasattr(size, '__next__') else size + batch = list(cytoolz.take(int(batch_size), items)) + if len(batch) == 0: + break + yield list(batch) + + class GoldCorpus(object): """An annotated corpus, using the JSON file format. 
Manages annotations for tagging, dependency parsing and NER.""" @@ -396,7 +410,10 @@ cdef class GoldParse: else: self.words[i] = words[gold_i] self.tags[i] = tags[gold_i] - self.heads[i] = self.gold_to_cand[heads[gold_i]] + if heads[gold_i] is None: + self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] self.labels[i] = deps[gold_i] self.ner[i] = entities[gold_i] From df8015f05d6b70b9ceac50b1156a9c157c06473c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 17:15:24 -0500 Subject: [PATCH 063/118] Tweaks to train script --- spacy/cli/train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8a90b8b7d..ee0ee53a2 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. - dropout_rates = util.decaying(util.env_opt('dropout_from', 0.0), - util.env_opt('dropout_to', 0.0), - util.env_opt('dropout_decay', 0.0)) + dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5), + util.env_opt('dropout_to', 0.2), + util.env_opt('dropout_decay', 1e-4)) batch_sizes = util.compounding(util.env_opt('batch_from', 1), util.env_opt('batch_to', 64), util.env_opt('batch_compound', 1.001)) @@ -72,7 +72,7 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") for i in range(n_iter): - with tqdm.tqdm(total=corpus.count_train()) as pbar: + with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: train_docs = corpus.train_docs(nlp, projectivize=True, gold_preproc=False, shuffle=i) losses = {} From 80cf42e33b83490f9bb81c63c85bb8409d35cebb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 17:15:39 -0500 Subject: [PATCH 064/118] Fix compounding and decaying utils --- spacy/util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 54a6d17b5..c0768ff23 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -326,7 +326,7 @@ def compounding(start, stop, compound): >>> assert next(sizes) == 1.5 * 1.5 ''' def clip(value): - return max(value, stop) if (start>stop) else min(value, start) + return max(value, stop) if (start>stop) else min(value, stop) curr = float(start) while True: yield clip(curr) @@ -336,7 +336,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): '''Yield an infinite series of linearly decaying values.''' def clip(value): - return max(value, stop) if (start>stop) else min(value, start) + return max(value, stop) if (start>stop) else min(value, stop) nr_upd = 1. while True: yield clip(start * 1./(1. 
+ decay * nr_upd)) From 82b11b0320bf6732824ca0fcba92bb6904f6b50a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 17:15:59 -0500 Subject: [PATCH 065/118] Remove print statement --- spacy/language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 18fdfccc2..b20bb4617 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -270,7 +270,6 @@ class Language(object): if cfg.get('use_gpu'): Model.ops = CupyOps() Model.Ops = CupyOps - print("Use GPU") for proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), From dbf2a4cf577f0e66bf1591289728ed4ec56d1c5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 19:46:56 -0500 Subject: [PATCH 066/118] Update all models on each epoch --- spacy/language.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index b20bb4617..1d9f232a7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -223,8 +223,7 @@ class Language(object): tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - bp_tokvecses(d_tokvecses, sgd=get_grads) - break + bp_tokvecses(d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. From 22d7b448a541863efd62b60e3b674f2a1b356af7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 25 May 2017 19:47:12 -0500 Subject: [PATCH 067/118] Fix convert command --- spacy/cli/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index c7730ab9e..847051e3f 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -25,7 +25,7 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(input_file, output_dir, n_sents, morphology): +def convert(_, input_file, output_dir, n_sents, morphology): """Convert files into JSON format for use with train command and other experiment management functions. """ @@ -39,4 +39,4 @@ def convert(input_file, output_dir, n_sents, morphology): if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, *args) + CONVERTERS[file_ext](input_path, output_path, n_sents, morphology) From 353f0ef8d750b0b96867e1e3f4922389ab8329bb Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:33:54 +0200 Subject: [PATCH 068/118] Use disable argument (list) for serialization --- spacy/language.py | 46 ++++++++++-------- website/docs/api/language.jade | 89 +++++++++++++++++++++++++++------- 2 files changed, 97 insertions(+), 38 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index b20bb4617..39e60c017 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -173,13 +173,13 @@ class Language(object): flat_list.append(pipe) self.pipeline = flat_list - def __call__(self, text, **disabled): + def __call__(self, text, disable=[]): """'Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. - **disabled: Elements of the pipeline that should not be run. + disable (list): Names of the pipeline components to disable. 
RETURNS (Doc): A container for accessing the annotations. EXAMPLE: @@ -190,7 +190,7 @@ class Language(object): doc = self.make_doc(text) for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue proc(doc) return doc @@ -323,7 +323,7 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): + def pipe(self, texts, n_threads=2, batch_size=1000, disable=[]): """Process texts as a stream, and yield `Doc` objects in order. Supports GIL-free multi-threading. @@ -331,7 +331,7 @@ class Language(object): n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. - **disabled: Pipeline components to exclude. + disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. EXAMPLE: @@ -343,7 +343,7 @@ class Language(object): docs = texts for proc in self.pipeline: name = getattr(proc, 'name', None) - if name in disabled and not disabled[name]: + if name in disable: continue if hasattr(proc, 'pipe'): docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) @@ -353,12 +353,14 @@ class Language(object): for doc in docs: yield doc - def to_disk(self, path, **exclude): - """Save the current state to a directory. + def to_disk(self, path, disable=[]): + """Save the current state to a directory. If a model is loaded, this + will include the model. path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being saved. + disable (list): Nameds of pipeline components to disable and prevent + from being saved. EXAMPLE: >>> nlp.to_disk('/path/to/models') @@ -370,7 +372,7 @@ class Language(object): raise IOError("Output path must be a directory") props = {} for name, value in self.__dict__.items(): - if name in exclude: + if name in disable: continue if hasattr(value, 'to_disk'): value.to_disk(path / name) @@ -379,13 +381,14 @@ class Language(object): with (path / 'props.pickle').open('wb') as file_: dill.dump(props, file_) - def from_disk(self, path, **exclude): + def from_disk(self, path, disable=[]): """Loads state from a directory. Modifies the object in place and - returns it. + returns it. If the saved `Language` object contains a model, the + model will be loaded. path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The modified `Language` object. EXAMPLE: @@ -394,35 +397,36 @@ class Language(object): """ path = util.ensure_path(path) for name in path.iterdir(): - if name not in exclude and hasattr(self, str(name)): + if name not in disable and hasattr(self, str(name)): getattr(self, name).from_disk(path / name) with (path / 'props.pickle').open('rb') as file_: bytes_data = file_.read() - self.from_bytes(bytes_data, **exclude) + self.from_bytes(bytes_data, disable) return self - def to_bytes(self, **exclude): + def to_bytes(self, disable=[]): """Serialize the current state to a binary string. - **exclude: Named attributes to prevent from being serialized. + disable (list): Nameds of pipeline components to disable and prevent + from being serialized. 
RETURNS (bytes): The serialized form of the `Language` object. """ props = dict(self.__dict__) - for key in exclude: + for key in disable: if key in props: props.pop(key) return dill.dumps(props, -1) - def from_bytes(self, bytes_data, **exclude): + def from_bytes(self, bytes_data, disable=[]): """Load state from a binary string. bytes_data (bytes): The data to load from. - **exclude: Named attributes to prevent from being loaded. + disable (list): Names of the pipeline components to disable. RETURNS (Language): The `Language` object. """ props = dill.loads(bytes_data) for key, value in props.items(): - if key not in exclude: + if key not in disable: setattr(self, key, value) return self diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 455165bca..a22bee5f1 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -73,15 +73,26 @@ p +cell The text to be processed. +row - +cell #[code **disabled] - +cell - - +cell Elements of the pipeline that should not be run. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Doc] +cell A container for accessing the annotations. ++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new doc = nlp(u"I don't want parsed", disable=['parser']) + +code-old doc = nlp(u"I don't want parsed", parse=False) + +h(2, "pipe") Language.pipe +tag method @@ -112,6 +123,13 @@ p +cell int +cell The number of texts to buffer. + +row + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. + +footrow +cell yields +cell #[code Doc] @@ -227,8 +245,11 @@ p +h(2, "to_disk") Language.to_disk +tag method + +tag-new(2) -p Save the current state to a directory. +p + | Save the current state to a directory. If a model is loaded, this will + | #[strong include the model]. +aside-code("Example"). nlp.to_disk('/path/to/models') @@ -242,14 +263,21 @@ p Save the current state to a directory. | Paths may be either strings or #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being saved. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being saved. +h(2, "from_disk") Language.from_disk +tag method + +tag-new(2) -p Loads state from a directory. Modifies the object in place and returns it. +p + | Loads state from a directory. Modifies the object in place and returns + | it. If the saved #[code Language] object contains a model, the + | #[strong model will be loaded]. +aside-code("Example"). from spacy.language import Language @@ -264,15 +292,28 @@ p Loads state from a directory. Modifies the object in place and returns it. | #[code Path]-like objects. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The modified #[code Language] object. 
++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy v2.0, the #[code save_to_directory] method has been + | renamed to #[code to_disk], to improve consistency across classes. + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new nlp = English().from_disk(disable=['tagger', 'ner']) + +code-old nlp = spacy.load('en', tagger=False, entity=False) + +h(2, "to_bytes") Language.to_bytes +tag method @@ -283,9 +324,12 @@ p Serialize the current state to a binary string. +table(["Name", "Type", "Description"]) +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being serialized. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable] + | and prevent from being serialized. +footrow +cell returns @@ -310,15 +354,26 @@ p Load state from a binary string. +cell The data to load from. +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell The #[code Language] object. ++infobox("⚠️ Deprecation note") + .o-block + | Pipeline components to prevent from being loaded can now be added as + | a list to #[code disable], instead of specifying one keyword argument + | per component. + + +code-new nlp = English().from_bytes(bytes, disable=['tagger', 'ner']) + +code-old nlp = English().from_bytes('en', tagger=False, entity=False) + +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) From 51882c49842c873db75c1f260091349c6295af28 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:37:45 +0200 Subject: [PATCH 069/118] Fix formatting --- spacy/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/util.py b/spacy/util.py index c0768ff23..e42bde810 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -174,6 +174,7 @@ def get_async(stream, numpy_array): array.set(numpy_array, stream=stream) return array + def itershuffle(iterable, bufsize=1000): """Shuffle an iterator. This works by holding `bufsize` items back and yielding them sometime later. 
Obviously, this is not unbiased -- From 10ca6d150725bf643fd4c576dc44daea410ad609 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:39:59 +0200 Subject: [PATCH 070/118] Set additional min-width on icons Prevents icons from being scaled in flexbox containers --- website/_includes/_mixins-base.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index c42994e8f..c6132df74 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -37,7 +37,8 @@ mixin svg(file, name, width, height) size - [integer] icon width and height (default: 20) mixin icon(name, size) - +svg("icons", name, size || 20).o-icon&attributes(attributes) + - var size = size || 20 + +svg("icons", name, size).o-icon(style="min-width: #{size}px")&attributes(attributes) //- Pro/Con/Neutral icon From ea9474f71c35f5e9e01d5428ddf59d762be8572b Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:42:36 +0200 Subject: [PATCH 071/118] Add version tag mixin to label new features --- website/_includes/_mixins.jade | 13 ++++++++++++- website/docs/api/displacy.jade | 2 ++ website/docs/api/doc.jade | 2 ++ website/docs/api/goldcorpus.jade | 1 + website/docs/api/matcher.jade | 5 +++++ website/docs/api/stringstore.jade | 2 ++ website/docs/api/tokenizer.jade | 2 ++ website/docs/api/util.jade | 3 +++ website/docs/api/vocab.jade | 2 ++ 9 files changed, 31 insertions(+), 1 deletion(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index f815d9c4a..fc4d66841 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -178,7 +178,7 @@ mixin label() //- Tag mixin tag() - span.u-text-tag.u-text-tag--spaced(aria-hidden="true") + span.u-text-tag.u-text-tag--spaced(aria-hidden="true")&attributes(attributes) block @@ -192,6 +192,17 @@ mixin tag-model(...capabs) +help(intro + ext + ".").u-color-theme +//- "New" tag to label features new in a specific version + By using a separate mixin with a version ID, it becomes easy to quickly + enable/disable tags without having to modify the markup in the docs. + version - [string or integer] version number, without "v" prefix + +mixin tag-new(version) + - var version = (typeof version == 'number') ? version.toFixed(1) : version + +tag(data-tooltip="This feature is new and was introduced in spaCy v#{version}.") + | v#{version} + + //- List type - [string] "numbers", "letters", "roman" (bulleted list if none set) start - [integer] start number diff --git a/website/docs/api/displacy.jade b/website/docs/api/displacy.jade index a5352ade8..a96d8a397 100644 --- a/website/docs/api/displacy.jade +++ b/website/docs/api/displacy.jade @@ -10,6 +10,7 @@ p +h(2, "serve") displacy.serve +tag method + +tag-new(2) p | Serve a dependency parse tree or named entity visualization to view it @@ -71,6 +72,7 @@ p +h(2, "render") displacy.render +tag method + +tag-new(2) p Render a dependency parse tree or named entity visualization. diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 62b1a2a76..bb56331f7 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -255,6 +255,7 @@ p +h(2, "to_disk") Doc.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -271,6 +272,7 @@ p Save the current state to a directory. +h(2, "from_disk") Doc.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. 
diff --git a/website/docs/api/goldcorpus.jade b/website/docs/api/goldcorpus.jade index bfff92ad5..3b3d92823 100644 --- a/website/docs/api/goldcorpus.jade +++ b/website/docs/api/goldcorpus.jade @@ -8,6 +8,7 @@ p +h(2, "init") GoldCorpus.__init__ +tag method + +tag-new(2) p Create a #[code GoldCorpus]. diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index 5d0e8af95..541cceeda 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -118,6 +118,7 @@ p Match a stream of documents, yielding them in turn. +h(2, "len") Matcher.__len__ +tag method + +tag-new(2) p | Get the number of rules added to the matcher. Note that this only returns @@ -138,6 +139,7 @@ p +h(2, "contains") Matcher.__contains__ +tag method + +tag-new(2) p Check whether the matcher contains rules for a match ID. @@ -159,6 +161,7 @@ p Check whether the matcher contains rules for a match ID. +h(2, "add") Matcher.add +tag method + +tag-new(2) p | Add a rule to the matcher, consisting of an ID key, one or more patterns, and @@ -200,6 +203,7 @@ p +h(2, "remove") Matcher.remove +tag method + +tag-new(2) p | Remove a rule from the matcher. A #[code KeyError] is raised if the match @@ -219,6 +223,7 @@ p +h(2, "get") Matcher.get +tag method + +tag-new(2) p | Retrieve the pattern stored for a key. Returns the rule as an diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index 5f5912edd..f684d48ad 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -104,6 +104,7 @@ p +h(2, "to_disk") StringStore.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -120,6 +121,7 @@ p Save the current state to a directory. +h(2, "from_disk") Tokenizer.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade index 87929e91b..87e1ac81e 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/docs/api/tokenizer.jade @@ -200,6 +200,7 @@ p +h(2, "to_disk") Tokenizer.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -216,6 +217,7 @@ p Save the current state to a directory. +h(2, "from_disk") Tokenizer.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index bf81a4f61..717abf34a 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -76,6 +76,7 @@ p +h(2, "resolve_model_path") util.resolve_model_path +tag function + +tag-new(2) p Resolve a model name or string to a model path. @@ -169,6 +170,7 @@ p +h(2, "is_in_jupyter") util.is_in_jupyter +tag function + +tag-new(2) p | Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter] @@ -221,6 +223,7 @@ p +h(2, "prints") util.prints +tag function + +tag-new(2) p | Print a formatted, text-wrapped message with optional title. If a text diff --git a/website/docs/api/vocab.jade b/website/docs/api/vocab.jade index bd18a17da..277fed5d3 100644 --- a/website/docs/api/vocab.jade +++ b/website/docs/api/vocab.jade @@ -159,6 +159,7 @@ p +h(2, "to_disk") Vocab.to_disk +tag method + +tag-new(2) p Save the current state to a directory. @@ -175,6 +176,7 @@ p Save the current state to a directory. +h(2, "from_disk") Vocab.from_disk +tag method + +tag-new(2) p Loads state from a directory. Modifies the object in place and returns it. 
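
Patches 060-064 above add the `compounding` and `decaying` schedules to spacy.util and a `minibatch` helper to spacy.gold; the train command combines them so that the batch size grows while the dropout rate shrinks over the course of training. A rough usage sketch (the numbers mirror the defaults in spacy/cli/train.py, and `examples` is placeholder data standing in for (doc, gold) pairs):

    from spacy import util
    from spacy.gold import minibatch

    batch_sizes = util.compounding(1, 64, 1.001)   # batch size grows, capped at 64
    dropout_rates = util.decaying(0.5, 0.2, 1e-4)  # dropout decays towards 0.2

    examples = list(range(100))                    # placeholder training examples
    for batch in minibatch(examples, size=batch_sizes):
        drop = next(dropout_rates)
        # nlp.update(docs, golds, drop=drop, sgd=optimizer) would go here

Note that `minibatch` calls `next()` on whatever it receives as `size`, so in this version the size argument has to be an iterator; passing a plain integer would fail.
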
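Patch 068 above replaces the per-component keyword arguments on `Language` with a single `disable` list on `__call__`, `pipe` and the serialization methods. A short sketch of the new calls (the 'en' model and the path are assumed for illustration):

    import spacy
    from spacy.lang.en import English

    nlp = spacy.load('en')
    doc = nlp(u"Don't run the parser on this text.", disable=['parser'])

    # Load from a directory, leaving some components out:
    nlp2 = English().from_disk('/path/to/model', disable=['tagger', 'ner'])
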
From d48530835afca10b07eefe97a03d4bb36234aa28 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:43:16 +0200 Subject: [PATCH 072/118] Update API docs and fix typos --- website/docs/api/doc.jade | 3 +- website/docs/api/lexeme.jade | 36 ++++++++++++------- website/docs/api/matcher.jade | 51 ++++++++++++++++---------- website/docs/api/spacy.jade | 60 ++++++++++++++++++++++++++----- website/docs/api/stringstore.jade | 12 +++---- website/docs/api/token.jade | 59 ++++++++++++++++++------------ 6 files changed, 153 insertions(+), 68 deletions(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index bb56331f7..9b8392fcb 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -278,7 +278,8 @@ p Loads state from a directory. Modifies the object in place and returns it. +aside-code("Example"). from spacy.tokens import Doc - doc = Doc().from_disk('/path/to/doc') + from spacy.vocab import Vocab + doc = Doc(Vocab()).from_disk('/path/to/doc') +table(["Name", "Type", "Description"]) +row diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade index dba6fdf59..a0487be9b 100644 --- a/website/docs/api/lexeme.jade +++ b/website/docs/api/lexeme.jade @@ -212,62 +212,74 @@ p The L2 norm of the lexeme's vector representation. +row +cell #[code is_alpha] +cell bool - +cell Equivalent to #[code word.orth_.isalpha()]. + +cell + | Does the lexeme consist of alphabetic characters? Equivalent to + | #[code lexeme.text.isalpha()]. +row +cell #[code is_ascii] +cell bool - +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. + +cell + | Does the lexeme consist of ASCII characters? Equivalent to + | #[code [any(ord(c) >= 128 for c in lexeme.text)]]. +row +cell #[code is_digit] +cell bool - +cell Equivalent to #[code word.orth_.isdigit()]. + +cell + | Does the lexeme consist of digits? Equivalent to + | #[code lexeme.text.isdigit()]. +row +cell #[code is_lower] +cell bool - +cell Equivalent to #[code word.orth_.islower()]. + +cell + | Is the lexeme in lowercase? Equivalent to + | #[code lexeme.text.islower()]. +row +cell #[code is_title] +cell bool - +cell Equivalent to #[code word.orth_.istitle()]. + +cell + | Is the lexeme in titlecase? Equivalent to + | #[code lexeme.text.istitle()]. +row +cell #[code is_punct] +cell bool - +cell Equivalent to #[code word.orth_.ispunct()]. + +cell Is the lexeme punctuation? +row +cell #[code is_space] +cell bool - +cell Equivalent to #[code word.orth_.isspace()]. + +cell + | Does the lexeme consist of whitespace characters? Equivalent to + | #[code lexeme.text.isspace()]. +row +cell #[code like_url] +cell bool - +cell Does the word resemble a URL? + +cell Does the lexeme resemble a URL? +row +cell #[code like_num] +cell bool - +cell Does the word represent a number? e.g. β€œ10.9”, β€œ10”, β€œten”, etc. + +cell Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. +row +cell #[code like_email] +cell bool - +cell Does the word resemble an email address? + +cell Does the lexeme resemble an email address? +row +cell #[code is_oov] +cell bool - +cell Is the word out-of-vocabulary? + +cell Is the lexeme out-of-vocabulary? +row +cell #[code is_stop] +cell bool - +cell Is the word part of a "stop list"? + +cell Is the lexeme part of a "stop list"? 
+row +cell #[code lang] diff --git a/website/docs/api/matcher.jade b/website/docs/api/matcher.jade index 541cceeda..e2972fdc0 100644 --- a/website/docs/api/matcher.jade +++ b/website/docs/api/matcher.jade @@ -5,13 +5,14 @@ include ../../_includes/_mixins p Match sequences of tokens, based on pattern rules. +infobox("⚠️ Deprecation note") - | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] - | are deprecated and have been replaced with a simpler - | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of - | patterns and a callback for a given match ID. #[code Matcher.get_entity] - | is now called #[+api("matcher#get") #[code matcher.get]]. - | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), - | and #[code Matcher.has_entity] (now redundant) have been removed. + .o-block + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. #[code Matcher.get_entity] + | is now called #[+api("matcher#get") #[code matcher.get]]. + | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks), + | and #[code Matcher.has_entity] (now redundant) have been removed. +h(2, "init") Matcher.__init__ +tag method @@ -56,17 +57,6 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. doc = nlp(u'hello world!') matches = matcher(doc) -+infobox("Important note") - | By default, the matcher #[strong does not perform any action] on matches, - | like tagging matched phrases with entity types. Instead, actions need to - | be specified when #[strong adding patterns or entities], by - | passing in a callback function as the #[code on_match] argument on - | #[+api("matcher#add") #[code add]]. This allows you to define custom - | actions per pattern within the same matcher. For example, you might only - | want to merge some entity types, and set custom flags for other matched - | patterns. For more details and examples, see the usage workflow on - | #[+a("/docs/usage/rule-based-matching") rule-based matching]. - +table(["Name", "Type", "Description"]) +row +cell #[code doc] @@ -81,6 +71,17 @@ p Find all token sequences matching the supplied patterns on the #[code Doc]. | matches. A match tuple describes a span #[code doc[start:end]]. | The #[code match_id] is the ID of the added match pattern. ++infobox("Important note") + | By default, the matcher #[strong does not perform any action] on matches, + | like tagging matched phrases with entity types. Instead, actions need to + | be specified when #[strong adding patterns or entities], by + | passing in a callback function as the #[code on_match] argument on + | #[+api("matcher#add") #[code add]]. This allows you to define custom + | actions per pattern within the same matcher. For example, you might only + | want to merge some entity types, and set custom flags for other matched + | patterns. For more details and examples, see the usage workflow on + | #[+a("/docs/usage/rule-based-matching") rule-based matching]. + +h(2, "pipe") Matcher.pipe +tag method @@ -201,6 +202,20 @@ p | Match pattern. A pattern consists of a list of dicts, where each | dict describes a token. 
++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity] + | are deprecated and have been replaced with a simpler + | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of + | patterns and a callback for a given match ID. + + +code-new. + matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}]) + + +code-old. + matcher.add_entity('GoogleNow', on_match=merge_phrases) + matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}]) + +h(2, "remove") Matcher.remove +tag method +tag-new(2) diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index 6ad88c1a8..f2fcfde2c 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -20,12 +20,7 @@ p nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load(Path('/path/to/en')) # pathlib Path -+infobox("⚠️ Deprecation note") - | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy - | will also raise an error if no model could be loaded and never just - | return an empty #[code Language] object. If you need a blank language, - | you need to import it explicitly (#[code from spacy.lang.en import English]) - | or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. + nlp = spacy.load('en', disable['parser', 'tagger']) +table(["Name", "Type", "Description"]) +row @@ -34,15 +29,28 @@ p +cell Model to load, i.e. shortcut link, package name or path. +row - +cell #[code **overrides] - +cell - - +cell Override or disable components. + +cell #[code disable] + +cell list + +cell + | Names of pipeline components to + | #[+a("/docs/usage/language-processing-pipeline#disabling") disable]. +footrow +cell returns +cell #[code Language] +cell A #[code Language] object with the loaded model. ++infobox("⚠️ Deprecation note") + .o-block + | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy + | will also raise an error if no model could be loaded and never just + | return an empty #[code Language] object. If you need a blank language, + | you need to import it explicitly (#[code from spacy.lang.en import English]) + | or use #[+api("util#get_lang_class") #[code util.get_lang_class]]. + + +code-new nlp = spacy.load('/model') + +code-old nlp = spacy.load('en', path='/model') + +h(2, "info") spacy.info +tag function @@ -98,3 +106,37 @@ p +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. + ++h(2, "set_factory") spacy.set_factory + +tag function + +tag-new(2) + +p + | Set a factory that returns a custom + | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] + | component. Factories are useful for creating stateful components, especially ones which depend on shared data. + ++aside-code("Example"). + def my_factory(vocab): + def my_component(doc): + return doc + return my_component + + spacy.set_factory('my_factory', my_factory) + nlp = Language(pipeline=['my_factory']) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code factory_id] + +cell unicode + +cell + | Unique name of factory. If added to a new pipeline, spaCy will + | look up the factory for this ID and use it to create the + | component. + + +row + +cell #[code factory] + +cell callable + +cell + | Callable that takes a #[code Vocab] object and returns a pipeline + | component. 
diff --git a/website/docs/api/stringstore.jade b/website/docs/api/stringstore.jade index f684d48ad..f09352c79 100644 --- a/website/docs/api/stringstore.jade +++ b/website/docs/api/stringstore.jade @@ -119,7 +119,7 @@ p Save the current state to a directory. | A path to a directory, which will be created if it doesn't exist. | Paths may be either strings or #[code Path]-like objects. -+h(2, "from_disk") Tokenizer.from_disk ++h(2, "from_disk") StringStore.from_disk +tag method +tag-new(2) @@ -139,10 +139,10 @@ p Loads state from a directory. Modifies the object in place and returns it. +footrow +cell returns - +cell #[code Tokenizer] - +cell The modified #[code Tokenizer] object. + +cell #[code StringStore] + +cell The modified #[code StringStore] object. -+h(2, "to_bytes") Tokenizer.to_bytes ++h(2, "to_bytes") StringStore.to_bytes +tag method p Serialize the current state to a binary string. @@ -159,9 +159,9 @@ p Serialize the current state to a binary string. +footrow +cell returns +cell bytes - +cell The serialized form of the #[code Tokenizer] object. + +cell The serialized form of the #[code StringStore] object. -+h(2, "from_bytes") Tokenizer.from_bytes ++h(2, "from_bytes") StringStore.from_bytes +tag method p Load state from a binary string. diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 744446ec2..ee989047c 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -370,116 +370,131 @@ p The L2 norm of the token's vector representation. +cell #[code lemma] +cell int +cell - | Base form of the word, with no inflectional suffixes. + | Base form of the token, with no inflectional suffixes. +row +cell #[code lemma_] +cell unicode - +cell Base form of the word, with no inflectional suffixes. + +cell Base form of the token, with no inflectional suffixes. +row +cell #[code lower] +cell int - +cell Lower-case form of the word. + +cell Lower-case form of the token. +row +cell #[code lower_] +cell unicode - +cell Lower-case form of the word. + +cell Lower-case form of the token. +row +cell #[code shape] +cell int - +cell Transform of the word's string, to show orthographic features. + +cell + | Transform of the tokens's string, to show orthographic features. + | For example, "Xxxx" or "dd". +row +cell #[code shape_] +cell unicode - +cell A transform of the word's string, to show orthographic features. + | Transform of the tokens's string, to show orthographic features. + | For example, "Xxxx" or "dd". +row +cell #[code prefix] +cell int +cell Integer ID of a length-N substring from the start of the - | word. Defaults to #[code N=1]. + | token. Defaults to #[code N=1]. +row +cell #[code prefix_] +cell unicode +cell - | A length-N substring from the start of the word. Defaults to + | A length-N substring from the start of the token. Defaults to | #[code N=1]. +row +cell #[code suffix] +cell int +cell - | Length-N substring from the end of the word. Defaults to #[code N=3]. + | Length-N substring from the end of the token. Defaults to #[code N=3]. +row +cell #[code suffix_] +cell unicode - +cell Length-N substring from the end of the word. Defaults to #[code N=3]. + +cell Length-N substring from the end of the token. Defaults to #[code N=3]. +row +cell #[code is_alpha] +cell bool - +cell Equivalent to #[code word.orth_.isalpha()]. + +cell + | Does the token consist of alphabetic characters? Equivalent to + | #[code token.text.isalpha()]. +row +cell #[code is_ascii] +cell bool - +cell Equivalent to #[code [any(ord(c) >= 128 for c in word.orth_)]]. 
+ +cell + | Does the token consist of ASCII characters? Equivalent to + | #[code [any(ord(c) >= 128 for c in token.text)]]. +row +cell #[code is_digit] +cell bool - +cell Equivalent to #[code word.orth_.isdigit()]. + +cell + | Does the token consist of digits? Equivalent to + | #[code token.text.isdigit()]. +row +cell #[code is_lower] +cell bool - +cell Equivalent to #[code word.orth_.islower()]. + +cell + | Is the token in lowercase? Equivalent to + | #[code token.text.islower()]. +row +cell #[code is_title] +cell bool - +cell Equivalent to #[code word.orth_.istitle()]. + +cell + | Is the token in titlecase? Equivalent to + | #[code token.text.istitle()]. +row +cell #[code is_punct] +cell bool - +cell Equivalent to #[code word.orth_.ispunct()]. + +cell Is the token punctuation? +row +cell #[code is_space] +cell bool - +cell Equivalent to #[code word.orth_.isspace()]. + +cell + | Does the token consist of whitespace characters? Equivalent to + | #[code token.text.isspace()]. +row +cell #[code like_url] +cell bool - +cell Does the word resemble a URL? + +cell Does the token resemble a URL? +row +cell #[code like_num] +cell bool - +cell Does the word represent a number? e.g. β€œ10.9”, β€œ10”, β€œten”, etc. + +cell Does the token represent a number? e.g. "10.9", "10", "ten", etc. +row +cell #[code like_email] +cell bool - +cell Does the word resemble an email address? + +cell Does the token resemble an email address? +row +cell #[code is_oov] +cell bool - +cell Is the word out-of-vocabulary? + +cell Is the token out-of-vocabulary? +row +cell #[code is_stop] +cell bool - +cell Is the word part of a "stop list"? + +cell Is the token part of a "stop list"? +row +cell #[code pos] From a7de5f0155da2d756bf220d59c2d814b23c7486d Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:43:38 +0200 Subject: [PATCH 073/118] Update SVG illustrations and use unique CSS classes --- website/assets/img/docs/architecture.svg | 98 +++++++++++------------ website/assets/img/docs/language_data.svg | 34 ++++---- website/assets/img/docs/pipeline.svg | 22 ++--- 3 files changed, 77 insertions(+), 77 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index 1025fbaaf..f586b75eb 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -1,128 +1,128 @@ - + - Language + Language - MAKES + MAKES - nlp.vocab.morphology + nlp.vocab.morphology - Vocab + Vocab - nlp.vocab + nlp.vocab - StringStore + StringStore - nlp.vocab.strings + nlp.vocab.strings - nlp.tokenizer.vocab + nlp.tokenizer.vocab - Tokenizer + Tokenizer - nlp.make_doc() + nlp.make_doc() - nlp.pipeline + nlp.pipeline - nlp.pipeline[i].vocab + nlp.pipeline[i].vocab - pt + pt - en + en - de + de - fr + fr - es + es - it + it - nl + nl - sv + sv - fi + fi - nb + nb - hu + hu - he + he - bn + bn - ja + ja - zh + zh - doc.vocab + doc.vocab - MAKES + MAKES - Doc + Doc - MAKES + MAKES - token.doc + token.doc - Token + Token - Span + Span - lexeme.vocab + lexeme.vocab - Lexeme + Lexeme - MAKES + MAKES - span.doc + span.doc - Dependency Parser + Dependency Parser - Entity Recognizer + Entity Recognizer - Tagger + Tagger - Matcher + Matcher - Lemmatizer + Lemmatizer - Morphology + Morphology diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/docs/language_data.svg index 4662d4c01..b74fffba6 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/docs/language_data.svg @@ -1,13 +1,13 @@ - Tokenizer + Tokenizer @@ 
-17,7 +17,7 @@ - Base data + Base data @@ -33,50 +33,50 @@ - Language data + Language data - stop words + stop words - lexical attributes + lexical attributes - tokenizer exceptions + tokenizer exceptions - prefixes, suffixes, infixes + prefixes, suffixes, infixes - lemma data + lemma data - Lemmatizer + Lemmatizer - char classes + char classes - Token + Token - morph rules + morph rules - tag map + tag map - Morphology + Morphology diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index ddd1171ef..e42c2362f 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -1,30 +1,30 @@ - Doc + Doc - Text + Text - nlp + nlp - tokenizer + tokenizer - vectorizer + vectorizer - tagger + tagger - parser + parser - ner + ner From d8fd002e591482a8511b7ab7470239ef68ba082a Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:43:49 +0200 Subject: [PATCH 074/118] Add illustration for Vocab & StringStore --- website/assets/img/docs/vocab_stringstore.svg | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 website/assets/img/docs/vocab_stringstore.svg diff --git a/website/assets/img/docs/vocab_stringstore.svg b/website/assets/img/docs/vocab_stringstore.svg new file mode 100644 index 000000000..f660a8604 --- /dev/null +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -0,0 +1,77 @@ + + + + + 3572 + + Lexeme + + 508 + + Lexeme + + 949 + + Lexeme + + + "coffee" + + 3672 + + "I" + + 508 + + "love" + + 949 + + + + + nsubj + + + + dobj + + String + Store + + Vocab + + Doc + + love + VERB + + Token + + I + PRON + + Token + + coffee + NOUN + + Token + + + + + + + + + + + + + From 6d76c1ea168b6054e012c3a1f7e68c3cff0255a9 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:45:01 +0200 Subject: [PATCH 075/118] Add 101 for Vocab, Lexeme and StringStore --- .../usage/_spacy-101/_vocab-stringstore.jade | 92 +++++++++++++++++++ website/docs/usage/spacy-101.jade | 4 + 2 files changed, 96 insertions(+) create mode 100644 website/docs/usage/_spacy-101/_vocab-stringstore.jade diff --git a/website/docs/usage/_spacy-101/_vocab-stringstore.jade b/website/docs/usage/_spacy-101/_vocab-stringstore.jade new file mode 100644 index 000000000..3f551c9e1 --- /dev/null +++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade @@ -0,0 +1,92 @@ +//- πŸ’« DOCS > USAGE > SPACY 101 > VOCAB & STRINGSTORE + +p + | Whenever possible, spaCy tries to store data in a vocabulary, the + | #[+api("vocab") #[code Vocab]], that will be + | #[strong shared by multiple documents]. To save memory, spaCy also + | encodes all strings to #[strong integer IDs] – in this case for example, + | "coffee" has the ID #[code 3672]. Entity labels like "ORG" and + | part-of-speech tags like "VERB" are also encoded. Internally, spaCy + | only "speaks" in integer IDs. + ++aside + | #[strong Token]: A word, punctuation mark etc. #[em in context], including + | its attributes, tags and dependencies.#[br] + | #[strong Lexeme]: A "word type" with no context. Includes the word shape + | and flags, e.g. if it's lowercase, a digit or punctuation.#[br] + | #[strong Doc]: A processed container of tokens in context.#[br] + | #[strong Vocab]: The collection of lexemes.#[br] + | #[strong StringStore]: The dictionary mapping integer IDs to strings, for + | example #[code 3672] → "coffee". 
+
++image
+    include ../../../assets/img/docs/vocab_stringstore.svg
+    .u-text-right
+        +button("/assets/img/docs/vocab_stringstore.svg", false, "secondary").u-text-tag View large graphic
+
+p
+    | If you process lots of documents containing the word "coffee" in all
+    | kinds of different contexts, storing the exact string "coffee" every time
+    | would take up way too much space. So instead, spaCy assigns it an ID
+    | and stores it in the #[+api("stringstore") #[code StringStore]]. You can
+    | think of the #[code StringStore] as a
+    | #[strong lookup table that works in both directions] – you can look up a
+    | string to get its ID, or an ID to get its string:
+
++code.
+    doc = nlp(u'I like coffee')
+    assert doc.vocab.strings[u'coffee'] == 3572
+    assert doc.vocab.strings[3572] == u'coffee'
+
+p
+    | Now that all strings are encoded, the entries in the vocabulary
+    | #[strong don't need to include the word text] themselves. Instead,
+    | they can look it up in the #[code StringStore] via its integer ID. Each
+    | entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
+    | contains the #[strong context-independent] information about a word.
+    | For example, no matter if "love" is used as a verb or a noun in some
+    | context, its spelling and whether it consists of alphabetic characters
+    | won't ever change.
+
++code.
+    for word in doc:
+        lexeme = doc.vocab[word.text]
+        print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
+              lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
+
++aside
+    | #[strong Text]: The original text of the lexeme.#[br]
+    | #[strong Orth]: The integer ID of the lexeme.#[br]
+    | #[strong Shape]: The abstract word shape of the lexeme.#[br]
+    | #[strong Prefix]: By default, the first letter of the word string.#[br]
+    | #[strong Suffix]: By default, the last three letters of the word string.#[br]
+    | #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
+    | #[strong is digit]: Does the lexeme consist of digits?#[br]
+    | #[strong is title]: Is the lexeme in titlecase?#[br]
+    | #[strong Lang]: The language of the parent vocabulary.
+
++table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
+    - var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
+    +annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
+    +annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
+    +annotation-row(["coffee", 3572, "xxxx", "c", "fee", true, false, false, "en"], style)
+
+p
+    | The specific entries in the vocabulary and their IDs don't really matter –
+    | #[strong as long as they match]. That's why you always need to make sure
+    | all objects you create have access to the same vocabulary. If they don't,
+    | the IDs won't match and spaCy will either produce very confusing results,
+    | or fail altogether.
+
++code.
+    from spacy.tokens import Doc
+    from spacy.vocab import Vocab
+
+    doc = nlp(u'I like coffee') # original Doc
+    new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
+    assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
+    assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
+
+p
+    | Even though both #[code Doc] objects contain the same words, the internal
+    | integer IDs are very different.
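The mismatch shown in the last example above is avoided by building the new `Doc` on the shared vocab, so that both objects resolve strings against the same `StringStore`. A small sketch along the lines of the 101 examples (assuming `nlp` is a loaded English model, as in the snippets above):

    from spacy.tokens import Doc

    doc = nlp(u'I like coffee')                              # original Doc
    new_doc = Doc(nlp.vocab, words=['I', 'like', 'coffee'])  # Doc built on the shared Vocab
    # Both documents now resolve "coffee" to the same integer ID, because they
    # share a single vocabulary and string store.
    assert doc.vocab.strings[u'coffee'] == new_doc.vocab.strings[u'coffee']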
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 9373f182a..cdeeac8bf 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -113,6 +113,10 @@ include _spacy-101/_word-vectors include _spacy-101/_pipelines ++h(2, "vocab-stringstore") Vocab, lexemes and the string store + +include _spacy-101/_vocab-stringstore + +h(2, "serialization") Serialization include _spacy-101/_serialization From 286c3d0719e28110f4d27b75a44f87d20ed00de4 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 12:46:29 +0200 Subject: [PATCH 076/118] Update usage and 101 docs --- website/docs/usage/_data.json | 2 +- website/docs/usage/_spacy-101/_pipelines.jade | 10 ++++++ .../docs/usage/_spacy-101/_serialization.jade | 28 +++++++++++++++ .../usage/language-processing-pipeline.jade | 5 +-- website/docs/usage/lightning-tour.jade | 2 +- website/docs/usage/spacy-101.jade | 35 +++++++++++++++++++ .../docs/usage/word-vectors-similarities.jade | 27 +------------- 7 files changed, 79 insertions(+), 30 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index a611151b3..59057b0bb 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -80,7 +80,7 @@ }, "customizing-tokenizer": { - "title": "Customizing the tokenizer", + "title": "Customising the tokenizer", "next": "rule-based-matching" }, diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade index db095ef04..edf553805 100644 --- a/website/docs/usage/_spacy-101/_pipelines.jade +++ b/website/docs/usage/_spacy-101/_pipelines.jade @@ -48,3 +48,13 @@ p +cell ner +cell #[+api("entityrecognizer") #[code EntityRecognizer]] +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] + +p + | The processing pipeline always #[strong depends on the statistical model] + | and its capabilities. For example, a pipeline can only include an entity + | recognizer component if the model includes data to make predictions of + | entity labels. This is why each model will specify the pipeline to use + | in its meta data, as a simple list containing the component names: + ++code(false, "json"). + "pipeline": ["vectorizer", "tagger", "parser", "ner"] diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade index f3926dd9c..35d931634 100644 --- a/website/docs/usage/_spacy-101/_serialization.jade +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -34,7 +34,35 @@ p +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style) +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style) +p + | For example, if you've processed a very large document, you can use + | #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your + | local machine. This will save the document and its tokens, as well as + | the vocabulary associated with the #[code Doc]. + ++aside("Why saving the vocab?") + | Saving the vocabulary with the #[code Doc] is important, because the + | #[code Vocab] holds the context-independent information about the words, + | tags and labels, and their #[strong integer IDs]. If the #[code Vocab] + | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve + | those IDs – for example, the word text or the dependency labels. You + | might be saving #[code 446] for "whale", but in a different vocabulary, + | this ID could map to "VERB". 
Similarly, if your document was processed by + | a German model, its vocab will include the specific + | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels]. + +code. moby_dick = open('moby_dick.txt', 'r') # open a large document doc = nlp(moby_dick) # process it doc.to_disk('/moby_dick.bin') # save the processed Doc + +p + | If you need it again later, you can load it back into an empty #[code Doc] + | with an empty #[code Vocab] by calling + | #[+api("doc#from_disk") #[code from_disk()]]: + ++code. + from spacy.tokens import Doc # to create empty Doc + from spacy.vocab import Vocab # to create empty Vocab + + doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 948212d82..ce23a1666 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -322,8 +322,9 @@ p | If you don't need a particular component of the pipeline – for | example, the tagger or the parser, you can disable loading it. This can | sometimes make a big difference and improve loading speed. Disabled - | component names can be provided to #[code spacy.load], #[code from_disk] - | or the #[code nlp] object itself as a list: + | component names can be provided to #[+api("spacy#load") #[code spacy.load]], + | #[+api("language#from_disk") #[code Language.from_disk]] or the + | #[code nlp] object itself as a list: +code. nlp = spacy.load('en', disable['parser', 'tagger']) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 473f10c5e..4a9a2315f 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -35,7 +35,7 @@ p assert doc[0].text == u'Peach' assert doc[1].text == u'emoji' assert doc[-1].text == u'πŸ‘' - assert doc[17:19] == u'outranking eggplant' + assert doc[17:19].text == u'outranking eggplant' assert doc.noun_chunks[0].text == u'Peach emoji' sentences = list(doc.sents) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index cdeeac8bf..24690af57 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -91,17 +91,35 @@ p include _spacy-101/_tokenization ++infobox + | To learn more about how spaCy's tokenizer and its rules work in detail, + | how to #[strong customise] it and how to #[strong add your own tokenizer] + | to a processing pipeline, see the usage guide on + | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies +tag-model("dependency parse") include _spacy-101/_pos-deps ++infobox + | To learn more about #[strong part-of-speech tagging] and rule-based + | morphology, and how to #[strong navigate and use the parse tree] + | effectively, see the usage guides on + | #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and + | #[+a("/docs/usage/dependency-parse") using the dependency parse]. + +h(3, "annotations-ner") Named Entities +tag-model("named entities") include _spacy-101/_named-entities ++infobox + | To learn more about entity recognition in spaCy, how to + | #[strong add your own entities] to a document and how to train and update + | the entity predictions of a model, see the usage guide on + | #[+a("/docs/usage/entity-recognition") named entity recognition]. 
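For the named entity recognition section referenced in the infobox above, the usual 101-level pattern is to iterate over `doc.ents`. A brief sketch (the example sentence is illustrative, and the exact entities returned depend on the statistical model):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    for ent in doc.ents:
        # Each entity is a Span carrying its label and character offsets.
        print(ent.text, ent.start_char, ent.end_char, ent.label_)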
+ +h(2, "vectors-similarity") Word vectors and similarity +tag-model("vectors") @@ -109,10 +127,22 @@ include _spacy-101/_similarity include _spacy-101/_word-vectors ++infobox + | To learn more about word vectors, how to #[strong customise them] and + | how to load #[strong your own vectors] into spaCy, see the usage + | guide on + | #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities]. + +h(2, "pipelines") Pipelines include _spacy-101/_pipelines ++infobox + | To learn more about #[strong how processing pipelines work] in detail, + | how to enable and disable their components, and how to + | #[strong create your own], see the usage guide on + | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines]. + +h(2, "vocab-stringstore") Vocab, lexemes and the string store include _spacy-101/_vocab-stringstore @@ -121,6 +151,11 @@ include _spacy-101/_vocab-stringstore include _spacy-101/_serialization ++infobox + | To learn more about #[strong serialization] and how to + | #[strong save and load your own models], see the usage guide on + | #[+a("/docs/usage/saving-loading") saving, loading and data serialization]. + +h(2, "training") Training include _spacy-101/_training diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index 00e200f59..eecb268b6 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -23,7 +23,6 @@ p include _spacy-101/_similarity include _spacy-101/_word-vectors - +h(2, "custom") Customising word vectors p @@ -31,33 +30,9 @@ p | vector for its underlying #[+api("lexeme") #[code Lexeme]], while | #[+api("doc#vector") #[code Doc.vector]] and | #[+api("span#vector") #[code Span.vector]] return an average of the - | vectors of their tokens. - -p - | You can customize these + | vectors of their tokens. You can customize these | behaviours by modifying the #[code doc.user_hooks], | #[code doc.user_span_hooks] and #[code doc.user_token_hooks] | dictionaries. -+code("Example"). - # TODO - -p - | You can load new word vectors from a file-like buffer using the - | #[code vocab.load_vectors()] method. The file should be a - | whitespace-delimited text file, where the word is in the first column, - | and subsequent columns provide the vector data. For faster loading, you - | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a - | path to a binary file written by #[code vocab.dump_vectors()]. - -+code("Example"). - # TODO - -p - | You can also load vectors from memory by writing to the - | #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors - | you are writing are of different dimensionality - | from the ones currently loaded, you should first call - | #[code vocab.resize_vectors(new_size)]. - +h(2, "similarity") Similarity From d65f99a72016cb6eb9b0fe18172abf206dc738a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 05:52:09 -0500 Subject: [PATCH 077/118] Improve model saving in train script --- spacy/cli/train.py | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ee0ee53a2..b25cdcbd5 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, # starts high and decays sharply, to force the optimizer to explore. 
# Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. - dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5), + dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), util.env_opt('dropout_to', 0.2), - util.env_opt('dropout_decay', 1e-4)) + util.env_opt('dropout_decay', 0.0)) batch_sizes = util.compounding(util.env_opt('batch_from', 1), util.env_opt('batch_to', 64), util.env_opt('batch_compound', 1.001)) @@ -71,23 +71,30 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") - for i in range(n_iter): - with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: - train_docs = corpus.train_docs(nlp, projectivize=True, - gold_preproc=False, shuffle=i) - losses = {} - for batch in minibatch(train_docs, size=batch_sizes): - docs, golds = zip(*batch) - nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) - pbar.update(len(docs)) + try: + for i in range(n_iter): + with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: + train_docs = corpus.train_docs(nlp, projectivize=True, + gold_preproc=False, max_length=1000) + losses = {} + for batch in minibatch(train_docs, size=batch_sizes): + docs, golds = zip(*batch) + nlp.update(docs, golds, sgd=optimizer, + drop=next(dropout_rates), losses=losses) + pbar.update(len(docs)) - with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) - print_progress(i, losses, scorer.scores) - with (output_path / 'model.bin').open('wb') as file_: - with nlp.use_params(optimizer.averages): - dill.dump(nlp, file_, -1) + with nlp.use_params(optimizer.averages): + scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) + with (output_path / ('model%d.pickle' % i)).open('wb') as file_: + dill.dump(nlp, file_, -1) + + + print_progress(i, losses, scorer.scores) + finally: + print("Saving model...") + with (output_path / 'model-final.pickle').open('wb') as file_: + with nlp.use_params(optimizer.averages): + dill.dump(nlp, file_, -1) def _render_parses(i, to_render): From f122d82f290a95cb972a392c401ea04d163b0930 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 13:17:48 +0200 Subject: [PATCH 078/118] Update usage docs and ddd "under construction" --- website/_includes/_mixins-base.jade | 11 +++ website/docs/usage/_spacy-101/_training.jade | 2 +- website/docs/usage/adding-languages.jade | 5 +- website/docs/usage/deep-learning.jade | 6 +- website/docs/usage/production-use.jade | 30 ++++---- website/docs/usage/spacy-101.jade | 4 ++ website/docs/usage/training-ner.jade | 70 +++++++++---------- website/docs/usage/training.jade | 56 --------------- website/docs/usage/visualizers.jade | 2 +- .../docs/usage/word-vectors-similarities.jade | 4 ++ 10 files changed, 78 insertions(+), 112 deletions(-) diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade index c6132df74..80d63353d 100644 --- a/website/_includes/_mixins-base.jade +++ b/website/_includes/_mixins-base.jade @@ -186,3 +186,14 @@ mixin landing-header() mixin landing-badge(url, graphic, alt, size) +a(url)(aria-label=alt title=alt).c-landing__badge +svg("graphics", graphic, size || 225) + + +//- Under construction (temporary) + Marks sections that still need to be completed for the v2.0 release. 
+ +mixin under-construction() + +infobox("🚧 Under construction") + | This section is still being written and will be updated for the v2.0 + | release. Is there anything that you think should definitely mentioned or + | explained here? Any examples you'd like to see? #[strong Let us know] + | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub! diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade index 59861434c..f4a0c7194 100644 --- a/website/docs/usage/_spacy-101/_training.jade +++ b/website/docs/usage/_spacy-101/_training.jade @@ -1,3 +1,3 @@ //- πŸ’« DOCS > USAGE > SPACY 101 > TRAINING -p ++under-construction diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index ae04aad57..cd1fc4199 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -107,7 +107,6 @@ p .u-text-right +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic - +table(["File name", "Variables", "Description"]) +row +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py] @@ -439,7 +438,7 @@ p +h(3, "morph-rules") Morph rules -//- TODO: write morph rules section ++under-construction +h(2, "testing") Testing the new language tokenizer @@ -631,7 +630,7 @@ p | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. | The #[code vectors.bin] file should consist of one word and vector per line. -+aside-code("your_data_directory", "yaml"). +//-+aside-code("your_data_directory", "yaml"). β”œβ”€β”€ vocab/ | β”œβ”€β”€ lexemes.bin | β”œβ”€β”€ strings.json diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade index fec01b4ba..18f33c900 100644 --- a/website/docs/usage/deep-learning.jade +++ b/website/docs/usage/deep-learning.jade @@ -17,6 +17,8 @@ p | #[+a("http://deeplearning.net/software/theano/") Theano] is also | supported. ++under-construction + +code("Runtime usage"). def count_entity_sentiment(nlp, texts): '''Compute the net document sentiment for each entity in the texts.''' @@ -153,7 +155,9 @@ p | adding another LSTM layer, using attention mechanism, using character | features, etc. -+h(2, "attribute-hooks") Attribute hooks (experimental) ++h(2, "attribute-hooks") Attribute hooks + ++under-construction p | Earlier, we saw how to store data in the new generic #[code user_data] diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade index c7f872c6d..e9fd4a30f 100644 --- a/website/docs/usage/production-use.jade +++ b/website/docs/usage/production-use.jade @@ -2,16 +2,18 @@ include ../../_includes/_mixins ++under-construction + +h(2, "multithreading") Multi-threading with #[code .pipe()] p | If you have a sequence of documents to process, you should use the - | #[+api("language#pipe") #[code .pipe()]] method. The method takes an - | iterator of texts, and accumulates an internal buffer, + | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes + | an iterator of texts, and accumulates an internal buffer, | which it works on in parallel. It then yields the documents in order, | one-by-one. After a long and bitter struggle, the global interpreter | lock was freed around spaCy's main parsing loop in v0.100.3. 
This means - | that the #[code .pipe()] method will be significantly faster in most + | that #[code .pipe()] will be significantly faster in most | practical situations, because it allows shared memory parallelism. +code. @@ -20,23 +22,27 @@ p p | To make full use of the #[code .pipe()] function, you might want to - | brush up on Python generators. Here are a few quick hints: + | brush up on #[strong Python generators]. Here are a few quick hints: +list +item - | Generator comprehensions can be written - | (#[code item for item in sequence]) + | Generator comprehensions can be written as + | #[code (item for item in sequence)]. +item - | The #[code itertools] built-in library and the #[code cytoolz] - | package provide a lot of handy generator tools + | The + | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library] + | and the + | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package] + | provide a lot of handy #[strong generator tools]. +item | Often you'll have an input stream that pairs text with some - | important metadata, e.g. a JSON document. To pair up the metadata - | with the processed #[code Doc] object, you should use the tee - | function to split the generator in two, and then #[code izip] the - | extra stream to the document stream. + | important meta data, e.g. a JSON document. To + | #[strong pair up the meta data] with the processed #[code Doc] + | object, you should use the #[code itertools.tee] function to split + | the generator in two, and then #[code izip] the extra stream to the + | document stream. +h(2, "own-annotations") Bringing your own annotations diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 24690af57..7c6525004 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -4,6 +4,8 @@ include ../../_includes/_mixins +h(2, "features") Features ++under-construction + +aside | If one of spaCy's functionalities #[strong needs a model], it means that | you need to have one our the available @@ -162,6 +164,8 @@ include _spacy-101/_training +h(2, "architecture") Architecture ++under-construction + +image include ../../assets/img/docs/architecture.svg .u-text-right diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 8b8789485..4faa47675 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -64,44 +64,10 @@ p | predicts the new category with minimal difference from the previous | output. -+h(2, "saving-loading") Saving and loading - -p - | After training our model, you'll usually want to save its state, and load - | it back later. You can do this with the #[code Language.save_to_directory()] - | method: - -+code. - nlp.save_to_directory('/home/me/data/en_technology') - -p - | To make the model more convenient to deploy, we recommend wrapping it as - | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+api("cli#package") #[code package]] - | CLI command to create all required files and directories. - -+code(false, "bash"). - python -m spacy package /home/me/data/en_technology /home/me/my_models - -p - | To build the package and create a #[code .tar.gz] archive, run - | #[code python setup.py sdist] from within its directory. 
- -+infobox("Saving and loading models") - | For more information and a detailed guide on how to package your model, - | see the documentation on - | #[+a("/docs/usage/saving-loading") saving and loading models]. - -p - | After you've generated and installed the package, you'll be able to - | load the model as follows: - -+code. - import en_technology - nlp = en_technology.load() - +h(2, "example") Example: Adding and training an #[code ANIMAL] entity ++under-construction + p | This script shows how to add a new entity type to an existing pre-trained | NER model. To keep the example short and simple, only four sentences are @@ -170,5 +136,33 @@ p p | After training your model, you can - | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping - | models as Python packages, for ease of deployment. + | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend + | wrapping models as Python packages, for ease of deployment. + ++h(2, "saving-loading") Saving and loading + +p + | After training our model, you'll usually want to save its state, and load + | it back later. You can do this with the + | #[+api("language#to_disk") #[code Language.to_disk()]] method: + ++code. + nlp.to_disk('/home/me/data/en_technology') + +p + | To make the model more convenient to deploy, we recommend wrapping it as + | a Python package, so that you can install it via pip and load it as a + | module. spaCy comes with a handy #[+api("cli#package") #[code package]] + | CLI command to create all required files and directories. + ++code(false, "bash"). + python -m spacy package /home/me/data/en_technology /home/me/my_models + +p + | To build the package and create a #[code .tar.gz] archive, run + | #[code python setup.py sdist] from within its directory. + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading#models") saving and loading models]. diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 9df71851a..6c6c17e17 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -81,59 +81,3 @@ p.o-inline-list p +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example - -+h(2, "feature-templates") Customizing the feature extraction - -p - | spaCy currently uses linear models for the tagger, parser and entity - | recognizer, with weights learned using the - | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm]. - -+aside("Linear Model Feature Scheme") - | For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme]. - -p - | Because it's a linear model, it's important for accuracy to build - | conjunction features out of the atomic predictors. Let's say you have - | two atomic predictors asking, "What is the part-of-speech of the - | previous token?", and "What is the part-of-speech of the previous - | previous token?". These predictors will introduce a number of features, - | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction - | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ]. - -p - | The feature extraction proceeds in two passes. In the first pass, we - | fill an array with the values of all of the atomic predictors. 
In the - | second pass, we iterate over the feature templates, and fill a small - | temporary array with the predictors that will be combined into a - | conjunction feature. Finally, we hash this array into a 64-bit integer, - | using the MurmurHash algorithm. You can see this at work in the - | #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module. - -p - | It's very easy to change the feature templates, to create novel - | combinations of the existing atomic predictors. There's currently no API - | available to add new atomic predictors, though. You'll have to create a - | subclass of the model, and write your own #[code set_featuresC] method. - -p - | The feature templates are passed in using the #[code features] keyword - | argument to the constructors of the #[+api("tagger") #[code Tagger]], - | #[+api("dependencyparser") #[code DependencyParser]] and - | #[+api("entityrecognizer") #[code EntityRecognizer]]: - -+code. - from spacy.vocab import Vocab - from spacy.pipeline import Tagger - from spacy.tagger import P2_orth, P1_orth - from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth - - vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}}) - tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster), - (P2_orth,), (P1_orth,), (W_orth,), - (N1_orth,), (N2_orth,)]) - -p - | Custom feature templates can be passed to the #[code DependencyParser] - | and #[code EntityRecognizer] as well, also using the #[code features] - | keyword argument of the constructor. diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade index 90a343700..186fc5db3 100644 --- a/website/docs/usage/visualizers.jade +++ b/website/docs/usage/visualizers.jade @@ -334,7 +334,7 @@ p | token #[code <script src="malicious-code.js"><script>]. | Instead of relying on the server to render and sanitize HTML, you | can do this on the client in JavaScript. displaCy.js creates - | the SVG markup as DOM nodes and will never insert raw HTML. + | the markup as DOM nodes and will never insert raw HTML. p | The #[code parse_deps] function takes a #[code Doc] object and returns diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade index eecb268b6..e5935cfb6 100644 --- a/website/docs/usage/word-vectors-similarities.jade +++ b/website/docs/usage/word-vectors-similarities.jade @@ -25,6 +25,8 @@ include _spacy-101/_word-vectors +h(2, "custom") Customising word vectors ++under-construction + p | By default, #[+api("token#vector") #[code Token.vector]] returns the | vector for its underlying #[+api("lexeme") #[code Lexeme]], while @@ -36,3 +38,5 @@ p | dictionaries. +h(2, "similarity") Similarity + ++under-construction From 93ee5c4a5242f971a8bfbfa823151391fe292a26 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 13:22:45 +0200 Subject: [PATCH 079/118] Update serialization info --- website/docs/usage/_spacy-101/_serialization.jade | 8 ++++---- website/docs/usage/v2.jade | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade index 35d931634..a763f422b 100644 --- a/website/docs/usage/_spacy-101/_serialization.jade +++ b/website/docs/usage/_spacy-101/_serialization.jade @@ -22,10 +22,10 @@ p | untrusted sources. p - | All container classes and pipeline components, i.e. 
- for cls in ["Doc", "Language", "Tokenizer", "Tagger", "DependencyParser", "EntityRecognizer", "Vocab", "StringStore"] - | #[+api(cls.toLowerCase()) #[code=cls]], - | have the following methods available: + | All container classes, i.e. #[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]], #[+api("vocab") #[code Vocab]] and + | #[+api("stringstore") #[code StringStore]] have the following methods + | available: +table(["Method", "Returns", "Example"]) - style = [1, 0, 1] diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 9bf32bf96..23b234c43 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -50,9 +50,10 @@ p p | spay's serialization API has been made consistent across classes and - | objects. All container classes and pipeline components now have a - | #[code to_bytes()], #[code from_bytes()], #[code to_disk()] and - | #[code from_disk()] method that supports the Pickle protocol. + | objects. All container classes, i.e. #[code Language], #[code Doc], + | #[code Vocab] and #[code StringStore] now have a #[code to_bytes()], + | #[code from_bytes()], #[code to_disk()] and #[code from_disk()] method + | that supports the Pickle protocol. p | The improved #[code spacy.load] makes loading models easier and more From 1b9c6ded718136500eed4bfa63051c7624e65fd1 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 13:40:32 +0200 Subject: [PATCH 080/118] Update API docs and add "source" button to GH source --- website/_includes/_page-docs.jade | 14 +++-- website/docs/api/_data.json | 45 +++++++++++----- website/docs/api/binder.jade | 5 ++ website/docs/api/language.jade | 5 ++ website/docs/api/tokenizer.jade | 87 ------------------------------- 5 files changed, 54 insertions(+), 102 deletions(-) create mode 100644 website/docs/api/binder.jade diff --git a/website/_includes/_page-docs.jade b/website/_includes/_page-docs.jade index ec2751c4d..26b82381f 100644 --- a/website/_includes/_page-docs.jade +++ b/website/_includes/_page-docs.jade @@ -6,9 +6,17 @@ include _sidebar main.o-main.o-main--sidebar.o-main--aside article.o-content - +h(1)=title - if tag - +tag=tag + +grid.o-no-block + +grid-col(source ? 
"two-thirds" : "full") + +h(1)=title + if tag + +tag=tag + + if source + +grid-col("third").u-text-right + .o-inline-list + +button(gh("spacy", source), false, "secondary").u-text-tag Source #[+icon("code", 14)] + if ALPHA +infobox("⚠️ You are viewing the spaCy v2.0 alpha docs") diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index f3f996846..f6a6a7e31 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -24,7 +24,8 @@ "Vocab": "vocab", "StringStore": "stringstore", "GoldParse": "goldparse", - "GoldCorpus": "goldcorpus" + "GoldCorpus": "goldcorpus", + "Binder": "binder" }, "Other": { "Annotation Specs": "annotation" @@ -47,62 +48,74 @@ "spacy": { "title": "spaCy top-level functions", + "source": "spacy/__init__.py", "next": "displacy" }, "displacy": { "title": "displaCy", "tag": "module", + "source": "spacy/displacy", "next": "util" }, "util": { "title": "Utility Functions", + "source": "spacy/util.py", "next": "cli" }, "cli": { - "title": "Command Line Interface" + "title": "Command Line Interface", + "source": "spacy/cli" }, "language": { "title": "Language", - "tag": "class" + "tag": "class", + "source": "spacy/language.py" }, "doc": { "title": "Doc", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/doc.pyx" }, "token": { "title": "Token", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/token.pyx" }, "span": { "title": "Span", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/span.pyx" }, "lexeme": { "title": "Lexeme", - "tag": "class" + "tag": "class", + "source": "spacy/lexeme.pyx" }, "vocab": { "title": "Vocab", - "tag": "class" + "tag": "class", + "source": "spacy/vocab.pyx" }, "stringstore": { "title": "StringStore", - "tag": "class" + "tag": "class", + "source": "spacy/strings.pyx" }, "matcher": { "title": "Matcher", - "tag": "class" + "tag": "class", + "source": "spacy/matcher.pyx" }, "dependenyparser": { @@ -122,7 +135,8 @@ "tokenizer": { "title": "Tokenizer", - "tag": "class" + "tag": "class", + "source": "spacy/tokenizer.pyx" }, "tagger": { @@ -132,11 +146,18 @@ "goldparse": { "title": "GoldParse", - "tag": "class" + "tag": "class", + "source": "spacy/gold.pyx" }, "goldcorpus": { "title": "GoldCorpus", + "tag": "class", + "source": "spacy/gold.pyx" + }, + + "binder": { + "title": "Binder", "tag": "class" }, diff --git a/website/docs/api/binder.jade b/website/docs/api/binder.jade new file mode 100644 index 000000000..5e3e7d36c --- /dev/null +++ b/website/docs/api/binder.jade @@ -0,0 +1,5 @@ +//- πŸ’« DOCS > API > BINDER + +include ../../_includes/_mixins + ++under-construction diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index a22bee5f1..9e45a89d9 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -382,6 +382,11 @@ p Load state from a binary string. +cell #[code Vocab] +cell A container for the lexical types. + +row + +cell #[code tokenizer] + +cell #[code Tokenizer] + +cell The tokenizer. + +row +cell #[code make_doc] +cell #[code lambda text: Doc] diff --git a/website/docs/api/tokenizer.jade b/website/docs/api/tokenizer.jade index 87e1ac81e..8d933f75b 100644 --- a/website/docs/api/tokenizer.jade +++ b/website/docs/api/tokenizer.jade @@ -198,93 +198,6 @@ p | attributes. The #[code ORTH] fields of the attributes must | exactly match the string when they are concatenated. -+h(2, "to_disk") Tokenizer.to_disk - +tag method - +tag-new(2) - -p Save the current state to a directory. - -+aside-code("Example"). 
- tokenizer.to_disk('/path/to/tokenizer') - -+table(["Name", "Type", "Description"]) - +row - +cell #[code path] - +cell unicode or #[code Path] - +cell - | A path to a directory, which will be created if it doesn't exist. - | Paths may be either strings or #[code Path]-like objects. - -+h(2, "from_disk") Tokenizer.from_disk - +tag method - +tag-new(2) - -p Loads state from a directory. Modifies the object in place and returns it. - -+aside-code("Example"). - from spacy.tokenizer import Tokenizer - tokenizer = Tokenizer(nlp.vocab) - tokenizer = tokenizer.from_disk('/path/to/tokenizer') - -+table(["Name", "Type", "Description"]) - +row - +cell #[code path] - +cell unicode or #[code Path] - +cell - | A path to a directory. Paths may be either strings or - | #[code Path]-like objects. - - +footrow - +cell returns - +cell #[code Tokenizer] - +cell The modified #[code Tokenizer] object. - -+h(2, "to_bytes") Tokenizer.to_bytes - +tag method - -p Serialize the current state to a binary string. - -+aside-code("Example"). - tokenizer_bytes = tokenizer.to_bytes() - -+table(["Name", "Type", "Description"]) - +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being serialized. - - +footrow - +cell returns - +cell bytes - +cell The serialized form of the #[code Tokenizer] object. - -+h(2, "from_bytes") Tokenizer.from_bytes - +tag method - -p Load state from a binary string. - -+aside-code("Example"). - fron spacy.tokenizer import Tokenizer - tokenizer_bytes = tokenizer.to_bytes() - new_tokenizer = Tokenizer(nlp.vocab) - new_tokenizer.from_bytes(tokenizer_bytes) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code bytes_data] - +cell bytes - +cell The data to load from. - - +row - +cell #[code **exclude] - +cell - - +cell Named attributes to prevent from being loaded. - - +footrow - +cell returns - +cell #[code Tokenizer] - +cell The #[code Tokenizer] object. - +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) From 1b982f083887e01a780b3845816828840a46e82c Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 14:02:38 +0200 Subject: [PATCH 081/118] Update train command and add docs on hyperparameters --- website/docs/api/cli.jade | 113 +++++++++++++++++++++-- website/docs/usage/adding-languages.jade | 2 +- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/website/docs/api/cli.jade b/website/docs/api/cli.jade index b78d4b7c9..30bd27e52 100644 --- a/website/docs/api/cli.jade +++ b/website/docs/api/cli.jade @@ -166,7 +166,7 @@ p | #[+a("/docs/api/annotation#json-input") JSON format]. +code(false, "bash"). - python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner] + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] +table(["Argument", "Type", "Description"]) +row @@ -192,18 +192,13 @@ p +row +cell #[code --n-iter], #[code -n] +cell option - +cell Number of iterations (default: #[code 15]). + +cell Number of iterations (default: #[code 20]). +row - +cell #[code --n_sents], #[code -ns] + +cell #[code --n-sents], #[code -ns] +cell option +cell Number of sentences (default: #[code 0]). - +row - +cell #[code --parser-L1], #[code -L] - +cell option - +cell L1 regularization penalty for parser (default: #[code 0.0]). - +row +cell #[code --use-gpu], #[code -G] +cell flag @@ -220,7 +215,7 @@ p +cell Don't train parser. 
+row - +cell #[code --no-ner], #[code -N] + +cell #[code --no-entities], #[code -N] +cell flag +cell Don't train NER. @@ -229,6 +224,106 @@ p +cell flag +cell Show help message and available arguments. ++h(3, "train-hyperparams") Environment variables for hyperparameters + +p + | spaCy lets you set hyperparameters for training via environment variables. + | This is useful, because it keeps the command simple and allows you to + | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") create an alias] + | for your custom #[code train] command while still being able to easily + | tweak the hyperparameters. For example: + ++code(false, "bash"). + parser_hidden_depth=2 parser_maxout_pieces=2 train-parser + ++under-construction + ++table(["Name", "Description", "Default"]) + +row + +cell #[code dropout_from] + +cell + +cell #[code 0.2] + + +row + +cell #[code dropout_to] + +cell + +cell #[code 0.2] + + +row + +cell #[code dropout_decay] + +cell + +cell #[code 0.0] + + +row + +cell #[code batch_from] + +cell + +cell #[code 1] + + +row + +cell #[code batch_to] + +cell + +cell #[code 64] + + +row + +cell #[code batch_compound] + +cell + +cell #[code 1.001] + + +row + +cell #[code token_vector_width] + +cell + +cell #[code 128] + + +row + +cell #[code embed_size] + +cell + +cell #[code 7500] + + +row + +cell #[code parser_maxout_pieces] + +cell + +cell #[code ] + + +row + +cell #[code parser_hidden_depth] + +cell + +cell #[code ] + + +row + +cell #[code hidden_width] + +cell + +cell #[code 128] + + +row + +cell #[code learn_rate] + +cell + +cell #[code 0.001] + + +row + +cell #[code optimizer_B1] + +cell + +cell #[code 0.9] + + +row + +cell #[code optimizer_B2] + +cell + +cell #[code 0.999] + + +row + +cell #[code optimizer_eps] + +cell + +cell #[code 1e-08] + + +row + +cell #[code L2_penalty] + +cell + +cell #[code 1e-06] + + +row + +cell #[code grad_norm_clip] + +cell + +cell #[code 1.0] + +h(2, "package") Package p diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index cd1fc4199..779e2e100 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -661,4 +661,4 @@ p | model use the using spaCy's #[+api("cli#train") #[code train]] command: +code(false, "bash"). - python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--n-sents] [--use-gpu] [--no-tagger] [--no-parser] [--no-entities] From 70afcfec3e6a38a28790a242f3c895b356d8393b Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 26 May 2017 14:04:31 +0200 Subject: [PATCH 082/118] Update defaults and example --- website/docs/api/cli.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/cli.jade b/website/docs/api/cli.jade index 30bd27e52..a0acf3e9a 100644 --- a/website/docs/api/cli.jade +++ b/website/docs/api/cli.jade @@ -234,7 +234,7 @@ p | tweak the hyperparameters. For example: +code(false, "bash"). 
- parser_hidden_depth=2 parser_maxout_pieces=2 train-parser + parser_hidden_depth=2 parser_maxout_pieces=1 train-parser +under-construction @@ -282,12 +282,12 @@ p +row +cell #[code parser_maxout_pieces] +cell - +cell #[code ] + +cell #[code 2] +row +cell #[code parser_hidden_depth] +cell - +cell #[code ] + +cell #[code 1] +row +cell #[code hidden_width] From daac3e3573c3661d604909ca56c61fcd8e2107eb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:30:52 -0500 Subject: [PATCH 083/118] Always shuffle gold data, and support length cap --- spacy/gold.pyx | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 579010e6d..558e4e008 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -198,15 +198,15 @@ class GoldCorpus(object): n += 1 return n - def train_docs(self, nlp, shuffle=0, gold_preproc=False, - projectivize=False): + def train_docs(self, nlp, gold_preproc=False, + projectivize=False, max_length=None): train_tuples = self.train_tuples if projectivize: train_tuples = nonproj.preprocess_training_data( self.train_tuples) - if shuffle: - random.shuffle(train_tuples) - gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) + random.shuffle(train_tuples) + gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, + max_length=max_length) yield from gold_docs def dev_docs(self, nlp, gold_preproc=False): @@ -215,7 +215,7 @@ class GoldCorpus(object): yield from gold_docs @classmethod - def iter_gold_docs(cls, nlp, tuples, gold_preproc): + def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None): for raw_text, paragraph_tuples in tuples: if gold_preproc: raw_text = None @@ -226,7 +226,8 @@ class GoldCorpus(object): gold_preproc) golds = cls._make_golds(docs, paragraph_tuples) for doc, gold in zip(docs, golds): - yield doc, gold + if not max_length or len(doc) < max_length: + yield doc, gold @classmethod def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): From 3d5a536eaa49a46a17156ea8ba996f43179a2e13 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:31:23 -0500 Subject: [PATCH 084/118] Improve efficiency of parser batching --- spacy/syntax/_state.pxd | 1 + spacy/syntax/arc_eager.pyx | 9 ++++- spacy/syntax/ner.pyx | 9 ++++- spacy/syntax/nn_parser.pyx | 55 ++++++++++++------------------ spacy/syntax/stateclass.pyx | 5 +++ spacy/syntax/transition_system.pyx | 28 +++++++++++++++ 6 files changed, 72 insertions(+), 35 deletions(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 829779dc1..4b2b47270 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -345,6 +345,7 @@ cdef cppclass StateC: this._s_i = src._s_i this._e_i = src._e_i this._break = src._break + this.offset = src.offset void fast_forward() nogil: # space token attachement policy: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 0a1422088..f7c1c7922 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -350,8 +350,15 @@ cdef class ArcEager(TransitionSystem): def __get__(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + def has_gold(self, GoldParse gold, start=0, end=None): + end = end or len(gold.heads) + if all([tag is None for tag in gold.heads[start:end]]): + return False + else: + return True + def preprocess_gold(self, GoldParse gold): - if all([h is None for h in gold.heads]): + if not self.has_gold(gold): return None for i in range(gold.length): if gold.heads[i] is None: # Missing values diff --git 
a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 74ab9c26c..af42eded4 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -95,8 +95,15 @@ cdef class BiluoPushDown(TransitionSystem): else: return MOVE_NAMES[move] + '-' + self.strings[label] + def has_gold(self, GoldParse gold, start=0, end=None): + end = end or len(gold.ner) + if all([tag == '-' for tag in gold.ner[start:end]]): + return False + else: + return True + def preprocess_gold(self, GoldParse gold): - if all([tag == '-' for tag in gold.ner]): + if not self.has_gold(gold): return None for i in range(gold.length): gold.c.ner[i] = self.lookup_transition(gold.ner[i]) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 341b8c041..35966d536 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -427,8 +427,7 @@ cdef class Parser: cuda_stream = get_cuda_stream() - states, golds = self._init_gold_batch(docs, golds) - max_length = min([len(doc) for doc in docs]) + states, golds, max_length = self._init_gold_batch(docs, golds) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) todo = [(s, g) for (s, g) in zip(states, golds) @@ -472,46 +471,36 @@ cdef class Parser: backprops, sgd, cuda_stream) return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) - def _init_gold_batch(self, docs, golds): + def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long doc will get multiple states. Let's say we have a doc of length 2*N, where N is the shortest doc. We'll make two states, one representing long_doc[:N], and another representing long_doc[N:].""" - cdef StateClass state - lengths = [len(doc) for doc in docs] - min_length = min(lengths) - offset = 0 + cdef: + StateClass state + Transition action + whole_states = self.moves.init_batch(whole_docs) + max_length = max(5, min(20, min([len(doc) for doc in whole_docs]))) states = [] - extra_golds = [] - cdef Pool mem = Pool() - costs = mem.alloc(self.moves.n_moves, sizeof(float)) - is_valid = mem.alloc(self.moves.n_moves, sizeof(int)) - for doc, gold in zip(docs, golds): + golds = [] + for doc, state, gold in zip(whole_docs, whole_states, whole_golds): gold = self.moves.preprocess_gold(gold) - state = StateClass(doc, offset=offset) - self.moves.initialize_state(state.c) - if not state.is_final(): - states.append(state) - extra_golds.append(gold) - start = min(min_length, len(doc)) + if gold is None: + continue + oracle_actions = self.moves.get_oracle_sequence(doc, gold) + start = 0 while start < len(doc): - length = min(min_length, len(doc)-start) - state = StateClass(doc, offset=offset) - self.moves.initialize_state(state.c) + state = state.copy() while state.B(0) < start and not state.is_final(): - self.moves.set_costs(is_valid, costs, state, gold) - for i in range(self.moves.n_moves): - if is_valid[i] and costs[i] <= 0: - self.moves.c[i].do(state.c, self.moves.c[i].label) - break - else: - raise ValueError("Could not find gold move") - start += length - if not state.is_final(): + action = self.moves.c[oracle_actions.pop(0)] + action.do(state.c, action.label) + has_gold = self.moves.has_gold(gold, start=start, + end=start+max_length) + if not state.is_final() and has_gold: states.append(state) - extra_golds.append(gold) - offset += len(doc) - return states, extra_golds + golds.append(gold) + start += min(max_length, len(doc)-start) + return states, golds, max_length def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): # 
Tells CUDA to block, so our async copies complete. diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index fd38710e7..228a3ff91 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -41,6 +41,11 @@ cdef class StateClass: def is_final(self): return self.c.is_final() + def copy(self): + cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length) + new_state.c.clone(self.c) + return new_state + def print_state(self, words): words = list(words) + ['_'] top = words[self.S(0)] + '_%d' % self.S_(0).head diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index d6750d09c..07102aeb0 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -61,6 +61,24 @@ cdef class TransitionSystem: offset += len(doc) return states + def get_oracle_sequence(self, doc, GoldParse gold): + cdef Pool mem = Pool() + costs = mem.alloc(self.n_moves, sizeof(float)) + is_valid = mem.alloc(self.n_moves, sizeof(int)) + + cdef StateClass state = StateClass(doc, offset=0) + self.initialize_state(state.c) + history = [] + while not state.is_final(): + self.set_costs(is_valid, costs, state, gold) + for i in range(self.n_moves): + if is_valid[i] and costs[i] <= 0: + action = self.c[i] + history.append(i) + action.do(state.c, action.label) + break + return history + cdef int initialize_state(self, StateC* state) nogil: pass @@ -92,11 +110,21 @@ cdef class TransitionSystem: StateClass stcls, GoldParse gold) except -1: cdef int i self.set_valid(is_valid, stcls.c) + cdef int n_gold = 0 for i in range(self.n_moves): if is_valid[i]: costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) + n_gold += costs[i] <= 0 else: costs[i] = 9000 + if n_gold <= 0: + print(gold.words) + print(gold.ner) + raise ValueError( + "Could not find a gold-standard action to supervise " + "the entity recognizer\n" + "The transition system has %d actions.\n" + "%s" % (self.n_moves)) def add_action(self, int action, label): if not isinstance(label, int): From 5a87bcf35f78a88173280918ab5908278ae8a7a6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:34 -0500 Subject: [PATCH 085/118] Fix converters --- spacy/cli/converters/iob2json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 45393dd80..c2e944c0a 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ...compat import json_dumps, path2str from ...util import prints +from ...gold import iob_to_biluo def iob2json(input_path, output_path, n_sents=10, *a, **k): @@ -29,9 +30,10 @@ def read_iob(file_): continue tokens = [t.rsplit('|', 2) for t in line.split()] words, pos, iob = zip(*tokens) + biluo = iob_to_biluo(iob) sentences.append([ {'orth': w, 'tag': p, 'ner': ent} - for (w, p, ent) in zip(words, pos, iob) + for (w, p, ent) in zip(words, pos, biluo) ]) sentences = [{'tokens': sent} for sent in sentences] paragraphs = [{'sentences': [sent]} for sent in sentences] From 2b3b937a04622d13e30204ff4553d6815a841289 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:41 -0500 Subject: [PATCH 086/118] Fix converter CLI --- spacy/cli/convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index c7730ab9e..e95ffd08b 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -7,7 
+7,6 @@ from pathlib import Path from .converters import conllu2json, iob2json from ..util import prints - # Converters are matched by file extension. To add a converter, add a new entry # to this dict with the file extension mapped to the converter function imported # from /converters. @@ -25,7 +24,7 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(input_file, output_dir, n_sents, morphology): +def convert(_, input_file, output_dir, n_sents, morphology): """Convert files into JSON format for use with train command and other experiment management functions. """ @@ -39,4 +38,5 @@ def convert(input_file, output_dir, n_sents, morphology): if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) - CONVERTERS[file_ext](input_path, output_path, *args) + CONVERTERS[file_ext](input_path, output_path, + n_sents=n_sents, morphology=morphology) From 2e587c641734c4110e0c0154ddc8e04c68a5a83f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 11:32:55 -0500 Subject: [PATCH 087/118] Export iob_to_biluo utility --- spacy/gold.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 579010e6d..f9500dbb6 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -305,7 +305,7 @@ def read_json_file(loc, docs_filter=None, limit=None): yield [paragraph.get('raw', None), sents] -def _iob_to_biluo(tags): +def iob_to_biluo(tags): out = [] curr_label = None tags = list(tags) From 3d22fcaf0b3c7e4114153b5b3e1d8eb078fa8e44 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 26 May 2017 14:02:59 -0500 Subject: [PATCH 088/118] Return None from parser if there are no annotations --- spacy/syntax/nn_parser.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 35966d536..b7aca26b8 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -432,6 +432,8 @@ cdef class Parser: 0.0) todo = [(s, g) for (s, g) in zip(states, golds) if not s.is_final() and g is not None] + if not todo: + return None backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) From 73a643d32a20d8c4a109bf3a92dff645c370bd17 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 08:20:13 -0500 Subject: [PATCH 089/118] Don't randomise pipeline for training, and don't update if no gradient --- spacy/language.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index e4c18f8ca..7adae0ed5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -212,18 +212,17 @@ class Language(object): """ tok2vec = self.pipeline[0] feats = tok2vec.doc2feats(docs) - procs = list(self.pipeline[1:]) - random.shuffle(procs) grads = {} def get_grads(W, dW, key=None): grads[key] = (W, dW) - for proc in procs: + for proc in self.pipeline[1:]: if not hasattr(proc, 'update'): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - bp_tokvecses(d_tokvecses, sgd=sgd) + if d_tokvecses is not None: + bp_tokvecses(d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. 
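A self-contained sketch of the IOB-to-BILUO conversion that the converter patches above now rely on, via the freshly exported iob_to_biluo helper; the example tags below are illustrative assumptions, not taken from any of the diffs:

    from spacy.gold import iob_to_biluo

    # Multi-token entities get explicit B-/L- boundaries, single-token
    # entities become U-, and plain 'O' tags pass through unchanged.
    iob_tags = ['O', 'B-PER', 'I-PER', 'O', 'B-LOC']
    assert iob_to_biluo(iob_tags) == ['O', 'B-PER', 'L-PER', 'O', 'U-LOC']
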
From de13fe030548acf86e759e2c16c85712ab8e30bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 08:20:32 -0500 Subject: [PATCH 090/118] Remove length cap on sentences --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b25cdcbd5..ed146cb24 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -70,12 +70,12 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) - print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") + print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") try: for i in range(n_iter): with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: train_docs = corpus.train_docs(nlp, projectivize=True, - gold_preproc=False, max_length=1000) + gold_preproc=False, max_length=0) losses = {} for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) From a8e58e04efc5b57a2425595eaf1e049c23a37352 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:10 +0200 Subject: [PATCH 091/118] Add symbols class to punctuation rules to handle emoji (see #1088) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently doesn't work for Hungarian, because of conflicts with the custom punctuation rules. Also doesn't take multi-character emoji like πŸ‘©πŸ½β€πŸ’» into account. --- spacy/lang/bn/punctuation.py | 10 +++++----- spacy/lang/char_classes.py | 5 +++-- spacy/lang/punctuation.py | 11 ++++++----- spacy/tests/tokenizer/test_exceptions.py | 12 +++++++++--- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index 66b7d967c..96485dd55 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,8 +1,8 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, UNITS -from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES, UNITS _currency = r"\$|Β’|Β£|€|Β₯|ΰΈΏ|৳" @@ -10,16 +10,16 @@ _quotes = QUOTES.replace("'", '') _list_punct = LIST_PUNCT + 'ΰ₯€ ΰ₯₯'.strip().split() -_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES) +_prefixes = ([r'\+'] + _list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS) -_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (_list_punct + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(_currency), r'(?<=[0-9])(?:{})'.format(UNITS), r'(?<=[{}(?:{})])\.'.format('|'.join([ALPHA_LOWER, r'%Β²\-\)\]\+', QUOTES]), _currency)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA), diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 5b81eddde..bec685646 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -20,7 +20,6 @@ _upper = [_latin_upper] _lower = [_latin_lower] _uncased = [_bengali, _hebrew] - ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) ALPHA_UPPER = 
merge_char_classes(_upper + _uncased) @@ -33,13 +32,14 @@ _currency = r'\$ Β£ € Β₯ ΰΈΏ US\$ C\$ A\$' _punct = r'… , : ; \! \? ΒΏ Β‘ \( \) \[ \] \{ \} < > _ # \* &' _quotes = r'\' \'\' " ” β€œ `` ` β€˜ Β΄ β€š , β€ž Β» Β«' _hyphens = '- – β€” -- ---' - +_other_symbols = r'[\p{So}]' UNITS = merge_chars(_units) CURRENCY = merge_chars(_currency) QUOTES = merge_chars(_quotes) PUNCT = merge_chars(_punct) HYPHENS = merge_chars(_hyphens) +ICONS = _other_symbols LIST_UNITS = split_chars(_units) LIST_CURRENCY = split_chars(_currency) @@ -47,3 +47,4 @@ LIST_QUOTES = split_chars(_quotes) LIST_PUNCT = split_chars(_punct) LIST_HYPHENS = split_chars(_hyphens) LIST_ELLIPSES = [r'\.\.+', '…'] +LIST_ICONS = [_other_symbols] diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index 74bb28f5f..680f5cff0 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -2,15 +2,16 @@ from __future__ import unicode_literals from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY -from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES -from .char_classes import CURRENCY, UNITS +from .char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS +from .char_classes import QUOTES, CURRENCY, UNITS _prefixes = (['Β§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + - LIST_CURRENCY) + LIST_CURRENCY + LIST_ICONS) -_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS + + ["'s", "'S", "’s", "’S"] + [r'(?<=[0-9])\+', r'(?<=Β°[FfCcKk])\.', r'(?<=[0-9])(?:{})'.format(CURRENCY), @@ -19,7 +20,7 @@ _suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QU r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)]) -_infixes = (LIST_ELLIPSES + +_infixes = (LIST_ELLIPSES + LIST_ICONS + [r'(?<=[0-9])[+\-\*^](?=[0-9-])', r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER), r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA), diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index aab27714e..70fb103dc 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,7 +1,4 @@ # coding: utf-8 -"""Test that tokenizer exceptions and emoticons are handled correctly.""" - - from __future__ import unicode_literals import pytest @@ -39,3 +36,12 @@ def test_tokenizer_handles_emoticons(tokenizer): def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length + + +@pytest.mark.parametrize('text,length', [('can you still dunk?πŸ•πŸ”πŸ˜΅LOL', 8), + ('iπŸ’™you', 3), ('🀘🀘yay!', 4)]) +def test_tokenizer_handles_emoji(tokenizer, text, length): + exceptions = ["hu"] + tokens = tokenizer(text) + if tokens[0].lang_ not in exceptions: + assert len(tokens) == length From e05bcd6aa838a7098c699a920e92628296961927 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:57:46 +0200 Subject: [PATCH 092/118] Update docs to reflect flattened model meta.json Don't use "setup" key and instead, keep "lang" on root level and add "pipeline". 
--- .../usage/language-processing-pipeline.jade | 22 ++++++++----------- website/docs/usage/saving-loading.jade | 18 +++++++-------- 2 files changed, 17 insertions(+), 23 deletions(-) diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index ce23a1666..1392fc2f8 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -19,19 +19,17 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/docs/usage/saving-loading#models-generating") meta.json] for its - | #[code setup] details. This typically includes the ID of a language class, + | #[+a("/docs/usage/saving-loading#models-generating") meta.json]. The + | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. spaCy then does the | following: +aside-code("meta.json (excerpt)", "json"). { "name": "example_model", + "lang": "en" "description": "Example model for spaCy", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +list("numbers") @@ -287,17 +285,15 @@ p p | In the model package's meta.json, specify the language class and pipeline - | IDs in #[code setup]: + | IDs: +code("meta.json (excerpt)", "json"). { - "name": "my_sentiment_model", + "name": "sentiment_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", - "setup": { - "lang": "en", - "pipeline": ["vectorizer", "sentiment"] - } + "pipeline": ["vectorizer", "sentiment"] } p @@ -307,7 +303,7 @@ p | by your custom #[code "sentiment"] factory. +code. - nlp = spacy.load('my_sentiment_model') + nlp = spacy.load('en_sentiment_model') doc = nlp(u'I love pizza') assert doc.sentiment diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 477db925c..1ecb7d7ee 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -74,16 +74,14 @@ p +aside-code("meta.json", "json"). { "name": "example_model", + "lang": "en", "version": "1.0.0", "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", "license": "CC BY-SA 3.0", - "setup": { - "lang": "en", - "pipeline": ["token_vectors", "tagger"] - } + "pipeline": ["token_vectors", "tagger"] } +code(false, "bash"). @@ -110,9 +108,9 @@ p +h(3, "models-custom") Customising the model setup p - | The meta.json includes a #[code setup] key that lets you customise how - | the model should be initialised and loaded. You can define the language - | data to be loaded and the + | The meta.json includes the model details, like name, requirements and + | license, and lets you customise how the model should be initialised and + | loaded. You can define the language data to be loaded and the | #[+a("/docs/usage/language-processing-pipeline") processing pipeline] to | execute. @@ -183,9 +181,9 @@ p p | To load a model from a data directory, you can use | #[+api("spacy#load") #[code spacy.load()]] with the local path. This will - | look for a meta.json in the directory and use the #[code setup] details - | to initialise a #[code Language] class with a processing pipeline and - | load in the model data. + | look for a meta.json in the directory and use the #[code lang] and + | #[code pipeline] settings to initialise a #[code Language] class with a + | processing pipeline and load in the model data. +code. 
nlp = spacy.load('/path/to/model') From 0d33ead507bfc79ac341fd9b0bbe3a1e8aacc1d9 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:06 +0200 Subject: [PATCH 093/118] Fix initialisation of Doc in lightning tour example --- website/docs/usage/lightning-tour.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 4a9a2315f..eefb7a11a 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -129,13 +129,14 @@ p +code. import spacy from spacy.tokens.doc import Doc + from spacy.vocab import Vocab nlp = spacy.load('en') moby_dick = open('moby_dick.txt', 'r') doc = nlp(moby_dick) doc.to_disk('/moby_dick.bin') - new_doc = Doc().from_disk('/moby_dick.bin') + new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] From 22bf5f63bfb4a37fc8b01724c121d2abbfecaf6e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 17:58:18 +0200 Subject: [PATCH 094/118] Update Matcher docs and add social media analysis example --- website/docs/usage/rule-based-matching.jade | 119 +++++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index a54b70b89..fde6da6ef 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -11,7 +11,7 @@ p | You can also associate patterns with entity IDs, to allow some basic | entity linking or disambiguation. -+aside("What about \"real\" regular expressions?") +//-+aside("What about \"real\" regular expressions?") +h(2, "adding-patterns") Adding patterns @@ -119,7 +119,7 @@ p +code. # Add a new custom flag to the vocab, which is always False by default. # BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span. - BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False) + BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False) def merge_and_flag(matcher, doc, i, matches): match_id, start, end = matches[i] @@ -221,7 +221,7 @@ p +cell match 0 or 1 times +cell optional, max one -+h(3, "quantifiers-example1") Quantifiers example: Using linguistic annotations ++h(2, "example1") Example: Using linguistic annotations p | Let's say you're analysing user comments and you want to find out what @@ -283,7 +283,7 @@ p # set manual=True to make displaCy render straight from a dictionary displacy.serve(matched_sents, style='ent', manual=True) -+h(3, "quantifiers-example2") Quantifiers example: Phone numbers ++h(2, "example2") Example: Phone numbers p | Phone numbers can have many different formats and matching them is often @@ -320,3 +320,114 @@ p | It'll produce more predictable results, is much easier to modify and | extend, and doesn't require any training data – only a set of | test cases. + ++h(2, "example3") Example: Hashtags and emoji on social media + +p + | Social media posts, especially tweets, can be difficult to work with. + | They're very short and often contain various emoji and hashtags. By only + | looking at the plain text, you'll lose a lot of valuable semantic + | information. + +p + | Let's say you've extracted a large sample of social media posts on a + | specific topic, for example posts mentioning a brand name or product. 
+ | As the first step of your data exploration, you want to filter out posts + | containing certain emoji and use them to assign a general sentiment + | score, based on whether the expressed emotion is positive or negative, + | e.g. #[span.o-icon.o-icon--inline πŸ˜€] or #[span.o-icon.o-icon--inline 😞]. + | You also want to find, merge and label hashtags like + | #[code #MondayMotivation], to be able to ignore or analyse them later. + ++aside("Note on sentiment analysis") + | Ultimately, sentiment analysis is not always #[em that] easy. In + | addition to the emoji, you'll also want to take specific words into + | account and check the #[code subtree] for intensifiers like "very", to + | increase the sentiment score. At some point, you might also want to train + | a sentiment model. However, the approach described in this example is + | very useful for #[strong bootstrapping rules to gather training data]. + | It's also an incredibly fast way to gather first insights into your data + | – with about 1 million tweets, you'd be looking at a processing time of + | #[strong under 1 minute]. + +p + | By default, spaCy's tokenizer will split emoji into separate tokens. This + | means that you can create a pattern for one or more emoji tokens. In this + | case, a sequence of identical emoji should be treated as one instance. + | Valid hashtags usually consist of a #[code #], plus a sequence of + | ASCII characters with no whitespace, making them easy to match as well. + ++code. + from spacy.lang.en import English + from spacy.matcher import Matcher + + nlp = English() # we only want the tokenizer, so no need to load a model + matcher = Matcher(nlp.vocab) + + pos_emoji = [u'πŸ˜€', u'πŸ˜ƒ', u'πŸ˜‚', u'🀣', u'😊', u'😍'] # positive emoji + neg_emoji = [u'😞', u'😠', u'😩', u'😒', u'😭', u'πŸ˜’'] # negative emoji + + # add patterns to match one or more emoji tokens + pos_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in pos_emoji] + neg_patterns = [[{'ORTH': emoji, 'OP': '+'}] for emoji in neg_emoji] + + matcher.add('HAPPY', label_sentiment, *pos_patterns) # add positive pattern + matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern + + # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token + matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}]) + +p + | Because the #[code on_match] callback receives the ID of each match, you + | can use the same function to handle the sentiment assignment for both + | the positive and negative pattern. To keep it simple, we'll either add + | or subtract #[code 0.1] points – this way, the score will also reflect + | combinations of emoji, even positive #[em and] negative ones. + +p + | With a library like + | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia], + | we can also retrieve a short description for each emoji – for example, + | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With + | Heart-Eyes". Assigning it to the merged token's norm will make it + | available as #[code token.norm_]. + ++code. 
+ from emojipedia import Emojipedia # installation: pip install emojipedia + + def label_sentiment(matcher, doc, i, matches): + match_id, start, end = matches[i] + if match_id is 'HAPPY': + doc.sentiment += 0.1 # add 0.1 for positive sentiment + elif match_id is 'SAD': + doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment + span = doc[start : end] + emoji = Emojipedia.search(span[0].text) # get data for emoji + span.merge(norm=emoji.title) # merge span and set NORM to emoji title + +p + | To label the hashtags, we first need to add a new custom flag. + | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it + | to the hashtag's span, and check its value via a token's + | #[+api("token#check_flag") #[code code check_flag()]] method. On each + | match, we merge the hashtag and assign the flag. + ++code. + # Add a new custom flag to the vocab, which is always False by default + IS_HASHTAG = nlp.vocab.add_flag(lambda text: False) + + def merge_hashtag(matcher, doc, i, matches): + match_id, start, end = matches[i] + span = doc[start : end] + span.merge() # merge hashtag + span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True + +p + | To process a stream of social media posts, we can use + | #[+api("language#pipe") #[code Language.pipe()]], which will return a + | stream of #[code Doc] objects that we can pass to + | #[+api("matcher#pipe") #[code Matcher.pipe()]]. + ++code. + docs = nlp.pipe(LOTS_OF_TWEETS) + matches = matcher.pipe(docs) From 086a06e7d750da5852a447effdb32a376bd86ec7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:01:46 +0200 Subject: [PATCH 095/118] Fix CLI docstrings and add command as first argument Workaround for Plac --- spacy/__init__.py | 6 +++++- spacy/cli/convert.py | 5 +++-- spacy/cli/download.py | 7 ++++--- spacy/cli/info.py | 2 +- spacy/cli/link.py | 5 +++-- spacy/cli/package.py | 5 +++-- spacy/cli/train.py | 6 ++++-- 7 files changed, 23 insertions(+), 13 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 8dc0937f5..6beb7955e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import importlib from .compat import basestring_ -from .cli.info import info +from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name from . import util @@ -20,3 +20,7 @@ def load(name, **overrides): overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) + + +def info(model=None, markdown=False): + return cli_info(None, model, markdown) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..82b39bba2 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -24,8 +24,9 @@ CONVERTERS = { n_sents=("Number of sentences per doc", "option", "n", float), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(_, input_file, output_dir, n_sents, morphology): - """Convert files into JSON format for use with train command and other +def convert(cmd, input_file, output_dir, n_sents, morphology): + """ + Convert files into JSON format for use with train command and other experiment management functions. """ input_path = Path(input_file) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index fdcacb891..b6e5549da 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,8 +17,9 @@ from .. import about direct=("force direct download. 
Needs model name with version and won't " "perform compatibility check", "flag", "d", bool) ) -def download(model, direct=False): - """Download compatible model from default download path using pip. Model +def download(cmd, model, direct=False): + """ + Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name with version. """ @@ -31,7 +32,7 @@ def download(model, direct=False): version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) try: - link(model_name, model, force=True) + link(None, model_name, model, force=True) except: # Dirty, but since spacy.download and the auto-linking is mostly # a convenience wrapper, it's best to show a success message and diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 6f7467521..75aac10c7 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -14,7 +14,7 @@ from .. import util model=("optional: shortcut link of model", "positional", None, str), markdown=("generate Markdown for GitHub issues", "flag", "md", str) ) -def info(model=None, markdown=False): +def info(cmd, model=None, markdown=False): """Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 1feef8bce..9aecdabfe 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -14,8 +14,9 @@ from .. import util link_name=("name of shortuct link to create", "positional", None, str), force=("force overwriting of existing link", "flag", "f", bool) ) -def link(origin, link_name, force=False): - """Create a symlink for models within the spacy/data directory. Accepts +def link(cmd, origin, link_name, force=False): + """ + Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9acd0a2fa..1c3128d99 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -18,8 +18,9 @@ from .. import about meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) -def package(input_dir, output_dir, meta, force): - """Generate Python package for model data, including meta and required +def package(cmd, input_dir, output_dir, meta=None, force=False): + """ + Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ diff --git a/spacy/cli/train.py b/spacy/cli/train.py index ed146cb24..25b53e49d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -32,9 +32,11 @@ from .. import displacy no_parser=("Don't train parser", "flag", "P", bool), no_entities=("Don't train NER", "flag", "N", bool) ) -def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, +def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, use_gpu=False, no_tagger=False, no_parser=False, no_entities=False): - """Train a model. Expects data in spaCy's JSON format.""" + """ + Train a model. Expects data in spaCy's JSON format. 
+ """ n_sents = n_sents or None output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) From 1203959625954fc1164485883ff49e9b5f3b43c3 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:01 +0200 Subject: [PATCH 096/118] Add pipeline setting to meta.json generator --- spacy/cli/package.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1c3128d99..e78a4eeb4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False): meta = util.read_json(meta_path) else: meta = generate_meta() - validate_meta(meta, ['lang', 'name', 'version']) + meta = validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -86,20 +86,32 @@ def generate_meta(): ('email', 'Author email', False), ('url', 'Author website', False), ('license', 'License', 'CC BY-NC 3.0')] - prints("Enter the package settings for your model.", title="Generating meta.json") meta = {} for setting, desc, default in settings: response = util.get_raw_input(desc, default) meta[setting] = default if response == '' and default else response + meta['pipeline'] = generate_pipeline() return meta +def generate_pipeline(): + prints("If set to 'True', the default pipeline is used. If set to 'False', " + "the pipeline will be disabled. Components should be specified as a " + "comma-separated list of component names, e.g. vectorizer, tagger, " + "parser, ner. For more information, see the docs on processing pipelines.", + title="Enter your model's pipeline components") + pipeline = util.get_raw_input("Pipeline components", True) + replace = {'True': True, 'False': False} + return replace[pipeline] if pipeline in replace else pipeline.split(', ') + + def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", title='No "%s" setting found in meta.json' % key, exits=1) + return meta def get_template(filepath): From ae11c8d60f07f5f9257a347f51b72d93aaea3699 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 27 May 2017 20:02:20 +0200 Subject: [PATCH 097/118] Add emoji sentiment to lightning tour matcher example --- website/docs/usage/lightning-tour.jade | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index eefb7a11a..7de486070 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -149,9 +149,14 @@ p nlp = spacy.load('en') matcher = Matcher(nlp.vocab) - # match "Google I/O" or "Google i/o" - pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] - matcher.add('GoogleIO', None, pattern) + + def set_sentiment(matcher, doc, i, matches): + doc.sentiment += 0.1 + + pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] + pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['πŸ˜€', 'πŸ˜‚', '🀣', '😍']] + matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o" + matcher.add('HAPPY', set_sentiment, pattern2) # match one or more happy emoji matches = nlp(LOTS_OF TEXT) +infobox From 7cc9c3e9a6f28422485eb2a054d12850481aeb71 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:44:42 -0500 Subject: [PATCH 098/118] Fix convert CLI --- spacy/cli/convert.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index e95ffd08b..ac608a64a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -39,4 +39,4 @@ def convert(_, input_file, output_dir, n_sents, morphology): prints("Can't find converter for %s" % input_path.parts[-1], title="Unknown format", exits=1) CONVERTERS[file_ext](input_path, output_path, - n_sents=n_sents, morphology=morphology) + n_sents=n_sents, use_morphology=morphology) From 34bbad8e0e115e412e857c71d5f4d0b3ab339681 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:46:06 -0500 Subject: [PATCH 099/118] Add __reduce__ methods on parser subclasses. Fixes pickling. --- spacy/pipeline.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 98b79d709..724891c9b 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -335,6 +335,9 @@ cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + def __reduce__(self): + return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None) + cdef class NeuralEntityRecognizer(NeuralParser): name = 'entity' @@ -342,6 +345,10 @@ cdef class NeuralEntityRecognizer(NeuralParser): nr_feature = 6 + def __reduce__(self): + return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None) + + cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager From 5e4312feede7c2511b4d61a5723077c1b16c142d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:47:02 -0500 Subject: [PATCH 100/118] Evaluate loaded class, to ensure save/load works --- spacy/cli/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index b25cdcbd5..7bbda5a47 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -84,11 +84,11 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, pbar.update(len(docs)) with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) with (output_path / ('model%d.pickle' % i)).open('wb') as file_: dill.dump(nlp, file_, -1) - - + with (output_path / ('model%d.pickle' % i)).open('rb') as file_: + nlp_loaded = dill.load(file_) + scorer = nlp_loaded.evaluate(corpus.dev_docs(nlp_loaded, gold_preproc=False)) print_progress(i, losses, scorer.scores) finally: print("Saving model...") From 655ca58c16880c50661039c4db7181b4700cd0e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:49:37 -0500 Subject: [PATCH 101/118] Clarifying change to StateC.clone --- spacy/syntax/_state.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 4b2b47270..0b29412bf 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -335,17 +335,18 @@ cdef cppclass StateC: this._break = this._b_i void clone(const StateC* src) nogil: + this.length = src.length memcpy(this._sent, src._sent, this.length * sizeof(TokenC)) memcpy(this._stack, src._stack, this.length * sizeof(int)) memcpy(this._buffer, src._buffer, this.length * sizeof(int)) memcpy(this._ents, src._ents, this.length * sizeof(Entity)) memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0])) - this.length = src.length this._b_i = src._b_i this._s_i = src._s_i this._e_i = src._e_i this._break = src._break this.offset = src.offset + this._empty_token = src._empty_token void fast_forward() 
nogil: # space token attachement policy: From 99316fa631efd86a5ab5d68b11654c7366ece650 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:21 -0500 Subject: [PATCH 102/118] Use ordered dict to specify actions --- spacy/syntax/arc_eager.pyx | 14 ++++++++------ spacy/syntax/ner.pyx | 31 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index f7c1c7922..2e424c1a9 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -9,6 +9,7 @@ import ctypes from libc.stdint cimport uint32_t from libc.string cimport memcpy from cymem.cymem cimport Pool +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC, is_space_token @@ -312,12 +313,13 @@ cdef class ArcEager(TransitionSystem): @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - SHIFT: [''], - REDUCE: [''], - RIGHT: [], - LEFT: [], - BREAK: ['ROOT']}) + OrderedDict(( + (SHIFT, ['']), + (REDUCE, ['']), + (RIGHT, []), + (LEFT, []), + (BREAK, ['ROOT']) + ))) seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index af42eded4..f8db0a433 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -2,6 +2,7 @@ from __future__ import unicode_literals from thinc.typedefs cimport weight_t +from collections import OrderedDict from .stateclass cimport StateClass from ._state cimport StateC @@ -51,17 +52,29 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: cdef class BiluoPushDown(TransitionSystem): + def __init__(self, *args, **kwargs): + TransitionSystem.__init__(self, *args, **kwargs) + + def __reduce__(self): + labels_by_action = OrderedDict() + cdef Transition t + for trans in self.c[:self.n_moves]: + label_str = self.strings[trans.label] + labels_by_action.setdefault(trans.move, []).append(label_str) + return (BiluoPushDown, (self.strings, labels_by_action), + None, None) + @classmethod def get_actions(cls, **kwargs): actions = kwargs.get('actions', - { - MISSING: [''], - BEGIN: [], - IN: [], - LAST: [], - UNIT: [], - OUT: [''] - }) + OrderedDict(( + (MISSING, ['']), + (BEGIN, []), + (IN, []), + (LAST, []), + (UNIT, []), + (OUT, ['']) + ))) seen_entities = set() for entity_type in kwargs.get('entity_types', []): if entity_type in seen_entities: @@ -90,7 +103,7 @@ cdef class BiluoPushDown(TransitionSystem): def move_name(self, int move, int label): if move == OUT: return 'O' - elif move == 'MISSING': + elif move == MISSING: return 'M' else: return MOVE_NAMES[move] + '-' + self.strings[label] From 8de9829f094fbf1ed418c527236218667baa1989 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:50:40 -0500 Subject: [PATCH 103/118] Don't overwrite model in initialization, when loading --- spacy/_ml.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index f589704a6..ac7849bbb 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -19,6 +19,8 @@ import numpy def _init_for_precomputed(W, ops): + if (W**2).sum() != 0.: + return reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2])) ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) @@ -247,6 +249,7 @@ def doc2feats(cols=None): model.cols = cols return model + def print_shape(prefix): def forward(X, drop=0.): return X, lambda dX, **kwargs: dX From 3eea5383a1adc179ed7d7feb2c957b1d78f0171b Mon Sep 17 00:00:00 2001 
From: Matthew Honnibal Date: Sat, 27 May 2017 15:51:55 -0500 Subject: [PATCH 104/118] Add move_names property to parser --- spacy/syntax/nn_parser.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 35966d536..6db6e5ae1 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -518,6 +518,14 @@ cdef class Parser: xp.add.at(d_tokvecs, ids, d_state_features * active_feats) + @property + def move_names(self): + names = [] + for i in range(self.moves.n_moves): + name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) + names.append(name) + return names + def get_batch_model(self, batch_size, tokvecs, stream, dropout): lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, From 7ebd26b8aae34464c3b02cbc9b497bfe0ebfa7d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 15:52:20 -0500 Subject: [PATCH 105/118] Use ordered dict to specify transitions --- spacy/syntax/transition_system.pyx | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 07102aeb0..211b2c950 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -5,7 +5,7 @@ from __future__ import unicode_literals from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t -from collections import defaultdict +from collections import defaultdict, OrderedDict from ..structs cimport TokenC from .stateclass cimport StateClass @@ -26,7 +26,7 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL: cdef class TransitionSystem: - def __init__(self, StringStore string_table, dict labels_by_action): + def __init__(self, StringStore string_table, labels_by_action): self.mem = Pool() self.strings = string_table self.n_moves = 0 @@ -34,14 +34,14 @@ cdef class TransitionSystem: self.c = self.mem.alloc(self._size, sizeof(Transition)) - for action, label_strs in sorted(labels_by_action.items()): + for action, label_strs in labels_by_action.items(): for label_str in label_strs: self.add_action(int(action), label_str) self.root_label = self.strings['ROOT'] self.init_beam_state = _init_state def __reduce__(self): - labels_by_action = {} + labels_by_action = OrderedDict() cdef Transition t for trans in self.c[:self.n_moves]: label_str = self.strings[trans.label] @@ -77,6 +77,11 @@ cdef class TransitionSystem: history.append(i) action.do(state.c, action.label) break + else: + print(gold.words) + print(gold.ner) + print(history) + raise ValueError("Could not find gold move") return history cdef int initialize_state(self, StateC* state) nogil: From b03fb2d7b068f4752fda7cb5783d3c08dd0adb63 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:16 +0200 Subject: [PATCH 106/118] Update 101 and usage docs --- website/assets/img/docs/pipeline.svg | 2 +- website/docs/usage/_spacy-101/_vocab-stringstore.jade | 4 +++- website/docs/usage/lightning-tour.jade | 2 ++ website/docs/usage/rule-based-matching.jade | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index e42c2362f..2ff00d787 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -2,7 +2,7 @@ diff --git a/website/docs/usage/_spacy-101/_vocab-stringstore.jade 
b/website/docs/usage/_spacy-101/_vocab-stringstore.jade index 3f551c9e1..dd300b5b9 100644 --- a/website/docs/usage/_spacy-101/_vocab-stringstore.jade +++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade @@ -89,4 +89,6 @@ p p | Even though both #[code Doc] objects contain the same words, the internal - | integer IDs are very different. + | integer IDs are very different. The same applies for all other strings, + | like the annotation scheme. To avoid mismatched IDs, spaCy will always + | export the vocab if you save a #[code Doc] or #[code nlp] object. diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 7de486070..8cf651be0 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -139,6 +139,8 @@ p new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox + | #[strong API:] #[+api("language") #[code Language]], + | #[+api("doc") #[code Doc]] | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "rule-matcher") Match text with token rules diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade index fde6da6ef..1fd398ad9 100644 --- a/website/docs/usage/rule-based-matching.jade +++ b/website/docs/usage/rule-based-matching.jade @@ -345,7 +345,7 @@ p | account and check the #[code subtree] for intensifiers like "very", to | increase the sentiment score. At some point, you might also want to train | a sentiment model. However, the approach described in this example is - | very useful for #[strong bootstrapping rules to gather training data]. + | very useful for #[strong bootstrapping rules to collect training data]. | It's also an incredibly fast way to gather first insights into your data | – with about 1 million tweets, you'd be looking at a processing time of | #[strong under 1 minute]. From db116cbedabccb65a100898a3d285e1c2ee804a6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:03:31 +0200 Subject: [PATCH 107/118] Update tokenization 101 and add illustration --- website/assets/img/docs/tokenization.svg | 123 ++++++++++++++++++ .../docs/usage/_spacy-101/_tokenization.jade | 44 +++++++ website/docs/usage/spacy-101.jade | 7 +- 3 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 website/assets/img/docs/tokenization.svg diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg new file mode 100644 index 000000000..cc185a3a7 --- /dev/null +++ b/website/assets/img/docs/tokenization.svg @@ -0,0 +1,123 @@ + + + + + β€œLet’s + + + go + + + to + + + N.Y.!” + + + β€œ + + + Let’s + + + go + + + to + + + N.Y.!” + + β€œ + + + Let + + + go + + + to + + + N.Y.!” + + + ’s + + + β€œ + + + Let + + + go + + + to + + + N.Y.! + + + ’s + + + ” + + + β€œ + + + Let + + + go + + + to + + + N.Y. + + + ’s + + + ” + + + ! + + β€œ + + Let + + go + + to + + N.Y. + + ’s + + ” + + ! + + EXCEPTION + + PREFIX + + SUFFIX + + SUFFIX + + EXCEPTION + + DONE + diff --git a/website/docs/usage/_spacy-101/_tokenization.jade b/website/docs/usage/_spacy-101/_tokenization.jade index 64e3f5881..95a9cc520 100644 --- a/website/docs/usage/_spacy-101/_tokenization.jade +++ b/website/docs/usage/_spacy-101/_tokenization.jade @@ -16,3 +16,47 @@ p +row for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"] +cell=cell + +p + | Fist, the raw text is split on whitespace characters, similar to + | #[code text.split(' ')]. 
Then, the tokenizer processes the text from + | left to right. On each substring, it performs two checks: + ++list("numbers") + +item + | #[strong Does the substring match a tokenizer exception rule?] For + | example, "don't" does not contain whitespace, but should be split + | into two tokens, "do" and "n't", while "U.K." should always + | remain one token. + +item + | #[strong Can a prefix, suffix or infixes be split off?]. For example + | punctuation like commas, periods, hyphens or quotes. + +p + | If there's a match, the rule is applied and the tokenizer continues its + | loop, starting with the newly split substrings. This way, spaCy can split + | #[strong complex, nested tokens] like combinations of abbreviations and + | multiple punctuation marks. + ++aside + | #[strong Tokenizer exception:] Special-case rule to split a string into + | several tokens or prevent a token from being split when punctuation rules + | are applied.#[br] + | #[strong Prefix:] Character(s) at the beginning, e.g. + | #[code $], #[code (], #[code β€œ], #[code ΒΏ].#[br] + | #[strong Suffix:] Character(s) at the end, e.g. + | #[code km], #[code )], #[code ”], #[code !].#[br] + | #[strong Infix:] Character(s) in between, e.g. + | #[code -], #[code --], #[code /], #[code …].#[br] + ++image + include ../../../assets/img/docs/tokenization.svg + .u-text-right + +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic + +p + | While punctuation rules are usually pretty general, tokenizer exceptions + | strongly depend on the specifics of the individual language. This is + | why each #[+a("/docs/api/language-models") available language] has its + | own subclass like #[code English] or #[code German], that loads in lists + | of hard-coded data and exception rules. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 7c6525004..8b2d0c17e 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -94,9 +94,10 @@ p include _spacy-101/_tokenization +infobox - | To learn more about how spaCy's tokenizer and its rules work in detail, - | how to #[strong customise] it and how to #[strong add your own tokenizer] - | to a processing pipeline, see the usage guide on + | To learn more about how spaCy's tokenization rules work in detail, + | how to #[strong customise and replace] the default tokenizer and how to + | #[strong add language-specific data], see the usage guides on + | #[+a("/docs/usage/adding-languages") adding languages] and | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer]. +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies From c8543c823792710dae5b0c6d77dc31c53fec177c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:04:04 +0200 Subject: [PATCH 108/118] Fix formatting and docstrings and remove deprecated function --- spacy/util.py | 22 +++++++++------------- spacy/vocab.pyx | 2 -- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index e42bde810..a30b35a06 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -177,10 +177,13 @@ def get_async(stream, numpy_array): def itershuffle(iterable, bufsize=1000): """Shuffle an iterator. This works by holding `bufsize` items back - and yielding them sometime later. Obviously, this is not unbiased -- + and yielding them sometime later. Obviously, this is not unbiased – but should be good enough for batching. Larger bufsize means less bias. 
- From https://gist.github.com/andres-erbsen/1307752 + + iterable (iterable): Iterator to shuffle. + bufsize (int): Items to hold back. + YIELDS (iterable): The shuffled iterator. """ iterable = iter(iterable) buf = [] @@ -315,17 +318,16 @@ def normalize_slice(length, start, stop, step=None): def compounding(start, stop, compound): - '''Yield an infinite series of compounding values. Each time the + """Yield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. - EXAMPLE - + EXAMPLE: >>> sizes = compounding(1., 10., 1.5) >>> assert next(sizes) == 1. >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 - ''' + """ def clip(value): return max(value, stop) if (start>stop) else min(value, stop) curr = float(start) @@ -335,7 +337,7 @@ def compounding(start, stop, compound): def decaying(start, stop, decay): - '''Yield an infinite series of linearly decaying values.''' + """Yield an infinite series of linearly decaying values.""" def clip(value): return max(value, stop) if (start>stop) else min(value, stop) nr_upd = 1. @@ -344,12 +346,6 @@ def decaying(start, stop, decay): nr_upd += 1 -def check_renamed_kwargs(renamed, kwargs): - for old, new in renamed.items(): - if old in kwargs: - raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) - - def read_json(location): """Open and load JSON from file. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d7d27a3e4..55fde0123 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -53,8 +53,6 @@ cdef class Vocab: vice versa. RETURNS (Vocab): The newly constructed vocab object. """ - util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) - lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} tag_map = tag_map if tag_map is not None else {} if lemmatizer in (None, True, False): From c1983621fbe34659b9243b1af603ed9b85495ac6 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:22:00 +0200 Subject: [PATCH 109/118] Update util functions for model loading --- spacy/__init__.py | 12 +--- spacy/cli/info.py | 10 +++- spacy/cli/link.py | 2 +- spacy/util.py | 111 +++++++++++++++++++++++++------------ website/docs/api/util.jade | 90 ++++++++++++++++-------------- 5 files changed, 132 insertions(+), 93 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 6beb7955e..f9e29037f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,9 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import importlib - -from .compat import basestring_ from .cli.info import info as cli_info from .glossary import explain from .deprecated import resolve_load_name @@ -12,14 +9,7 @@ from . import util def load(name, **overrides): name = resolve_load_name(name, **overrides) - model_path = util.resolve_model_path(name) - meta = util.parse_package_meta(model_path) - if 'lang' not in meta: - raise IOError('No language setting found in model meta.') - cls = util.get_lang_class(meta['lang']) - overrides['meta'] = meta - overrides['path'] = model_path - return cls(**overrides) + return util.load_model(name) def info(model=None, markdown=False): diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 75aac10c7..70f054d84 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False): prints details in Markdown for easy copy-pasting to GitHub issues. 
""" if model: - model_path = util.resolve_model_path(model) - meta = util.parse_package_meta(model_path) + if util.is_package(model): + model_path = util.get_package_path(model) + else: + model_path = util.get_data_path() / model + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + prints(meta_path, title="Can't find model meta.json", exits=1) + meta = read_json(meta_path) if model_path.resolve() != model_path: meta['link'] = path2str(model_path) meta['source'] = path2str(model_path.resolve()) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 9aecdabfe..66824c042 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False): directory. Linking models allows loading them via spacy.load(link_name). """ if util.is_package(origin): - model_path = util.get_model_package_path(origin) + model_path = util.get_package_path(model) else: model_path = Path(origin) if not model_path.exists(): diff --git a/spacy/util.py b/spacy/util.py index a30b35a06..25fe198f4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -78,27 +78,86 @@ def ensure_path(path): return path -def resolve_model_path(name): - """Resolve a model name or string to a model path. +def load_model(name): + """Load a model from a shortcut link, package or data path. name (unicode): Package name, shortcut link or model path. - RETURNS (Path): Path to model data directory. + RETURNS (Language): `Language` class with the loaded model. """ data_path = get_data_path() if not data_path or not data_path.exists(): raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) if isinstance(name, basestring_): - if (data_path / name).exists(): # in data dir or shortcut link - return (data_path / name) - if is_package(name): # installed as a package - return get_model_package_path(name) - if Path(name).exists(): # path to model - return Path(name) - elif hasattr(name, 'exists'): # Path or Path-like object - return name + if (data_path / name).exists(): # in data dir or shortcut + return load_model_from_path(data_path / name) + if is_package(name): # installed as package + return load_model_from_pkg(name) + if Path(name).exists(): # path to model data directory + return load_data_from_path(Path(name)) + elif hasattr(name, 'exists'): # Path or Path-like to model data + return load_data_from_path(name) raise IOError("Can't find model '%s'" % name) +def load_model_from_init_py(init_file): + """Helper function to use in the `load()` method of a model package's + __init__.py. + + init_file (unicode): Path to model's __init__.py, i.e. `__file__`. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = Path(init_file).parent + return load_data_from_path(model_path, package=True) + + +def load_model_from_path(model_path): + """Import and load a model package from its file path. + + path (unicode or Path): Path to package directory. + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + spec = importlib.util.spec_from_file_location('model', model_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module.load() + + +def load_model_from_pkg(name): + """Import and load a model package. + + name (unicode): Name of model package installed via pip. + RETURNS (Language): `Language` class with loaded model. 
+ """ + module = importlib.import_module(name) + return module.load() + + +def load_data_from_path(model_path, package=False): + """Initialie a `Language` class with a loaded model from a model data path. + + model_path (unicode or Path): Path to model data directory. + package (bool): Does the path point to the parent package directory? + RETURNS (Language): `Language` class with loaded model. + """ + model_path = ensure_path(model_path) + meta_path = model_path / 'meta.json' + if not meta_path.is_file(): + raise IOError("Could not read meta.json from %s" % location) + meta = read_json(location) + for setting in ['lang', 'name', 'version']: + if setting not in meta: + raise IOError('No %s setting found in model meta.json' % setting) + if package: + model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) + model_path = model_path / model_data_path + if not model_path.exists(): + raise ValueError("Can't find model directory: %s" % path2str(model_path)) + cls = get_lang_class(meta['lang']) + nlp = cls(pipeline=meta.get('pipeline', True)) + return nlp.from_disk(model_path) + + def is_package(name): """Check if string maps to a package installed via pip. @@ -112,36 +171,16 @@ def is_package(name): return False -def get_model_package_path(package_name): - """Get path to a model package installed via pip. +def get_package_path(name): + """Get the path to an installed package. - package_name (unicode): Name of installed package. - RETURNS (Path): Path to model data directory. + name (unicode): Package name. + RETURNS (Path): Path to installed package. """ # Here we're importing the module just to find it. This is worryingly # indirect, but it's otherwise very difficult to find the package. - # Python's installation and import rules are very complicated. pkg = importlib.import_module(package_name) - package_path = Path(pkg.__file__).parent.parent - meta = parse_package_meta(package_path / package_name) - model_name = '%s-%s' % (package_name, meta['version']) - return package_path / package_name / model_name - - -def parse_package_meta(package_path, require=True): - """Check if a meta.json exists in a package and return its contents. - - package_path (Path): Path to model package directory. - require (bool): If True, raise error if no meta.json is found. - RETURNS (dict or None): Model meta.json data or None. - """ - location = package_path / 'meta.json' - if location.is_file(): - return read_json(location) - elif require: - raise IOError("Could not read meta.json from %s" % location) - else: - return None + return Path(pkg.__file__).parent def is_in_jupyter(): diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index 717abf34a..3e132b7b4 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -1,12 +1,10 @@ -//- πŸ’« DOCS > API > ANNOTATION SPECS +//- πŸ’« DOCS > API > UTIL include ../../_includes/_mixins p | spaCy comes with a small collection of utility functions located in | #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py]. - -+infobox("Important note") | Because utility functions are mostly intended for | #[strong internal use within spaCy], their behaviour may change with | future releases. The functions documented on this page should be safe @@ -74,15 +72,23 @@ p +cell #[code Language] +cell Language class. -+h(2, "resolve_model_path") util.resolve_model_path ++h(2, "load_model") util.load_model +tag function +tag-new(2) -p Resolve a model name or string to a model path. 
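For reference, here is a minimal sketch of the meta.json and directory layout that the new `load_data_from_path()` above looks for when called with `package=True`. The language, name, version and pipeline values are made up for illustration; only the 'lang', 'name' and 'version' keys are strictly required:

    import json
    from pathlib import Path

    # hypothetical package metadata (illustrative values only)
    meta = {'lang': 'en', 'name': 'core_web_sm', 'version': '2.0.0',
            'pipeline': ['tagger', 'parser', 'ner']}

    package_dir = Path('en_core_web_sm')
    # with package=True, the model data is expected in a subdirectory named
    # '%s_%s-%s' % (lang, name, version), i.e. 'en_core_web_sm-2.0.0'
    data_dir = package_dir / ('%s_%s-%s' % (meta['lang'], meta['name'], meta['version']))
    data_dir.mkdir(parents=True, exist_ok=True)
    with (package_dir / 'meta.json').open('w', encoding='utf8') as file_:
        json.dump(meta, file_)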
+p + | Load a model from a shortcut link, package or data path. If called with a + | shortcut link or package name, spaCy will assume the model is a Python + | package and import and call its #[code load()] method. If called with a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings from the meta.json and initialise a #[code Language] + | class. The model data will then be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). - model_path = util.resolve_model_path('en') - model_path = util.resolve_model_path('/path/to/en') + nlp = util.load_model('en') + nlp = util.load_model('en_core_web_sm') + nlp = util.load_model('/path/to/data') +table(["Name", "Type", "Description"]) +row @@ -92,8 +98,33 @@ p Resolve a model name or string to a model path. +footrow +cell returns - +cell #[code Path] - +cell Path to model data directory. + +cell #[code Language] + +cell #[code Language] class with the loaded model. + ++h(2, "load_model_from_init_py") util.load_model_from_init_py + +tag function + +tag-new(2) + +p + | A helper function to use in the #[code load()] method of a model package's + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + ++aside-code("Example"). + from spacy.util import load_model_from_init_py + + def load(): + return load_model_from_init_py(__file__) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code init_file] + +cell unicode + +cell Path to model's __init__.py, i.e. #[code __file__]. + + +footrow + +cell returns + +cell #[code Language] + +cell #[code Language] class with the loaded model. +h(2, "is_package") util.is_package +tag function @@ -117,16 +148,18 @@ p +cell #[code bool] +cell #[code True] if installed package, #[code False] if not. -+h(2, "get_model_package_path") util.get_model_package_path ++h(2, "get_package_path") util.get_package_path +tag function + +tag-new(2) p - | Get path to a #[+a("/docs/usage/models") model package] installed via pip. - | Currently imports the package to find it and parse its meta data. + | Get path to an installed package. Mainly used to resolve the location of + | #[+a("/docs/usage/models") model packages]. Currently imports the package + | to find its path. +aside-code("Example"). - util.get_model_package_path('en_core_web_sm') - # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0 + util.get_package_path('en_core_web_sm') + # /usr/lib/python3.6/site-packages/en_core_web_sm +table(["Name", "Type", "Description"]) +row @@ -137,37 +170,8 @@ p +footrow +cell returns +cell #[code Path] - +cell Path to model data directory. - -+h(2, "parse_package_meta") util.parse_package_meta - +tag function - -p - | Check if a #[code meta.json] exists in a model package and return its - | contents. - -+aside-code("Example"). - if util.is_package('en_core_web_sm'): - path = util.get_model_package_path('en_core_web_sm') - meta = util.parse_package_meta(path, require=True) - # {'name': 'core_web_sm', 'lang': 'en', ...} - -+table(["Name", "Type", "Description"]) - +row - +cell #[code package_path] - +cell #[code Path] +cell Path to model package directory. - +row - +cell #[code require] - +cell #[code bool] - +cell If #[code True], raise error if no #[code meta.json] is found. - - +footrow - +cell returns - +cell dict / #[code None] - +cell Model meta data or #[code None]. 
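With `parse_package_meta` removed, reading a model package's meta data without loading the model can still be pieced together from the remaining utilities. A rough sketch, assuming the package is installed and keeps its meta.json at the package root:

    from spacy import util

    name = 'en_core_web_sm'   # hypothetical installed model package
    if util.is_package(name):
        meta = util.read_json(util.get_package_path(name) / 'meta.json')
        print(meta['lang'], meta['name'], meta['version'])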
- +h(2, "is_in_jupyter") util.is_in_jupyter +tag function +tag-new(2) From eb703f7656a85fa3a7bf01877edd3b9bfd7f7e7d Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:43 +0200 Subject: [PATCH 110/118] Update API docs --- website/docs/api/_data.json | 3 ++- website/docs/api/spacy.jade | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index f6a6a7e31..2af9bca1b 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -158,7 +158,8 @@ "binder": { "title": "Binder", - "tag": "class" + "tag": "class", + "source": "spacy/tokens/binder.pyx" }, "annotation": { diff --git a/website/docs/api/spacy.jade b/website/docs/api/spacy.jade index f2fcfde2c..a45307378 100644 --- a/website/docs/api/spacy.jade +++ b/website/docs/api/spacy.jade @@ -11,8 +11,13 @@ p | the name of an installed | #[+a("/docs/usage/saving-loading#generating") model package], a unicode | path or a #[code Path]-like object. spaCy will try resolving the load - | argument in this order. The #[code Language] class to initialise will be - | determined based on the model's settings. + | argument in this order. If a model is loaded from a shortcut link or + | package name, spaCy will assume it's a Python package and import it and + | call the model's own #[code load()] method. If a model is loaded from a + | path, spaCy will assume it's a data directory, read the language and + | pipeline settings off the meta.json and initialise the #[code Language] + | class. The data will be loaded in via + | #[+api("language#from_disk") #[code Language.from_disk()]]. +aside-code("Example"). nlp = spacy.load('en') # shortcut link @@ -20,7 +25,7 @@ p nlp = spacy.load('/path/to/en') # unicode path nlp = spacy.load(Path('/path/to/en')) # pathlib Path - nlp = spacy.load('en', disable['parser', 'tagger']) + nlp = spacy.load('en', disable=['parser', 'tagger']) +table(["Name", "Type", "Description"]) +row From 01a7b10319cf8e73a0c88faf8de8f8ecb1426dfa Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:32:54 +0200 Subject: [PATCH 111/118] Add fallback fonts to illustrations --- website/assets/img/docs/architecture.svg | 8 ++++---- website/assets/img/docs/language_data.svg | 6 +++--- website/assets/img/docs/pipeline.svg | 6 +++--- website/assets/img/docs/tokenization.svg | 4 ++-- website/assets/img/docs/vocab_stringstore.svg | 8 ++++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/website/assets/img/docs/architecture.svg b/website/assets/img/docs/architecture.svg index f586b75eb..c1d12d79b 100644 --- a/website/assets/img/docs/architecture.svg +++ b/website/assets/img/docs/architecture.svg @@ -1,9 +1,9 @@ Language diff --git a/website/assets/img/docs/language_data.svg b/website/assets/img/docs/language_data.svg index b74fffba6..31e1a1b29 100644 --- a/website/assets/img/docs/language_data.svg +++ b/website/assets/img/docs/language_data.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/pipeline.svg b/website/assets/img/docs/pipeline.svg index 2ff00d787..8f9dc6dac 100644 --- a/website/assets/img/docs/pipeline.svg +++ b/website/assets/img/docs/pipeline.svg @@ -1,8 +1,8 @@ diff --git a/website/assets/img/docs/tokenization.svg b/website/assets/img/docs/tokenization.svg index cc185a3a7..f5b164725 100644 --- a/website/assets/img/docs/tokenization.svg +++ b/website/assets/img/docs/tokenization.svg @@ -1,7 +1,7 @@ diff --git a/website/assets/img/docs/vocab_stringstore.svg 
b/website/assets/img/docs/vocab_stringstore.svg index f660a8604..644453737 100644 --- a/website/assets/img/docs/vocab_stringstore.svg +++ b/website/assets/img/docs/vocab_stringstore.svg @@ -1,9 +1,9 @@ From 33e332e67ce7163982806dc5b45a97c6de697486 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:57:59 +0200 Subject: [PATCH 112/118] Remove unused export --- spacy/lang/en/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 7b7d4e1bb..7e1da789b 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -35,4 +35,4 @@ class English(Language): Defaults = EnglishDefaults -__all__ = ['English', 'EnglishDefaults'] +__all__ = ['English'] From 84189c1cab1f8534597cbdf740a8ba51ac1d086a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 00:58:59 +0200 Subject: [PATCH 113/118] Add 'xx' language ID for multi-language support Allows models to specify their language ID as 'xx'. --- spacy/lang/xx/__init__.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 spacy/lang/xx/__init__.py diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py new file mode 100644 index 000000000..fef8c9d59 --- /dev/null +++ b/spacy/lang/xx/__init__.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...language import Language +from ...attrs import LANG +from ...util import update_exc + + +class MultiLanguageDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'xx' + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + + +class MultiLanguage(Language): + """Language class to be used for models that support multiple languages. + This module allows models to specify their language ID as 'xx'. + """ + lang = 'xx' + Defaults = MultiLanguageDefaults + + +__all__ = ['MultiLanguage'] From a1d4c97fb7ada8b655292409014d92ab7a6fd9f7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 17:59:00 -0500 Subject: [PATCH 114/118] Improve correctness of minibatching --- spacy/syntax/nn_parser.pyx | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index b7aca26b8..ffd7c8da6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -427,7 +427,7 @@ cdef class Parser: cuda_stream = get_cuda_stream() - states, golds, max_length = self._init_gold_batch(docs, golds) + states, golds, max_steps = self._init_gold_batch(docs, golds) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) todo = [(s, g) for (s, g) in zip(states, golds) @@ -438,6 +438,7 @@ cdef class Parser: backprops = [] d_tokvecs = state2vec.ops.allocate(tokvecs.shape) cdef float loss = 0. 
+ n_steps = 0 while todo: states, golds = zip(*todo) @@ -467,7 +468,8 @@ cdef class Parser: todo = [st for st in todo if not st[0].is_final()] if losses is not None: losses[self.name] += (d_scores**2).sum() - if len(backprops) >= (max_length * 2): + n_steps += 1 + if n_steps >= max_steps: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) @@ -482,7 +484,8 @@ cdef class Parser: StateClass state Transition action whole_states = self.moves.init_batch(whole_docs) - max_length = max(5, min(20, min([len(doc) for doc in whole_docs]))) + max_length = max(5, min(50, min([len(doc) for doc in whole_docs]))) + max_moves = 0 states = [] golds = [] for doc, state, gold in zip(whole_docs, whole_states, whole_golds): @@ -493,16 +496,20 @@ cdef class Parser: start = 0 while start < len(doc): state = state.copy() + n_moves = 0 while state.B(0) < start and not state.is_final(): action = self.moves.c[oracle_actions.pop(0)] action.do(state.c, action.label) + n_moves += 1 has_gold = self.moves.has_gold(gold, start=start, end=start+max_length) if not state.is_final() and has_gold: states.append(state) golds.append(gold) + max_moves = max(max_moves, n_moves) start += min(max_length, len(doc)-start) - return states, golds, max_length + max_moves = max(max_moves, len(oracle_actions)) + return states, golds, max_moves def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): # Tells CUDA to block, so our async copies complete. From eb5a8be9ade339d7c0a9c01e8075c9ee6827f749 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:15:44 +0200 Subject: [PATCH 115/118] Update language overview and add section on 'xx' lang class --- website/docs/api/language-models.jade | 43 +++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 0990de358..74007f228 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -2,7 +2,10 @@ include ../../_includes/_mixins -p spaCy currently supports the following languages and capabilities: +p + | spaCy currently provides models for the following languages and + | capabilities: + +aside-code("Download language models", "bash"). python -m spacy download en @@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities: +row +cell French #[code fr] - each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] + each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] -+h(2, "available") Available models + +row + +cell Spanish #[code es] + each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ] + +cell.u-text-center #[+procon(icon)] -include ../usage/_models-list +p + +button("/docs/usage/models", true, "primary") See available models +h(2, "alpha-support") Alpha tokenization support @@ -52,9 +59,35 @@ p | #[+a("https://github.com/mocobeta/janome") Janome]. 
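These languages can already be used on their own for rule-based tokenization. A minimal sketch, using Finnish as an arbitrary example from the table that follows; no statistical model is involved, only the tokenizer and language data:

    from spacy.lang.fi import Finnish

    nlp = Finnish()   # tokenizer and language data only, no model weights
    doc = nlp(u'Tämä on lause.')
    print([token.text for token in doc])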
+table([ "Language", "Code", "Source" ]) - each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } + each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian BokmΓ₯l", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } +row +cell #{language} +cell #[code=code] +cell +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code} + ++h(2, "multi-language") Multi-language support + +tag-new(2) + +p + | As of v2.0, spaCy supports models trained on more than one language. This + | is especially useful for named entity recognition. The language ID used + | for multi-language or language-neutral models is #[code xx]. The + | language class, a generic subclass containing only the base language data, + | can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx]. + +p + | To load your model with the neutral, multi-language class, simply set + | #[code "language": "xx"] in your + | #[+a("/docs/usage/saving-loading#models-generating") model package]'s + | meta.json. You can also import the class directly, or call + | #[+api("util#get_lang_class") #[code util.get_lang_class()]] for + | lazy-loading. + ++code("Standard import"). + from spacy.lang.xx import MultiLanguage + nlp = MultiLanguage() + ++code("With lazy-loading"). + from spacy.util import get_lang_class + nlp = get_lang_class('xx') From 10d05c2b9274073da0edac0379e3a42d97816992 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 28 May 2017 01:30:12 +0200 Subject: [PATCH 116/118] Fix typos, wording and formatting --- .../docs/usage/_spacy-101/_similarity.jade | 2 +- .../usage/language-processing-pipeline.jade | 2 +- website/docs/usage/spacy-101.jade | 10 ++- website/docs/usage/v2.jade | 85 +++++++++---------- 4 files changed, 49 insertions(+), 50 deletions(-) diff --git a/website/docs/usage/_spacy-101/_similarity.jade b/website/docs/usage/_spacy-101/_similarity.jade index c99bc9658..6eed1eb7f 100644 --- a/website/docs/usage/_spacy-101/_similarity.jade +++ b/website/docs/usage/_spacy-101/_similarity.jade @@ -5,7 +5,7 @@ p | #[strong how similar they are]. Predicting similarity is useful for | building recommendation systems or flagging duplicates. For example, you | can suggest a user content that's similar to what they're currently - | looking at, or label a support ticket as a duplicate, if it's very + | looking at, or label a support ticket as a duplicate if it's very | similar to an already existing one. p diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade index 1392fc2f8..ffad01ead 100644 --- a/website/docs/usage/language-processing-pipeline.jade +++ b/website/docs/usage/language-processing-pipeline.jade @@ -144,7 +144,7 @@ p +table(["Argument", "Type", "Description"]) +row +cell #[code vocab] - +cell #[coce Vocab] + +cell #[code Vocab] +cell | Shared data between components, including strings, morphology, | vectors etc. diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index 8b2d0c17e..6a1f780dc 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -65,7 +65,7 @@ p | spaCy provides a variety of linguistic annotations to give you insights | into a text's grammatical structure. This includes the word types, | i.e. 
the parts of speech, and how the words are related to each other. - | For example, if you're analysing text, it makes a #[em huge] difference + | For example, if you're analysing text, it makes a huge difference | whether a noun is the subject of a sentence, or the object – or whether | "google" is used as a verb, or refers to the website or company in a | specific context. @@ -119,9 +119,11 @@ include _spacy-101/_named-entities +infobox | To learn more about entity recognition in spaCy, how to - | #[strong add your own entities] to a document and how to train and update - | the entity predictions of a model, see the usage guide on - | #[+a("/docs/usage/entity-recognition") named entity recognition]. + | #[strong add your own entities] to a document and how to + | #[strong train and update] the entity predictions of a model, see the + | usage guides on + | #[+a("/docs/usage/entity-recognition") named entity recognition] and + | #[+a("/docs/usage/training-ner") training the named entity recognizer]. +h(2, "vectors-similarity") Word vectors and similarity +tag-model("vectors") diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index 23b234c43..25aae8706 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -20,19 +20,18 @@ p nlp = Language(pipeline=['my_factory', mycomponent]) p - | It's now much easier to customise the pipeline with your own components. - | Components are functions that receive a #[code Doc] object, modify and - | return it. If your component is stateful, you'll want to create a new one - | for each pipeline. You can do that by defining and registering a factory - | which receives the shared #[code Vocab] object and returns a component. - -p - | spaCy's default components – the vectorizer, tagger, parser and entity - | recognizer, can be added to your pipeline by using their string IDs. - | This way, you won't have to worry about finding and implementing them – - | to use the default tagger, simply add #[code "tagger"] to the pipeline, + | It's now much easier to #[strong customise the pipeline] with your own + | components, functions that receive a #[code Doc] object, modify and + | return it. If your component is stateful, you can define and register a + | factory which receives the shared #[code Vocab] object and returns a + |Β  component. spaCy's default components can be added to your pipeline by + | using their string IDs. This way, you won't have to worry about finding + | and implementing them – simply add #[code "tagger"] to the pipeline, | and spaCy will know what to do. ++image + include ../../assets/img/docs/pipeline.svg + +infobox | #[strong API:] #[+api("language") #[code Language]] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] @@ -96,11 +95,10 @@ p | #[code Language] class, or load a model that initialises one. This allows | languages to contain more custom data, e.g. lemmatizer lookup tables, or | complex regular expressions. The language data has also been tidied up - | and simplified. It's now also possible to overwrite the functions that - | compute lexical attributes like #[code like_num], and supply - | language-specific syntax iterators, e.g. to determine noun chunks. spaCy - | now also supports simple lookup-based lemmatization. The data is stored - | in a dictionary mapping a string to its lemma. + | and simplified. spaCy now also supports simple lookup-based lemmatization. 
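The component and factory pattern described in the pipeline section above comes down to plain callables. A rough sketch with made-up names: a component receives a `Doc`, modifies it and returns it, while a factory receives the shared `Vocab` (plus any config) and returns such a component. How a factory is registered under a string ID like 'my_factory' is covered in the processing pipelines usage guide.

    def print_length(doc):
        # stateless component: receive the Doc, inspect or modify it, return it
        print('Doc has %d tokens' % len(doc))
        return doc

    def my_factory(vocab):
        # a factory holds state or config and returns the actual component
        max_length = 10 ** 6
        def component(doc):
            assert len(doc) < max_length
            return doc
        return component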
+ ++image + include ../../assets/img/docs/language_data.svg +infobox | #[strong API:] #[+api("language") #[code Language]] @@ -111,13 +109,10 @@ p +aside-code("Example"). from spacy.matcher import Matcher - from spacy.attrs import LOWER, IS_PUNCT matcher = Matcher(nlp.vocab) - matcher.add('HelloWorld', None, - [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}], - [{LOWER: 'hello'}, {LOWER: 'world'}]) + matcher.add('HEARTS', None, [{'ORTH': '❀️', 'OP': '+'}]) assert len(matcher) == 1 - assert 'HelloWorld' in matcher + assert 'HEARTS' in matcher p | Patterns can now be added to the matcher by calling @@ -157,28 +152,8 @@ p +cell #[+api("language#to_disk") #[code Language.to_disk]] +row - +cell #[code Tokenizer.load] - +cell - | #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]] - | #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]] - - +row - +cell #[code Tagger.load] - +cell - | #[+api("tagger#from_disk") #[code Tagger.from_disk]] - | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] - - +row - +cell #[code DependencyParser.load] - +cell - | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] - | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] - - +row - +cell #[code EntityRecognizer.load] - +cell - | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] - | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +cell #[code Language.create_make_doc] + +cell #[+api("language#attributes") #[code Language.tokenizer]] +row +cell @@ -212,6 +187,28 @@ p | #[+api("stringstore#to_disk") #[code StringStore.to_disk]] | #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]] + +row + +cell #[code Tokenizer.load] + +cell - + + +row + +cell #[code Tagger.load] + +cell + | #[+api("tagger#from_disk") #[code Tagger.from_disk]] + | #[+api("tagger#from_bytes") #[code Tagger.from_bytes]] + + +row + +cell #[code DependencyParser.load] + +cell + | #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]] + | #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]] + + +row + +cell #[code EntityRecognizer.load] + +cell + | #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]] + | #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]] + +row +cell #[code Matcher.load] +cell - @@ -232,7 +229,7 @@ p +row +cell #[code Doc.read_bytes] - +cell + +cell #[+api("binder") #[code Binder]] +row +cell #[code Token.is_ancestor_of] From b082f764944a1e5ebc2e9f5e7b44a48221cbbe6c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 18:32:21 -0500 Subject: [PATCH 117/118] Randomize pipeline order during training --- spacy/language.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 7adae0ed5..e874dbb78 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -215,7 +215,9 @@ class Language(object): grads = {} def get_grads(W, dW, key=None): grads[key] = (W, dW) - for proc in self.pipeline[1:]: + pipes = list(self.pipeline[1:]) + random.shuffle(pipes) + for proc in pipes: if not hasattr(proc, 'update'): continue tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) From 9e711c34761ef9d160651a453ce574b72dcc535b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 27 May 2017 18:32:46 -0500 Subject: [PATCH 118/118] Divide d_loss by batch size --- spacy/pipeline.pyx | 2 ++ spacy/syntax/nn_parser.pyx | 2 +- 2 files changed, 3 insertions(+), 
1 deletion(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 98b79d709..9abb70b40 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -228,6 +228,7 @@ class NeuralTagger(object): idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores /= d_scores.shape[0] loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -292,6 +293,7 @@ class NeuralLabeller(NeuralTagger): idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) + d_scores /= d_scores.shape[0] loss = (d_scores**2).sum() d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ffd7c8da6..320f3c620 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -450,7 +450,7 @@ cdef class Parser: scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores, sgd=sgd) + d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) if drop != 0: d_vector *= mask
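A common thread in this last patch is that the error gradient is now divided by the batch size, so each update is a mean over the examples rather than a sum and its magnitude no longer grows with larger batches. A rough numpy illustration of the effect (not spaCy code):

    import numpy

    batch = numpy.random.rand(8, 50)        # 8 examples, 50 classes
    truth = numpy.zeros_like(batch)
    truth[:, 0] = 1.
    big_batch = numpy.tile(batch, (4, 1))   # same examples, 4x the batch size
    big_truth = numpy.tile(truth, (4, 1))

    d_small = (batch - truth) / batch.shape[0]
    d_big = (big_batch - big_truth) / big_batch.shape[0]
    # summed over the batch, both gradients now have the same magnitude
    print(abs(d_small.sum(axis=0) - d_big.sum(axis=0)).max())   # ~0.0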