From 5f661a1b3a509626ddaba55e95956d1a8968a974 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 19:48:33 +0100 Subject: [PATCH 1/5] Remove tensorizer from pre-set pipe_names --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 01ffd07bf..7b9bda805 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -67,7 +67,7 @@ class BaseDefaults(object): infix_finditer=infix_finditer, token_match=token_match) - pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] + pipe_names = ['tagger', 'parser', 'ner'] token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) From 12954ab218809a67200e5a108d2071d03060f21b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 19:49:04 +0100 Subject: [PATCH 2/5] Don't document the tensorizer for now --- website/api/_data.json | 9 ---- website/api/tensorizer.jade | 6 --- website/api/textcategorizer.jade | 13 +++--- website/assets/img/pipeline.svg | 8 ++-- .../_processing-pipelines/_pipelines.jade | 12 +++--- website/usage/_spacy-101/_architecture.jade | 6 --- website/usage/_spacy-101/_pipelines.jade | 42 +++++++++---------- .../usage/_vectors-similarity/_basics.jade | 5 +-- 8 files changed, 39 insertions(+), 62 deletions(-) delete mode 100644 website/api/tensorizer.jade diff --git a/website/api/_data.json b/website/api/_data.json index 886404c99..9d447570f 100644 --- a/website/api/_data.json +++ b/website/api/_data.json @@ -17,7 +17,6 @@ "Pipeline": { "Language": "language", "Pipe": "pipe", - "Tensorizer": "tensorizer", "Tagger": "tagger", "DependencyParser": "dependencyparser", "EntityRecognizer": "entityrecognizer", @@ -180,14 +179,6 @@ "source": "spacy/pipeline.pyx" }, - "tensorizer": { - "title": "Tensorizer", - "teaser": "Add a tensor with position-sensitive meaning representations to a document.", - "tag": "class", - "tag_new": 2, - "source": "spacy/pipeline.pyx" - }, - "goldparse": { "title": 
"GoldParse", "tag": "class", diff --git a/website/api/tensorizer.jade b/website/api/tensorizer.jade deleted file mode 100644 index cc79f36e3..000000000 --- a/website/api/tensorizer.jade +++ /dev/null @@ -1,6 +0,0 @@ -//- 💫 DOCS > API > TENSORIZER - -include ../_includes/_mixins - -//- This class inherits from Pipe, so this page uses the template in pipe.jade. -!=partial("pipe", { subclass: "Tensorizer", pipeline_id: "tensorizer" }) diff --git a/website/api/textcategorizer.jade b/website/api/textcategorizer.jade index a9684b15d..0f81b56eb 100644 --- a/website/api/textcategorizer.jade +++ b/website/api/textcategorizer.jade @@ -7,14 +7,13 @@ p | labels. You can change the model architecture rather easily, but by | default, the #[code TextCategorizer] class uses a convolutional | neural network to assign position-sensitive vectors to each word in the - | document. This step is similar to the #[+api("tensorizer") #[code Tensorizer]] - | component, but the #[code TextCategorizer] uses its own CNN model, to + | document. The #[code TextCategorizer] uses its own CNN model, to | avoid sharing weights with the other pipeline components. The document - | tensor is then - | summarized by concatenating max and mean pooling, and a multilayer - | perceptron is used to predict an output vector of length #[code nr_class], - | before a logistic activation is applied elementwise. The value of each - | output neuron is the probability that some class is present. + | tensor is then summarized by concatenating max and mean pooling, and a + | multilayer perceptron is used to predict an output vector of length + | #[code nr_class], before a logistic activation is applied elementwise. + | The value of each output neuron is the probability that some class is + | present. //- This class inherits from Pipe, so this page uses the template in pipe.jade. 
!=partial("pipe", { subclass: "TextCategorizer", short: "textcat", pipeline_id: "textcat" }) diff --git a/website/assets/img/pipeline.svg b/website/assets/img/pipeline.svg index 1ff5923cb..1145dbfb3 100644 --- a/website/assets/img/pipeline.svg +++ b/website/assets/img/pipeline.svg @@ -18,13 +18,13 @@ tokenizer - tensorizer + tagger - tagger + parser - parser + ner - ner + ... diff --git a/website/usage/_processing-pipelines/_pipelines.jade b/website/usage/_processing-pipelines/_pipelines.jade index 3c1c28af1..845571f2b 100644 --- a/website/usage/_processing-pipelines/_pipelines.jade +++ b/website/usage/_processing-pipelines/_pipelines.jade @@ -2,7 +2,7 @@ p | spaCy makes it very easy to create your own pipelines consisting of - | reusable components – this includes spaCy's default tensorizer, tagger, + | reusable components – this includes spaCy's default tagger, | parser and entity regcognizer, but also your own custom processing | functions. A pipeline component can be added to an already existing | #[code nlp] object, specified when initialising a #[code Language] class, @@ -49,9 +49,9 @@ p nlp = spacy.load('en') p - | ... the model tells spaCy to use the language #[code "en"] and the pipeline - | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will - | then initialise #[code spacy.lang.en.English], and create each pipeline + | ... the model tells spaCy to use the language #[code "en"] and the + | pipeline #[code.u-break ["tagger", "parser", "ner"]]. spaCy will then + | initialise #[code spacy.lang.en.English], and create each pipeline | component and add it to the processing pipeline. It'll then load in the | model's data from its data ditectory and return the modified | #[code Language] class for you to use as the #[code nlp] object. @@ -72,7 +72,7 @@ p +code("spacy.load under the hood"). 
lang = 'en' - pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] + pipeline = ['tagger', 'parser', 'ner'] data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() @@ -120,7 +120,7 @@ p +code. nlp = spacy.load('en', disable['parser', 'tagger']) - nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) + nlp = English().from_disk('/model', disable=['ner']) p | You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] diff --git a/website/usage/_spacy-101/_architecture.jade b/website/usage/_spacy-101/_architecture.jade index 1a3ed05a3..9c6255420 100644 --- a/website/usage/_spacy-101/_architecture.jade +++ b/website/usage/_spacy-101/_architecture.jade @@ -60,12 +60,6 @@ p +cell #[+api("pipe") #[code Pipe]] +cell Base class for processing pipeline components. - +row - +cell #[+api("tensorizer") #[code Tensorizer]] - +cell - | Add tensors with position-sensitive meaning representations to - | #[code Doc] objects. - +row +cell #[+api("tagger") #[code Tagger]] +cell Annotate part-of-speech tags on #[code Doc] objects. diff --git a/website/usage/_spacy-101/_pipelines.jade b/website/usage/_spacy-101/_pipelines.jade index 4e9cd8aeb..d1e5453a7 100644 --- a/website/usage/_spacy-101/_pipelines.jade +++ b/website/usage/_spacy-101/_pipelines.jade @@ -5,10 +5,9 @@ p | produce a #[code Doc] object. The #[code Doc] is then processed in several | different steps – this is also referred to as the | #[strong processing pipeline]. The pipeline used by the - | #[+a("/models") default models] consists of a - | tensorizer, a tagger, a parser and an entity recognizer. Each pipeline - | component returns the processed #[code Doc], which is then passed on to - | the next component. + | #[+a("/models") default models] consists of a tagger, a parser and an + | entity recognizer. Each pipeline component returns the processed + | #[code Doc], which is then passed on to the next component. 
+graphic("/assets/img/pipeline.svg") include ../../assets/img/pipeline.svg @@ -21,43 +20,45 @@ p +table(["Name", "Component", "Creates", "Description"]) +row - +cell tokenizer + +cell #[strong tokenizer] +cell #[+api("tokenizer") #[code Tokenizer]] +cell #[code Doc] +cell Segment text into tokens. +row("divider") - +cell tensorizer - +cell #[+api("tensorizer") Tensorizer] - +cell #[code Doc.tensor] - +cell Create feature representation tensor for #[code Doc]. - - +row - +cell tagger + +cell #[strong tagger] +cell #[+api("tagger") #[code Tagger]] +cell #[code Doc[i].tag] +cell Assign part-of-speech tags. +row - +cell parser + +cell #[strong parser] +cell #[+api("dependencyparser") #[code DependencyParser]] +cell - | #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents], + | #[code Doc[i].head], + | #[code Doc[i].dep], + | #[code Doc.sents], | #[code Doc.noun_chunks] +cell Assign dependency labels. +row - +cell ner + +cell #[strong ner] +cell #[+api("entityrecognizer") #[code EntityRecognizer]] +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type] +cell Detect and label named entities. +row - +cell textcat + +cell #[strong textcat] +cell #[+api("textcategorizer") #[code TextCategorizer]] +cell #[code Doc.cats] +cell Assign document labels. + +row("divider") + +cell #[strong ...] + +cell #[+a("/usage/processing-pipelines#custom-components") custom components] + +cell #[code Doc._.xxx], #[code Token._.xxx], #[code Span._.xxx] + +cell Assign custom attributes, methods or properties. + p | The processing pipeline always #[strong depends on the statistical model] | and its capabilities. For example, a pipeline can only include an entity @@ -66,17 +67,16 @@ p | in its meta data, as a simple list containing the component names: +code(false, "json"). 
- "pipeline": ["tensorizer", "tagger", "parser", "ner"] + "pipeline": ["tagger", "parser", "ner"] p | Although you can mix and match pipeline components, their | #[strong order and combination] is usually important. Some components may - | require certain modifications on the #[code Doc] to process it. For - | example, the default pipeline first applies the tensorizer, which - | pre-processes the doc and encodes its internal + | require certain modifications on the #[code Doc] to process it. As the + | processing pipeline is applied, spaCy encodes the document's internal | #[strong meaning representations] as an array of floats, also called a | #[strong tensor]. This includes the tokens and their context, which is - | required for the next component, the tagger, to make predictions of the + | required for the first component, the tagger, to make predictions of the | part-of-speech tags. Because spaCy's models are neural network models, | they only "speak" tensors and expect the input #[code Doc] to have | a #[code tensor]. diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index 300680331..07ad6bcd4 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -20,9 +20,8 @@ p | Aside from spaCy's built-in word vectors, which were trained on a lot of | text with a wide vocabulary, the parsing, tagging and NER models also | rely on vector representations of the #[strong meanings of words in context]. - | As the first component of the - | #[+a("/usage/processing-pipelines") processing pipeline], the - | tensorizer encodes a document's internal meaning representations as an + | As the #[+a("/usage/processing-pipelines") processing pipeline] is + | applied, spaCy encodes a document's internal meaning representations as an | array of floats, also called a tensor. This allows spaCy to make a | reasonable guess at a word's meaning, based on its surrounding words. 
| Even if a word hasn't been seen before, spaCy will know #[em something] From 9b4c38fe9f0fb7fe36f61f24c3d74b75c81d7630 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 19:49:27 +0100 Subject: [PATCH 3/5] Add button option to terminal component --- website/_includes/_mixins.jade | 5 ++++- website/assets/css/_components/_misc.sass | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 615160023..3c8493d57 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -624,7 +624,7 @@ mixin qs(data, style) //- Terminal-style code window label - [string] title displayed in top bar of terminal window -mixin terminal(label) +mixin terminal(label, button_text, button_url) .x-terminal .x-terminal__icons: span .u-padding-small.u-text-label.u-text-center=label @@ -632,6 +632,9 @@ mixin terminal(label) +code.x-terminal__code block + if button_text && button_url + +button(button_url, true, "primary", "small").x-terminal__button=button_text + //- Landing diff --git a/website/assets/css/_components/_misc.sass b/website/assets/css/_components/_misc.sass index 8167c94b2..c09fdf79a 100644 --- a/website/assets/css/_components/_misc.sass +++ b/website/assets/css/_components/_misc.sass @@ -6,6 +6,7 @@ padding: $border-radius border-radius: 1em width: 100% + position: relative .x-terminal__icons position: absolute @@ -39,3 +40,13 @@ width: 100% max-width: 100% white-space: pre-wrap + + +.x-terminal__button.x-terminal__button + @include position(absolute, bottom, right, 2.65rem, 2.6rem) + background: $color-dark + border-color: $color-dark + + &:hover + background: darken($color-dark, 5) + border-color: darken($color-dark, 5) From 5dd0d6a383020b7ca87c8582e0dc28e094c36523 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 19:49:36 +0100 Subject: [PATCH 4/5] Update lightning tour --- website/index.jade | 14 ++++++++------ 
website/usage/_spacy-101/_lightning-tour.jade | 3 +-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/website/index.jade b/website/index.jade index 1abe5a984..79a6dd76d 100644 --- a/website/index.jade +++ b/website/index.jade @@ -54,7 +54,7 @@ include _includes/_mixins .o-content +grid +grid-col("two-thirds") - +terminal("lightning_tour.py"). + +terminal("lightning_tour.py", "More examples", "/usage/spacy-101#lightning-tour"). # Install: pip install spacy && spacy download en import spacy @@ -65,16 +65,18 @@ include _includes/_mixins text = open('war_and_peace.txt').read() doc = nlp(text) - # Hook in your own deep learning models - similarity_model = load_my_neural_network() - def install_similarity(doc): - doc.user_hooks['similarity'] = similarity_model - nlp.pipeline.append(install_similarity) + # Find named entities, phrases and concepts + for entity in doc.ents: + print(entity.text, entity.label_) + # Determine semantic similarities doc1 = nlp(u'the fries were gross') doc2 = nlp(u'worst fries ever') doc1.similarity(doc2) + # Hook in your own deep learning models + nlp.add_pipe(load_my_model(), before='parser') + +grid-col("third") +h(2) Features +list diff --git a/website/usage/_spacy-101/_lightning-tour.jade b/website/usage/_spacy-101/_lightning-tour.jade index acf423c48..9e5da35e1 100644 --- a/website/usage/_spacy-101/_lightning-tour.jade +++ b/website/usage/_spacy-101/_lightning-tour.jade @@ -2,8 +2,7 @@ p | The following examples and code snippets give you an overview of spaCy's - | functionality and its usage. If you're new to spaCy, make sure to check - | out the #[+a("/usage/spacy-101") spaCy 101 guide]. + | functionality and its usage. 
+h(3, "lightning-tour-models") Install models and process text From e5a4c31bb42d998b3f8d88616552fd55f0b988b3 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 19:49:42 +0100 Subject: [PATCH 5/5] Adjust code line height --- website/assets/css/_components/_code.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index eaf0980e1..0fec230c0 100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -34,7 +34,7 @@ .c-code-block__content display: block - font: normal normal 1.1rem/#{2} $font-code + font: normal normal 1.1rem/#{1.9} $font-code padding: 1em 2em &[data-prompt]:before,