Update tutorials

2025-08-24 05:54:55 +03:00 · 2016-10-19 01:23:54 +02:00 · 2016-10-19 01:23:54 +02:00 · da0985114d
commit da0985114d
parent 4f41a065fe
6 changed files with 255 additions and 7 deletions
--- a/website/_layout.jade
+++ b/website/_layout.jade
@ -45,7 +45,7 @@ html(lang="en")
            if sidebar
                include _includes/_sidebar

-            main.o-content(class="#{(sidebar) ? 'o-content--sidebar' : '' } #{((current.path[0] == 'docs' && asides != false) || asides) ? 'o-content--asides' : '' }")
+            main.o-content(class="#{(sidebar) ? 'o-content--sidebar' : '' } #{((current.path[0] == 'docs' && asides != false) || asides) ? 'o-content--asides' : '' } #{(current.path[1] == 'tutorials') ? 'o-content--article' : '' }")
                if current.path[1] == "tutorials"
                    +h(1)=title

--- a/website/assets/css/_base/_layout.sass
+++ b/website/assets/css/_base/_layout.sass
@ -24,7 +24,10 @@ body
 //- Paragraphs

 p
-    @extend .u-text-regular, .o-block, .has-aside
+    @extend .o-block, .u-text-regular, .has-aside
+
+    .o-content--article &:not([class])
+        @extend .u-text-medium


 //- Links
--- a/website/docs/tutorials/_data.json
+++ b/website/docs/tutorials/_data.json
@ -1,34 +1,47 @@
 {
+    "training": {
+        "title": "Training the tagger, entity recogniser and parser",
+        "date": "2016-10-17",
+        "description": "This tutorial describes how to train new statistical models for spaCy's part-of-speech tagger, named entity recognizer and dependency parser."
+    },
+
+    "custom-pipelines": {
+        "title": "Custom Pipelines",
+        "date": "2016-10-17",
+        "description": "spaCy 1.0 introduces dynamic pipelines, so that you can easily create custom workflows. This tutorial describes the feature, and introduces experimental support for dynamic Token attributes. The tutorial also discusses how we can make it easier to use bidirectional LSTMs with spaCy."
+    },
+
+    "rule-based-matcher": {
+        "title": "Rule-based Matcher",
+        "date": "2016-10-17",
+        "description": "spaCy features a rule-matching engine that operates over tokens. The rules can refer to token annotations and flags, and matches support callbacks to accept, modify and/or act on the match. The rule matcher also allows you to associate patterns with entity IDs, to allow some basic entity linking or disambiguation."
+    },
+
    "load-new-word-vectors": {
-        "template": "article",
        "title": "Load new word vectors",
        "date": "2015-09-24",
        "description": "Word vectors allow simple similarity queries, and drive many NLP applications. This tutorial explains how to load custom word vectors into spaCy, to make use of task or data-specific representations."
    },

    "byo-annotations": {
-        "template": "article",
        "title": "Using Pre-existing Tokenization, Tags, and Other Annotations",
        "date": "2016-04-15",
        "description": "spaCy assumes by default that your data is raw text. However, sometimes your data is partially annotated, e.g. with pre-existing tokenization, part-of-speech tags, etc. This tutorial explains how to use these annotations in spaCy."
    },

    "mark-adverbs": {
-        "template": "article",
        "title": "Mark all adverbs, particularly for verbs of speech",
        "date": "2015-08-18",
        "description": "Let's say you're developing a proofreading tool, or possibly an IDE for writers.  You're convinced by Stephen King's advice that adverbs are not your friend so you want to highlight all adverbs."
    },

    "syntax-search": {
-        "template": "article",
        "title": "Search Reddit for comments about Google doing something",
        "date": "2015-08-18",
        "description": "Example use of the spaCy NLP tools for data exploration. Here we will look for Reddit comments that describe Google doing something, i.e. discuss the company's actions. This is difficult, because other senses of \"Google\" now dominate usage of the word in conversation, particularly references to using Google products."
    },

    "twitter-filter": {
-        "template": "article",
        "title": "Finding Relevant Tweets",
        "date": "2015-08-18",
        "description": "In this tutorial, we will use word vectors to search for tweets about Jeb Bush. We'll do this by building up two word lists: one that represents the type of meanings in the Jeb Bush tweets, and another to help screen out irrelevant tweets that mention the common, ambiguous word \"bush\"."
--- a/website/docs/tutorials/custom-pipelines.jade
+++ b/website/docs/tutorials/custom-pipelines.jade
@ -0,0 +1,89 @@
+include ../../_includes/_mixins
+
+p.u-text-large spaCy 1.0 introduces dynamic pipelines, so that you can easily create custom workflows. This tutorial describes the feature, and introduces experimental support for dynamic Token attributes. The tutorial also discusses how we can make it easier to use bidirectional LSTMs with spaCy.
+
+p Best practices in NLP are now already pretty different from when I first designed spaCy, even though it's only been two years. The spaCy 1.0 release has a new custom pipeline API to help you use the new hotness.
+
+p Before 1.0, spaCy's pipeline was hard-coded. When you called #[code nlp(text)], spaCy would apply the tokenizer, tagger, parser and named entity recognizer, in sequence. This design assumed that users should subclass the #[code Language] class to customize the pipeline. However, the #[code Language] class has gotten more complicated, and subclassing it now feels like a relatively "serious" thing to do. It feels hard.
+
+p In spaCy 1.0, the order of operations is no longer hard-coded. Instead, the new #[code Language.__call__] does something like this:
+
+code.
+    def __call__(self, text):
+        doc = self.make_doc(text)
+        for process in self.pipeline:
+            process(doc)
+        return doc
+
+p The pipeline can consist of any sequence of callables. They should accept a Doc object, and modify it in-place. You can install the pipeline by passing a callable to the #[code spacy.load()] function, or the constructor of the #[code Language] class:
+
+code("python", "Basic Example").
+    import spacy
+
+    def arbitrary_fixup_rules(doc):
+        for token in doc:
+            if token.text == u'bill' and token.tag_ == u'NNP':
+                token.tag_ = u'NN'
+
+    def custom_pipeline(nlp):
+        return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)
+
+    nlp = spacy.load('en', pipeline=custom_pipeline)
+
+
+p The value passed to the #[code pipeline] keyword should be a callable that takes the #[code Language] instance (i.e. #[code nlp]) as an argument. The callable should return a sequence of callables. Each member of the sequence should take a Doc object as its sole positional argument.
+
+h(2, "experimental-lstm") Experimental: Bidirectional LSTM with custom pipeline
+
+p Probably the most important new technology in Natural Language Processing is the rise of bidirectional LSTM models. These models associate each word with a #[em context-specific] vector. You can also neatly include character level features, so that all relevant aspects of the word are captured. This is pretty much the best way to do feature extraction in NLP at the moment, for almost any task.
+
+p spaCy doesn't feature any pre-trained LSTM models yet, and the details of this API are still being refined. But, because BiLSTMs are proving so important, I wanted to get the proposal up.
+
+p Version 1.0 adds an attribute #[code tensor] to the #[code Doc] object. The #[code tensor] attribute expects a numpy ndarray object, and is publicly writeable. This gives you a place to store the output of the LSTM (or some other real-valued output you want to keep).
+
+code("python", "Basic Example").
+    import spacy
+    from spacy.symbols import LEMMA, TAG
+
+    class LSTMModel(object):
+        def __init__(self, **kwargs):
+            # Load your weights etc
+            pass
+
+        def __call__(self, doc):
+            features = doc.to_array([LEMMA, TAG])
+            doc.tensor = lstm(features)
+
+    def custom_pipeline(nlp):
+        return (nlp.tagger, LSTMModel(), nlp.parser, nlp.entity)
+
+    nlp = spacy.load('en', pipeline=custom_pipeline)
+
+p Now, so far we only have the LSTM output as an attribute of the #[code Doc] object. We'd like to be able to do stuff like #[code doc[0].vector], and have that get us the LSTM vector for the token. We can do #[code doc.tensor[doc[0].i]], but I'd like a little more sugar. The details of this part are still experimental — in particular, don't take the names too seriously at this point.
+
+p A relevant implementation detail of spaCy is that the #[code Token] objects are thin proxies, that can be created and destroyed as convenient. The #[code Doc] object owns all the data. This means that we can't simply assign a vector to the #[code Token] objects. Instead, we'll add a hook that gets called by #[code token.vector]. We'll also add space for hooks in other places we might need them.
+
+aside("Why don't Token and Span own their data?") Well, we want the sequence of tokens to be stored together in memory. That means we really want to have a sequence owned by the #[code Doc] object. But if we have that, then we would have to copy data to the #[code Token] objects. This gets super messy, especially if the tokens should be able to modify their state. The Token therefore proxies to the Doc, to maintain a single source of truth.
+
+p Here's what that looks like:
+
+code.
+    def install_vector_hook(doc):
+        doc.getters_for_tokens['vector'] = lambda token: doc.tensor[token.i]
+
+    def custom_pipeline(nlp):
+        return (nlp.tagger, LSTMModel(), install_vector_hook, nlp.parser, nlp.entity)
+
+    nlp = spacy.load('en', pipeline=custom_pipeline)
+
+p The #[code install_vector_hook] function will run after the LSTM. It modifies the #[code Doc], setting a value in a dictionary that the #[code Token] knows to look for. When you access the #[code token.vector] property, the token checks whether there's a special-case listener for that attribute:
+
+code.
+    @property
+    def vector(self):
+        if 'vector' in self.doc.getters_for_tokens:
+            return self.doc.getters_for_tokens['vector'](self)
+        else:
+            return self.c.lex.vector
+
+p As I said — don't take the names too seriously at this point. But do test out the feature — it should be all working. You should be able to customize the behaviour of a lot of attributes this way already. Possibly we should just make it everything on the Token and the Span, but I think it might not be nice to have so much uncertainty about how some values are being calculated. There's such a thing as being too dynamic.
--- a/website/docs/tutorials/rule-based-matcher.jade
+++ b/website/docs/tutorials/rule-based-matcher.jade
@ -0,0 +1,61 @@
+include ../../_includes/_mixins
+
+p.u-text-large spaCy features a rule-matching engine that operates over tokens. The rules can refer to token annotations and flags, and matches support callbacks to accept, modify and/or act on the match. The rule matcher also allows you to associate patterns with entity IDs, to allow some basic entity linking or disambiguation.
+
+code("python", "Matcher Example").
+    from spacy.matcher import Matcher
+    from spacy.attrs import ORTH, TAG
+    import spacy
+
+    nlp = spacy.load('en', parser=False, entity=False)
+
+    matcher = Matcher(nlp.vocab)
+
+    matcher.add_entity(
+        "GoogleNow", # Entity ID -- Helps you act on the match.
+        {"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
+        acceptor=None, # Accept or modify the match
+        on_match=merge_phrases # Callback to act on the matches (user-defined; must be defined before this call)
+    )
+    matcher.add_pattern(
+        "GoogleNow", # Entity ID -- Created if doesn't exist.
+        [ # The pattern is a list of *Token Specifiers*.
+            { # This Token Specifier matches tokens whose orth field is "Google"
+              ORTH: "Google"
+            },
+            { # This Token Specifier matches tokens whose orth field is "Now"
+              ORTH: "Now"
+            }
+        ],
+        label=None # Can associate a label to the pattern-match, to handle it better.
+    )
+    doc = nlp(u"I prefer Siri to Google Now.")
+    matches = matcher(doc)
+    for ent_id, label, start, end in matches:
+        print(nlp.strings[ent_id], nlp.strings[label], doc[start : end].text)
+        entity = matcher.get_entity(ent_id)
+        print(entity)
+
+    matcher.add_pattern(
+        "GoogleNow",
+        [ # This Surface Form matches "google now", verbatim, and requires
+          # "google" to have the NNP tag. This helps prevent the pattern from
+          # matching cases like "I will google now to look up the time"
+          {
+            ORTH: "google",
+            TAG: "NNP"
+          },
+          {
+            ORTH: "now"
+          }
+        ]
+    )
+
+    doc = nlp(u"I'll google now to find out how the google now service works.")
+    matches = matcher(doc)
+    for ent_id, label, start, end in matches:
+        print(ent_id, label, start, end, doc[start : end].text)
+    # Because we specified the on_match=merge_phrases callback,
+    # we should see 'google now' as a single token.
+    for token in doc:
+        print(token.text, token.lemma_, token.tag_, token.ent_type_)
--- a/website/docs/tutorials/training.jade
+++ b/website/docs/tutorials/training.jade
@ -0,0 +1,82 @@
+include ../../_includes/_mixins
+
+p.u-text-large This tutorial describes how to train new statistical models for spaCy's part-of-speech tagger, named entity recognizer and dependency parser.
+
+p I'll start with some quick code examples, that describe how to train each model. I'll then provide a bit of background about the algorithms, and explain how the data and feature templates work.
+
+h(2, "train-pos-tagger") Training the part-of-speech tagger
+
+code('python', 'Simple Example').
+    from spacy.vocab import Vocab
+    from spacy.pipeline import Tagger
+    from spacy.tokens import Doc
+
+    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
+    tagger = Tagger(vocab)
+
+    doc = Doc(vocab, words=['I', 'like', 'stuff'])
+    tagger.update(doc, ['N', 'V', 'N'])
+
+    tagger.model.end_training()
+
+p #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/training/train_tagger.py") Full example]
+
+h(2, "train-entity") Training the named entity recognizer
+
+code('python', 'Simple Example').
+    from spacy.vocab import Vocab
+    from spacy.pipeline import EntityRecognizer
+    from spacy.tokens import Doc
+
+    vocab = Vocab()
+    entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC'])
+
+    doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
+    entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O'])
+
+    entity.model.end_training()
+
+p #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/training/train_ner.py") Full example]
+
+h(2, "train-parser") Training the dependency parser
+
+code('python', 'Simple Example').
+    from spacy.vocab import Vocab
+    from spacy.pipeline import DependencyParser
+    from spacy.tokens import Doc
+
+    vocab = Vocab()
+    parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct'])
+
+    doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?'])
+    parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'),
+                        (1, 'punct')])
+
+    parser.model.end_training()
+
+p #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/training/train_parser.py") Full example]
+
+h(2, 'feature-templates') Customising the feature extraction
+
+p spaCy currently uses linear models for the tagger, parser and entity recognizer, with weights learned using the #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
+
+p Because it's a linear model, it's important for accuracy to build conjunction features out of the atomic predictors. Let's say you have two atomic predictors asking, "What is the part-of-speech of the previous token?", and "What is the part-of-speech of the previous previous token?". These predictors will introduce a number of features, e.g. "Prev-pos=NN", "Prev-prev-pos=VBZ", etc. A conjunction template introduces features such as "Prev-pos=NN&Prev-prev-pos=VBZ".
+
+p The feature extraction proceeds in two passes. In the first pass, we fill an array with the values of all of the atomic predictors. In the second pass, we iterate over the feature templates, and fill a small temporary array with the predictors that will be combined into a conjunction feature. Finally, we hash this array into a 64-bit integer, using the MurmurHash algorithm. You can see this at work in the #[+a("https://github.com/" + SOCIAL.github + "/thinc/blob/94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb/thinc/linear/features.pyx") thinc.linear.features] module.
+
+p It's very easy to change the feature templates, to create novel combinations of the existing atomic predictors. There's currently no API available to add new atomic predictors, though. You'll have to create a subclass of the model, and write your own #[code set_featuresC] method.
+
+p The feature templates are passed in using the #[code features] keyword argument to the constructors of the Tagger, DependencyParser and EntityRecognizer:
+
+code('python', 'custom tagger templates').
+    from spacy.vocab import Vocab
+    from spacy.pipeline import Tagger
+    from spacy.tagger import P2_orth, P1_orth
+    from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
+
+    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
+    tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
+                                     (P2_orth,), (P1_orth,), (W_orth,),
+                                     (N1_orth,), (N2_orth,)])
+
+p Custom feature templates can be passed to the DependencyParser and EntityRecognizer as well, also using the #[code features] keyword argument of the constructor.