spaCy/website/docs/usage/lightning-tour.jade

//- 💫 DOCS > USAGE > LIGHTNING TOUR

include ../../_includes/_mixins

p
    |  The following examples and code snippets give you an overview of spaCy's
    |  functionality and its usage. If you're new to spaCy, make sure to check
    |  out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].

+h(2, "models") Install models and process text

+code(false, "bash").
    python -m spacy download en
    python -m spacy download de

+code.
    import spacy
    # Load the English model downloaded above and run it over a text
    nlp = spacy.load('en')
    doc = nlp(u'Hello, world. Here are two sentences.')

    # The German model is loaded the same way, under its own name
    nlp_de = spacy.load('de')
    doc_de = nlp_de(u'Ich bin ein Berliner.')

+infobox
    |  #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
    |  #[strong Usage:] #[+a("/docs/usage/models") Models],
    |  #[+a("/docs/usage/spacy-101") spaCy 101]

+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
    +tag-model("dependency parse")

+code.
    doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
              u"emoji. It's outranking eggplant 🍑 ")

    # Tokens are accessed by index, spans of tokens by slice
    assert doc[0].text == u'Peach'
    assert doc[1].text == u'emoji'
    assert doc[-1].text == u'🍑'
    assert doc[17:19].text == u'outranking eggplant'
    # noun_chunks is iterated lazily, so turn it into a list before indexing
    assert list(doc.noun_chunks)[0].text == u'Peach emoji'

    sentences = list(doc.sents)
    assert len(sentences) == 3
    # The text contains three sentences; this one is the second
    assert sentences[1].text == u'Peach is the superior emoji.'

+infobox
    |  #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]

+h(2, "examples-pos-tags") Get part-of-speech tags and flags
    +tag-model("tagger")

+code.
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    apple = doc[0]
    # Attributes ending in an underscore give the string value, the
    # attribute without the underscore gives the matching integer ID
    assert [apple.pos_, apple.pos] == [u'PROPN', 94]
    assert [apple.tag_, apple.tag] == [u'NNP', 475]
    assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
    # Boolean flags describing the token
    assert apple.is_alpha == True
    assert apple.is_punct == False

    # "billion" contains no digits, but still resembles a number
    billion = doc[10]
    assert billion.is_digit == False
    assert billion.like_num == True
    assert billion.like_email == False

+infobox
    |  #[strong API:] #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]

+h(2, "examples-integer-ids") Use integer IDs for any string

+code.
    # Take the first token ("Hello") of a processed text to compare against
    token = nlp(u'Hello, world.')[0]
    # The string store maps a string to its integer ID and back again
    hello_id = nlp.vocab.strings['Hello']
    hello_str = nlp.vocab.strings[hello_id]
    # orth is the integer ID of the token text, orth_ the text itself
    # NOTE(review): the concrete ID 3125 presumably depends on the loaded
    # vocabulary - confirm against the current model
    assert token.orth  == hello_id  == 3125
    assert token.orth_ == hello_str == 'Hello'

+h(2, "examples-entities") Recognise and update named entities
    +tag-model("NER")

+code.
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    # Collect text, character offsets and label of every predicted entity
    ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(u'San Francisco', 0, 13, u'GPE')]

    from spacy.tokens import Span
    doc = nlp(u'Netflix is hiring a new VP of global policy')
    # Overwrite the entities with a single ORG span over token 0 ("Netflix");
    # the label is passed as the integer ID looked up in the string store
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(0, 7, u'ORG')]

+infobox
    |  #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]

+h(2, "displacy") Visualize a dependency parse and named entities in your browser
    +tag-model("dependency parse", "NER")

+aside
    .u-text-center(style="overflow: auto").
        <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
            </text>
            <path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
            </text>
            <path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
            <path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
            </text>
            <path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
            <path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
            </text>
            <path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
        </svg>

+code.
    from spacy import displacy

    # Serve a visualization of the dependency parse
    # NOTE(review): serve() presumably blocks while its web server is running,
    # so the two examples would be run one at a time - confirm
    doc_dep = nlp(u'This is a sentence.')
    displacy.serve(doc_dep, style='dep')

    # Serve a visualization of the named entities
    doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
                  u'in 2007, few people outside of the company took him seriously.')
    displacy.serve(doc_ent, style='ent')

+infobox
    |  #[strong API:] #[+api("displacy") #[code displacy]]
    |  #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]

+h(2, "examples-word-vectors") Get word vectors and similarity
    +tag-model("word vectors")

+code.
    doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
    # Pick the four words to compare by their token index
    apple = doc[0]
    banana = doc[2]
    pasta = doc[6]
    hippo = doc[8]
    assert apple.similarity(banana) > pasta.similarity(hippo)
    # Every one of the four tokens has a word vector
    assert all(token.has_vector for token in (apple, banana, pasta, hippo))

+infobox
    |  #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]

+h(2, "examples-serialization") Simple and efficient serialization

+code.
    import io
    import spacy
    from spacy.tokens.doc import Doc
    from spacy.vocab import Vocab

    nlp = spacy.load('en')
    # nlp() takes the text itself, not a file object, so read the file first;
    # io.open returns unicode on Python 2 and 3 and the with-block closes it
    # NOTE(review): assumes moby_dick.txt is UTF-8 encoded - confirm
    with io.open('moby_dick.txt', 'r', encoding='utf8') as file_:
        moby_dick = file_.read()
    doc = nlp(moby_dick)
    doc.to_disk('/moby_dick.bin')

    # Restore the document from disk into a fresh Doc with an empty vocab
    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')

+infobox
    |  #[strong API:] #[+api("language") #[code Language]],
    |  #[+api("doc") #[code Doc]]
    |  #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]

+h(2, "rule-matcher") Match text with token rules

+code.
    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

    def set_sentiment(matcher, doc, i, matches):
        """Callback that raises doc.sentiment by 0.1 each time it is called.

        The matcher, i and matches arguments are accepted but not used.
        """
        doc.sentiment = doc.sentiment + 0.1

    # One pattern is a list of token descriptions; pattern2 is a list of
    # such patterns, one per emoji
    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
    matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
    # LOTS_OF_TEXT is a placeholder for your own text; process it first,
    # then run the matcher over the resulting doc
    doc = nlp(LOTS_OF_TEXT)
    matches = matcher(doc)

+infobox
    |  #[strong API:] #[+api("matcher") #[code Matcher]]
    |  #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]

+h(2, "multi-threaded") Multi-threaded generator

+code.
    from itertools import cycle, islice

    texts = [u'One document.', u'...', u'Lots of documents']
    # .pipe streams input, and produces streaming output
    # cycle/islice lazily repeat the texts on both Python 2 and Python 3
    iter_texts = islice(cycle(texts), 100000000)
    for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
        assert doc.is_parsed
        # stop early: only the first documents of the stream are consumed
        if i == 100:
            break

+infobox
    |  #[strong API:] #[+api("doc") #[code Doc]]
    |  #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]

+h(2, "examples-dependencies") Get syntactic dependencies
    +tag-model("dependency parse")

+code.
    def dependency_labels_to_root(token):
        """Collect the dep value of every token on the path to the root.

        Starts at the given token and follows the head of each token until
        a token is reached that is its own head. The dep value of that last
        token is not included. Returns the collected values as a list, in
        the order they were visited.
        """
        labels = []
        node = token
        # a token that is its own head marks the top of the tree
        while node.head is not node:
            labels.append(node.dep)
            node = node.head
        return labels

+infobox
    |  #[strong API:] #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]

+h(2, "examples-numpy-arrays") Export to numpy arrays

+code.
    from spacy.attrs import ORTH, LIKE_URL, IS_OOV

    # Export the selected attributes of every token: one row per token,
    # one column per attribute ID
    attr_ids = [ORTH, LIKE_URL, IS_OOV]
    doc_array = doc.to_array(attr_ids)
    assert doc_array.shape == (len(doc), len(attr_ids))
    # Columns follow the order of attr_ids: column 0 is ORTH, column 1 LIKE_URL
    assert doc[0].orth == doc_array[0, 0]
    assert doc[1].orth == doc_array[1, 0]
    assert doc[0].like_url == doc_array[0, 1]
    assert list(doc_array[:, 1]) == [t.like_url for t in doc]

+h(2, "examples-inline") Calculate inline markup on original string

+code.
    def put_spans_around_tokens(doc, get_classes):
        """Wrap every non-whitespace token of the doc in a span element.

        get_classes is called with each such token and returns the class
        names for its span, which are joined with single spaces. The
        whitespace following a token is placed after the closing tag, and
        whitespace tokens are emitted as plain text, so no whitespace ends up
        inside a span. (HTML won't display more than one whitespace character
        in a row – but the point is that the information is kept, and you can
        calculate what you need from it, e.g. &lt;br /&gt;, &lt;p&gt; etc.)

        In the joined result, newlines are removed and each tab is replaced
        by four spaces. Returns the markup as a single string.
        """
        template = '&lt;span class="{classes}"&gt;{word}&lt;/span&gt;{space}'

        def render(token):
            # whitespace tokens are passed through without a span
            if token.is_space:
                return token.text
            return template.format(classes=' '.join(get_classes(token)),
                                   word=token.text, space=token.whitespace_)

        markup = ''.join(render(token) for token in doc)
        return markup.replace('\n', '').replace('\t', '    ')
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								//- 💫 DOCS > USAGE > LIGHTNING TOUR
 								include ../../_includes/_mixins
 								p
-												Fix typo

											
										
										
											2016-12-25 17:23:30 +03:00
+								    |  The following examples and code snippets give you an overview of spaCy's
-												Update text, examples, typos, wording and formatting

											
										
										
											2017-05-28 17:41:01 +03:00
+								    |  functionality and its usage. If you're new to spaCy, make sure to check
 								    |  out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "models") Install models and process text
-												Update lightning tour

											
										
										
											2017-03-17 15:11:00 +03:00
 								+code(false, "bash").
-												Update docs to reflect new commands

											
										
										
											2017-03-18 17:24:42 +03:00
+								    python -m spacy download en
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    python -m spacy download de
-												Update lightning tour

											
										
										
											2017-03-17 15:11:00 +03:00
 								+code.
 								    import spacy
 								    nlp = spacy.load('en')
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc = nlp(u'Hello, world. Here are two sentences.')
-												Update lightning tour

											
										
										
											2017-03-17 15:11:00 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    nlp_de = spacy.load('de')
 								    doc_de = nlp_de(u'Ich bin ein Berliner.')
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+infobox
 								    |  #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
 								    |  #[strong Usage:] #[+a("/docs/usage/models") Models],
 								    |  #[+a("/docs/usage/spacy-101") spaCy 101]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
 								    +tag-model("dependency parse")
-												Add displaCy examples to lightning tour

											
										
										
											2017-05-24 00:15:39 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
 								              u"emoji. It's outranking eggplant 🍑 ")
-												Add displaCy examples to lightning tour

											
										
										
											2017-05-24 00:15:39 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    assert doc[0].text == u'Peach'
 								    assert doc[1].text == u'emoji'
 								    assert doc[-1].text == u'🍑'
-												Update usage and 101 docs

											
										
										
											2017-05-26 13:46:29 +03:00
+								    assert doc[17:19].text == u'outranking eggplant'
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    assert doc.noun_chunks[0].text == u'Peach emoji'
-												Add displaCy examples to lightning tour

											
										
										
											2017-05-24 00:15:39 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    sentences = list(doc.sents)
 								    assert len(sentences) == 3
 								    assert sentences[0].text == u'Peach is the superior emoji.'
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+infobox
 								    |  #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
 								    |  #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "examples-pos-tags") Get part-of-speech tags and flags
 								    +tag-model("tagger")
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
 								    apple = doc[0]
 								    assert [apple.pos_, apple.pos] == [u'PROPN', 94]
 								    assert [apple.tag_, apple.tag] == [u'NNP', 475]
 								    assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
 								    assert apple.is_alpha == True
 								    assert apple.is_punct == False
 								    billion = doc[10]
 								    assert billion.is_digit == False
 								    assert billion.like_num == True
 								    assert billion.like_email == False
 								+infobox
 								    |  #[strong API:] #[+api("token") #[code Token]]
 								    |  #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+h(2, "examples-integer-ids") Use integer IDs for any string
 								+code.
 								    hello_id = nlp.vocab.strings['Hello']
 								    hello_str = nlp.vocab.strings[hello_id]
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    assert token.text  == hello_id  == 3125
 								    assert token.text == hello_str == 'Hello'
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "examples-entities") Recongnise and update named entities
 								    +tag-model("NER")
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
-												Update text, examples, typos, wording and formatting

											
										
										
											2017-05-28 17:41:01 +03:00
+								    ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    assert ents == [(u'San Francisco', 0, 13, u'GPE')]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    from spacy.tokens import Span
 								    doc = nlp(u'Netflix is hiring a new VP of global policy')
 								    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
-												Update text, examples, typos, wording and formatting

											
										
										
											2017-05-28 17:41:01 +03:00
+								    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    assert ents == [(0, 7, u'ORG')]
 								+infobox
 								    |  #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
 								+h(2, "displacy") Visualize a dependency parse and named entities in your browser
 								    +tag-model("dependency parse", "NER")
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Update text, examples, typos, wording and formatting

											
										
										
											2017-05-28 17:41:01 +03:00
+								+aside
 								    .u-text-center(style="overflow: auto").
 								        <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
 								            <text fill="currentColor" text-anchor="middle" y="222.0">
 								                <tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
 								                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
 								            </text>
 								            <text fill="currentColor" text-anchor="middle" y="222.0">
 								                <tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
 								                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
 								            </text>
 								            <text fill="currentColor" text-anchor="middle" y="222.0">
 								                <tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
 								                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
 								            </text>
 								            <text fill="currentColor" text-anchor="middle" y="222.0">
 								                <tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
 								                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
 								            </text>
 								            <path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
 								            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
 								                <textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
 								            </text>
 								            <path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
 								            <path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
 								            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
 								                <textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
 								            </text>
 								            <path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
 								            <path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
 								            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
 								                <textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
 								            </text>
 								            <path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
 								        </svg>
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    from spacy import displacy
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc_dep = nlp(u'This is a sentence.')
 								    displacy.serve(doc_dep, style='dep')
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								    doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
 								                  u'in 2007, few people outside of the company took him seriously.')
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    displacy.serve(doc_ent, style='ent')
 								+infobox
 								    |  #[strong API:] #[+api("displacy") #[code displacy]]
 								    |  #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								+h(2, "examples-word-vectors") Get word vectors and similarity
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    +tag-model("word vectors")
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
 								    apple = doc[0]
 								    banana = doc[2]
 								    pasta = doc[6]
 								    hippo = doc[8]
 								    assert apple.similarity(banana) > pasta.similarity(hippo)
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								    assert apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+infobox
 								    |  #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "examples-serialization") Simple and efficient serialization
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    import spacy
 								    from spacy.tokens.doc import Doc
-												Fix initialisation of Doc in lightning tour example

											
										
										
											2017-05-27 18:58:06 +03:00
+								    from spacy.vocab import Vocab
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    nlp = spacy.load('en')
 								    moby_dick = open('moby_dick.txt', 'r')
 								    doc = nlp(moby_dick)
 								    doc.to_disk('/moby_dick.bin')
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Fix initialisation of Doc in lightning tour example

											
										
										
											2017-05-27 18:58:06 +03:00
+								    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+infobox
-												Update 101 and usage docs

											
										
										
											2017-05-28 01:03:16 +03:00
+								    |  #[strong API:] #[+api("language") #[code Language]],
 								    |  #[+api("doc") #[code Doc]]
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    |  #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								+h(2, "rule-matcher") Match text with token rules
 								+code.
 								    import spacy
 								    from spacy.matcher import Matcher
 								    nlp = spacy.load('en')
 								    matcher = Matcher(nlp.vocab)
-												Add emoji sentiment to lightning tour matcher example

											
										
										
											2017-05-27 21:02:20 +03:00
 								    def set_sentiment(matcher, doc, i, matches):
 								        doc.sentiment += 0.1
 								    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
 								    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
 								    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
-												Update text, examples, typos, wording and formatting

											
										
										
											2017-05-28 17:41:01 +03:00
+								    matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								    matches = nlp(LOTS_OF TEXT)
 								+infobox
 								    |  #[strong API:] #[+api("matcher") #[code Matcher]]
 								    |  #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+h(2, "multi-threaded") Multi-threaded generator
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+code.
 								    texts = [u'One document.', u'...', u'Lots of documents']
 								    # .pipe streams input, and produces streaming output
 								    iter_texts = (texts[i % 3] for i in xrange(100000000))
 								    for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
 								        assert doc.is_parsed
 								        if i == 100:
 								            break
 								+infobox
 								    |  #[strong API:] #[+api("doc") #[code Doc]]
 								    |  #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
 								+h(2, "examples-dependencies") Get syntactic dependencies
 								    +tag-model("dependency parse")
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
 								    def dependency_labels_to_root(token):
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								        """Walk up the syntactic tree, collecting the arc labels."""
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								        dep_labels = []
 								        while token.head is not token:
 								            dep_labels.append(token.dep)
 								            token = token.head
 								        return dep_labels
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								+infobox
 								    |  #[strong API:] #[+api("token") #[code Token]]
 								    |  #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
 								+h(2, "examples-numpy-arrays") Export to numpy arrays
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
-												Rewrite examples in lightning tour

											
										
										
											2017-05-25 02:58:33 +03:00
+								    from spacy.attrs import ORTH, LIKE_URL, IS_OOV
 								    attr_ids = [ORTH, LIKE_URL, IS_OOV]
 								    doc_array = doc.to_array(attr_ids)
 								    assert doc_array.shape == (len(doc), len(attr_ids))
 								    assert doc[0].orth == doc_array[0, 0]
 								    assert doc[1].orth == doc_array[1, 0]
 								    assert doc[0].like_url == doc_array[0, 1]
 								    assert list(doc_array[:, 1]) == [t.like_url for t in doc]
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								+h(2, "examples-inline") Calculate inline markup on original string
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
 								+code.
 								    def put_spans_around_tokens(doc, get_classes):
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								        """Given some function to compute class names, put each token in a
 								        span element, with the appropriate classes computed. All whitespace is
 								        preserved, outside of the spans. (Of course, HTML won't display more than
 								        one whitespace character it – but the point is, no information is lost
 								        and you can calculate what you need, e.g. &lt;br /&gt;, &lt;p&gt; etc.)
 								        """
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								        output = []
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								        html = '&lt;span class="{classes}"&gt;{word}&lt;/span&gt;{space}'
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								        for token in doc:
 								            if token.is_space:
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								                output.append(token.text)
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								            else:
-												Update and fix lightning tour examples

											
										
										
											2017-05-25 12:15:56 +03:00
+								                classes = ' '.join(get_classes(token))
 								                output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
-												Update to new website

											
										
										
											2016-10-31 21:04:15 +03:00
+								        string = ''.join(output)
 								        string = string.replace('\n', '')
 								        string = string.replace('\t', '    ')
 								        return string