spaCy/website/docs/usage/lightning-tour.jade

212 lines
6.8 KiB
Plaintext
Raw Normal View History

2016-10-31 21:04:15 +03:00
//- 💫 DOCS > USAGE > LIGHTNING TOUR
include ../../_includes/_mixins
p
2016-12-25 17:23:30 +03:00
| The following examples and code snippets give you an overview of spaCy's
2016-10-31 21:04:15 +03:00
| functionality and its usage.
2017-05-25 02:58:33 +03:00
+h(2, "models") Install models and process text
2017-03-17 15:11:00 +03:00
+code(false, "bash").
2017-03-18 17:24:42 +03:00
python -m spacy download en
2017-05-25 02:58:33 +03:00
python -m spacy download de
2017-03-17 15:11:00 +03:00
+code.
import spacy
nlp = spacy.load('en')
2017-05-25 02:58:33 +03:00
doc = nlp(u'Hello, world. Here are two sentences.')
2017-03-17 15:11:00 +03:00
2017-05-25 02:58:33 +03:00
nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
2017-05-25 02:58:33 +03:00
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19].text == u'outranking eggplant'
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
2017-05-25 02:58:33 +03:00
sentences = list(doc.sents)
assert len(sentences) == 3
assert sentences[1].text == u'Peach is the superior emoji.'
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 94]
assert [apple.tag_, apple.tag] == [u'NNP', 475]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
assert apple.is_alpha == True
assert apple.is_punct == False
billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
2016-10-31 21:04:15 +03:00
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
2017-05-25 02:58:33 +03:00
assert hello_id == 3125
assert hello_str == 'Hello'
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-entities") Recognise and update named entities
+tag-model("NER")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(0, 7, u'ORG')]
+infobox
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy import displacy
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at '
u'Google in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
2016-10-31 21:04:15 +03:00
+h(2, "examples-word-vectors") Word vectors
2017-05-25 02:58:33 +03:00
+tag-model("word vectors")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-serialization") Simple and efficient serialization
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
import spacy
from spacy.tokens.doc import Doc
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
new_doc = Doc(nlp.vocab).from_disk('/moby_dick.bin')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "multi-threaded") Multi-threaded generator
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+code.
texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in xrange(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
assert doc.is_parsed
if i == 100:
break
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
+h(2, "examples-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
2016-10-31 21:04:15 +03:00
+code.
def dependency_labels_to_root(token):
2017-05-25 02:58:33 +03:00
"""Walk up the syntactic tree, collecting the arc labels."""
2016-10-31 21:04:15 +03:00
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
+h(2, "examples-numpy-arrays") Export to numpy arrays
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids)
assert doc_array.shape == (len(doc), len(attr_ids))
assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
2016-10-31 21:04:15 +03:00
+h(2, "examples-inline") Calculate inline mark-up on original string
+code.
def put_spans_around_tokens(doc, get_classes):
'''Given some function to compute class names, put each token in a
span element, with the appropriate classes computed.
All whitespace is preserved, outside of the spans. (Yes, I know HTML
won't display it. But the point is no information is lost, so you can
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
'''
output = []
template = '<span classes="{classes}">{word}</span>{space}'
for token in doc:
if token.is_space:
output.append(token.orth_)
else:
output.append(
template.format(
classes=' '.join(get_classes(token)),
word=token.orth_,
space=token.whitespace_))
string = ''.join(output)
string = string.replace('\n', '')
string = string.replace('\t', ' ')
return string