spaCy/website/usage/_spacy-101/_lightning-tour.jade

285 lines
12 KiB
Plaintext
Raw Normal View History

2017-10-03 15:26:20 +03:00
//- 💫 DOCS > USAGE > SPACY 101 > LIGHTNING TOUR
2016-10-31 21:04:15 +03:00
p
2016-12-25 17:23:30 +03:00
| The following examples and code snippets give you an overview of spaCy's
2017-11-01 21:49:36 +03:00
| functionality and its usage.
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-models") Install models and process text
2017-03-17 15:11:00 +03:00
+code(false, "bash").
python -m spacy download en
python -m spacy download de
2017-03-17 15:11:00 +03:00
+code.
import spacy
nlp = spacy.load('en')
2017-05-25 02:58:33 +03:00
doc = nlp(u'Hello, world. Here are two sentences.')
2017-03-17 15:11:00 +03:00
2017-05-25 02:58:33 +03:00
nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("spacy#load") #[code spacy.load()]]
| #[+label-inline Usage:] #[+a("/usage/models") Models],
2017-10-03 15:26:20 +03:00
| #[+a("/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences
2017-05-25 02:58:33 +03:00
+tag-model("dependency parse")
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
2017-05-25 02:58:33 +03:00
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
2017-05-26 13:46:29 +03:00
assert doc[17:19].text == u'outranking eggplant'
2017-06-08 20:15:50 +03:00
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
2017-05-25 02:58:33 +03:00
sentences = list(doc.sents)
assert len(sentences) == 3
2017-06-08 20:15:50 +03:00
assert sentences[1].text == u'Peach is the superior emoji.'
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[+label-inline Usage:] #[+a("/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags
2017-05-25 02:58:33 +03:00
+tag-model("tagger")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
2017-05-29 02:06:49 +03:00
assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
2017-05-25 02:58:33 +03:00
assert apple.is_alpha == True
assert apple.is_punct == False
billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("token") #[code Token]]
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-hashes") Use hash values for any string
2016-10-31 21:04:15 +03:00
+code.
doc = nlp(u'I love coffee')
2017-05-29 02:06:49 +03:00
coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
2017-05-29 02:06:49 +03:00
assert doc[2].orth == coffee_hash == 3197928453018144401
assert doc[2].text == coffee_text == u'coffee'
2017-05-29 02:06:49 +03:00
beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
beer_text = doc.vocab.strings[beer_hash] # 'beer'
2016-10-31 21:04:15 +03:00
2017-05-29 02:06:49 +03:00
unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
2017-05-28 20:42:44 +03:00
unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
| #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
2017-05-28 20:42:44 +03:00
2017-11-09 15:55:13 +03:00
+h(3, "lightning-tour-entities") Recognise and update named entities
2017-05-25 02:58:33 +03:00
+tag-model("NER")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
2017-05-25 02:58:33 +03:00
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
2017-05-25 02:58:33 +03:00
assert ents == [(0, 7, u'ORG')]
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition]
2017-05-25 02:58:33 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser
2017-05-25 02:58:33 +03:00
+tag-model("dependency parse", "NER")
+tag-new(2)
2016-10-31 21:04:15 +03:00
+aside
.u-text-center(style="overflow: auto").
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
<text fill="currentColor" text-anchor="middle" y="222.0">
<tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
</text>
<text fill="currentColor" text-anchor="middle" y="222.0">
<tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
</text>
<text fill="currentColor" text-anchor="middle" y="222.0">
<tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
</text>
<text fill="currentColor" text-anchor="middle" y="222.0">
<tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
<tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
</text>
<path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
<textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
</text>
<path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
<path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
<textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
</text>
<path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
<path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
<text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
<textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
</text>
<path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
</svg>
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy import displacy
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')
2017-05-25 12:15:56 +03:00
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
u'in 2007, few people outside of the company took him seriously.')
2017-05-25 02:58:33 +03:00
displacy.serve(doc_ent, style='ent')
+infobox
| #[+label-inline API:] #[+api("top-level#displacy") #[code displacy]]
2017-10-16 21:36:41 +03:00
| #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizers]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-word-vectors") Get word vectors and similarity
2017-05-25 02:58:33 +03:00
+tag-model("word vectors")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
# Compare token similarity using the loaded model's word vectors.
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
# A comma after the asserted expression introduces the failure message, so
# several flags cannot be comma-chained in one assert; check them with all().
assert all(token.has_vector for token in (apple, banana, pasta, hippo))
2016-10-31 21:04:15 +03:00
2017-10-16 21:36:41 +03:00
p
| For the best results, you should run this example using the
| #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model.
2017-05-25 02:58:33 +03:00
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-serialization") Simple and efficient serialization
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab

nlp = spacy.load('en')
# Read the text inside a context manager so the file handle is closed
# as soon as the contents have been loaded.
with open('moby_dick.txt', 'r') as file_:
    moby_dick = file_.read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')

# Load the serialized document back into a fresh Doc with an empty Vocab.
new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("language") #[code Language]],
2017-05-28 01:03:16 +03:00
| #[+api("doc") #[code Doc]]
2017-10-16 21:36:41 +03:00
| #[+label-inline Usage:] #[+a("/usage/models#saving-loading") Saving and loading models]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-rule-matcher") Match text with token rules
2017-05-25 12:15:56 +03:00
+code.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)


def set_sentiment(matcher, doc, i, matches):
    """Match callback: raise the document's sentiment score by 0.1."""
    doc.sentiment += 0.1


pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
matcher.add('GoogleIO', None, pattern1)  # match "Google I/O" or "Google i/o"
matcher.add('HAPPY', set_sentiment, *pattern2)  # match one or more happy emoji

# LOTS_OF_TEXT is a placeholder for your own text. The pipeline produces a
# Doc; the matcher is then called on that Doc to find the pattern matches.
doc = nlp(LOTS_OF_TEXT)
matches = matcher(doc)
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("matcher") #[code Matcher]]
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching]
2017-05-25 12:15:56 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-multi-threaded") Multi-threaded generator
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+code.
from itertools import cycle, islice

texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
# Repeat the three texts lazily, 100000000 items in total. cycle/islice
# behave the same on Python 2 and 3, whereas xrange exists only on Python 2.
iter_texts = islice(cycle(texts), 100000000)
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
    assert doc.is_parsed
    # Stop early: only the first 101 documents are consumed from the stream.
    if i == 100:
        break
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("doc") #[code Doc]]
| #[+label-inline Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines]
2017-05-25 02:58:33 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-dependencies") Get syntactic dependencies
2017-05-25 02:58:33 +03:00
+tag-model("dependency parse")
2016-10-31 21:04:15 +03:00
+code.
def dependency_labels_to_root(token):
    """Collect the dependency label of every arc between `token` and the root.

    The root is recognised as the token that is its own head. The returned
    list holds the `dep` value of each visited token, starting with `token`
    itself and ending just below the root; it is empty when `token` is
    already the root.
    """
    labels = []
    node = token
    while True:
        parent = node.head
        if parent is node:
            # Reached the root: it has no incoming arc to record.
            return labels
        labels.append(node.dep)
        node = parent
2017-05-25 02:58:33 +03:00
+infobox
2017-10-16 21:36:41 +03:00
| #[+label-inline API:] #[+api("token") #[code Token]]
| #[+label-inline Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse]
2017-05-25 02:58:33 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-numpy-arrays") Export to numpy arrays
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids)
assert doc_array.shape == (len(doc), len(attr_ids))
assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
2016-10-31 21:04:15 +03:00
2017-10-03 15:26:20 +03:00
+h(3, "lightning-tour-inline") Calculate inline markup on original string
2016-10-31 21:04:15 +03:00
+code.
def put_spans_around_tokens(doc, get_classes):
    """Wrap every non-whitespace token of `doc` in a span element.

    `get_classes` is called with each non-whitespace token and must return an
    iterable of class-name strings, which are joined with single spaces.
    Whitespace tokens are emitted as-is, and each token's trailing whitespace
    is placed outside its span, so no information is lost. (HTML will not
    display more than one whitespace character in a row, but you can still
    calculate what you need from it, e.g. &lt;br /&gt;, &lt;p&gt; etc.)
    Finally, newlines are removed from the result and tabs are replaced.
    """
    template = '&lt;span class="{classes}"&gt;{word}&lt;/span&gt;{space}'

    def render(token):
        # Whitespace tokens pass through untouched; everything else is wrapped.
        if token.is_space:
            return token.text
        class_attr = ' '.join(get_classes(token))
        return template.format(classes=class_attr, word=token.text, space=token.whitespace_)

    markup = ''.join(render(token) for token in doc)
    return markup.replace('\n', '').replace('\t', ' ')