2016-10-31 21:04:15 +03:00
|
|
|
|
//- 💫 DOCS > USAGE > LIGHTNING TOUR
|
|
|
|
|
|
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
|
|
|
|
|
|
p
|
2016-12-25 17:23:30 +03:00
|
|
|
|
| The following examples and code snippets give you an overview of spaCy's
|
2016-10-31 21:04:15 +03:00
|
|
|
|
| functionality and its usage.
|
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "models") Install models and process text
|
2017-03-17 15:11:00 +03:00
|
|
|
|
|
|
|
|
|
+code(false, "bash").
|
2017-03-18 17:24:42 +03:00
|
|
|
|
python -m spacy download en
|
2017-05-25 02:58:33 +03:00
|
|
|
|
python -m spacy download de
|
2017-03-17 15:11:00 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
import spacy
|
|
|
|
|
nlp = spacy.load('en')
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc = nlp(u'Hello, world. Here are two sentences.')
|
2017-03-17 15:11:00 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
nlp_de = spacy.load('de')
|
|
|
|
|
doc_de = nlp_de(u'Ich bin ein Berliner.')
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/models") Models],
|
|
|
|
|
| #[+a("/docs/usage/spacy-101") spaCy 101]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
|
|
|
|
|
+tag-model("dependency parse")
|
2017-05-24 00:15:39 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
|
|
|
|
|
u"emoji. It's outranking eggplant 🍑 ")
|
2017-05-24 00:15:39 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
assert doc[0].text == u'Peach'
|
|
|
|
|
assert doc[1].text == u'emoji'
|
|
|
|
|
assert doc[-1].text == u'🍑'
|
2017-05-26 13:46:29 +03:00
|
|
|
|
assert doc[17:19].text == u'outranking eggplant'
|
2017-05-25 02:58:33 +03:00
|
|
|
|
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
|
2017-05-24 00:15:39 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
sentences = list(doc.sents)
|
|
|
|
|
assert len(sentences) == 3
|
|
|
|
|
assert sentences[1].text == u'Peach is the superior emoji.'
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
|
|
|
|
|
+tag-model("tagger")
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
|
|
|
|
|
apple = doc[0]
|
|
|
|
|
assert [apple.pos_, apple.pos] == [u'PROPN', 94]
|
|
|
|
|
assert [apple.tag_, apple.tag] == [u'NNP', 475]
|
|
|
|
|
assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
|
|
|
|
|
assert apple.is_alpha == True
|
|
|
|
|
assert apple.is_punct == False
|
|
|
|
|
|
|
|
|
|
billion = doc[10]
|
|
|
|
|
assert billion.is_digit == False
|
|
|
|
|
assert billion.like_num == True
|
|
|
|
|
assert billion.like_email == False
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("token") #[code Token]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+h(2, "examples-integer-ids") Use integer IDs for any string
|
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
hello_id = nlp.vocab.strings['Hello']
|
|
|
|
|
hello_str = nlp.vocab.strings[hello_id]
|
2017-05-25 02:58:33 +03:00
|
|
|
|
assert hello_id == 3125
|
|
|
|
|
assert hello_str == 'Hello'
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "examples-entities") Recognise and update named entities
|
|
|
|
|
+tag-model("NER")
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
|
|
|
|
|
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
|
|
|
|
|
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
from spacy.tokens import Span
|
|
|
|
|
doc = nlp(u'Netflix is hiring a new VP of global policy')
|
|
|
|
|
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
|
|
|
|
|
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
|
|
|
|
assert ents == [(0, 7, u'ORG')]
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
|
|
|
|
|
|
|
|
|
|
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
|
|
|
|
|
+tag-model("dependency parse", "NER")
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
from spacy import displacy
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc_dep = nlp(u'This is a sentence.')
|
|
|
|
|
displacy.serve(doc_dep, style='dep')
|
|
|
|
|
|
2017-05-25 12:15:56 +03:00
|
|
|
|
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
|
|
|
|
|
u'in 2007, few people outside of the company took him seriously.')
|
2017-05-25 02:58:33 +03:00
|
|
|
|
displacy.serve(doc_ent, style='ent')
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("displacy") #[code displacy]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 12:15:56 +03:00
|
|
|
|
+h(2, "examples-word-vectors") Get word vectors and similarity
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+tag-model("word vectors")
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
|
|
|
|
|
apple = doc[0]
|
|
|
|
|
banana = doc[2]
|
|
|
|
|
pasta = doc[6]
|
|
|
|
|
hippo = doc[8]
|
|
|
|
|
assert apple.similarity(banana) > pasta.similarity(hippo)
|
2017-05-25 12:15:56 +03:00
|
|
|
|
assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+infobox
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "examples-serialization") Simple and efficient serialization
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
import spacy
|
|
|
|
|
from spacy.tokens.doc import Doc
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
|
moby_dick = open('moby_dick.txt', 'r').read()
|
|
|
|
|
doc = nlp(moby_dick)
|
|
|
|
|
doc.to_disk('/moby_dick.bin')
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
new_doc = Doc(nlp.vocab).from_disk('/moby_dick.bin')
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+infobox
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 12:15:56 +03:00
|
|
|
|
+h(2, "rule-matcher") Match text with token rules
|
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
import spacy
|
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
|
|
|
|
|
|
nlp = spacy.load('en')
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
|
|
|
|
# match "Google I/O" or "Google i/o"
|
|
|
|
|
pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
|
|
|
|
|
matcher.add('GoogleIO', None, pattern)
|
|
|
|
|
matches = matcher(nlp(u'LOTS OF TEXT'))
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("matcher") #[code Matcher]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
|
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+h(2, "multi-threaded") Multi-threaded generator
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+code.
|
|
|
|
|
texts = [u'One document.', u'...', u'Lots of documents']
|
|
|
|
|
# .pipe streams input, and produces streaming output
|
|
|
|
|
iter_texts = (texts[i % 3] for i in xrange(100000000))
|
|
|
|
|
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
|
|
|
|
|
assert doc.is_parsed
|
|
|
|
|
if i == 100:
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("doc") #[code Doc]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
|
|
|
|
|
|
|
|
|
|
+h(2, "examples-dependencies") Get syntactic dependencies
|
|
|
|
|
+tag-model("dependency parse")
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
def dependency_labels_to_root(token):
    """Walk up the syntactic tree, collecting the arc labels.

    token: The token to start from. It must expose a `head` attribute (its
        syntactic parent) and a `dep` attribute (the label of the arc to
        that parent).
    RETURNS (list): The `dep` value of every token visited on the way up,
        starting with `token` itself. The root contributes nothing, so the
        list is empty when `token` is already the root.
    """
    dep_labels = []
    # The root is the token that is its own head. Compare by equality rather
    # than identity so the loop still terminates when `head` hands back a
    # fresh wrapper object on every access; for objects without custom
    # equality this is the same identity check as before.
    while token.head != token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
|
|
|
|
|
|
2017-05-25 02:58:33 +03:00
|
|
|
|
+infobox
|
|
|
|
|
| #[strong API:] #[+api("token") #[code Token]]
|
|
|
|
|
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
|
|
|
|
|
|
|
|
|
|
+h(2, "examples-numpy-arrays") Export to numpy arrays
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
2017-05-25 02:58:33 +03:00
|
|
|
|
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
|
|
|
|
|
|
|
|
|
|
attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
|
|
|
|
doc_array = doc.to_array(attr_ids)
|
|
|
|
|
assert doc_array.shape == (len(doc), len(attr_ids))
|
|
|
|
|
assert doc[0].orth == doc_array[0, 0]
|
|
|
|
|
assert doc[1].orth == doc_array[1, 0]
|
|
|
|
|
assert doc[0].like_url == doc_array[0, 1]
|
|
|
|
|
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
2017-05-25 12:15:56 +03:00
|
|
|
|
+h(2, "examples-inline") Calculate inline markup on original string
|
2016-10-31 21:04:15 +03:00
|
|
|
|
|
|
|
|
|
+code.
|
|
|
|
|
def put_spans_around_tokens(doc, get_classes):
    """Wrap every non-whitespace token of a document in a span element.

    Each token is rendered as `<span class="...">text</span>`, followed by
    the token's trailing whitespace, which is kept outside of the span.
    Whitespace tokens are emitted as plain text without a span. Finally,
    newlines are removed and each tab is replaced by a space. (Of course,
    HTML won't display more than one whitespace character in a row, but the
    point is that the token boundaries stay available, so you can calculate
    what you need, e.g. <br />, <p> etc.)

    doc: An iterable of tokens exposing `is_space`, `text` and `whitespace_`.
    get_classes (callable): Takes a token and returns an iterable of class
        name strings for that token's span.
    RETURNS: The generated markup as a single string.
    """
    # Imported locally so the snippet stays self-contained; this helper is
    # available on both Python 2 and Python 3.
    from xml.sax.saxutils import escape

    output = []
    html = '<span class="{classes}">{word}</span>{space}'
    for token in doc:
        if token.is_space:
            output.append(token.text)
        else:
            # Escape markup characters so that text such as "a<b" or "R&D"
            # cannot break the generated HTML. Double quotes are escaped as
            # well in the class list, because it sits inside a quoted
            # attribute value.
            classes = escape(' '.join(get_classes(token)), {'"': '&quot;'})
            word = escape(token.text)
            output.append(html.format(classes=classes, word=word,
                                      space=token.whitespace_))
    string = ''.join(output)
    string = string.replace('\n', '')
    string = string.replace('\t', ' ')
    return string
|