spaCy/website/docs/usage/lightning-tour.jade

212 lines
6.8 KiB
Plaintext
Raw Normal View History

2016-10-31 21:04:15 +03:00
//- 💫 DOCS > USAGE > LIGHTNING TOUR
include ../../_includes/_mixins
p
2016-12-25 17:23:30 +03:00
| The following examples and code snippets give you an overview of spaCy's
2016-10-31 21:04:15 +03:00
| functionality and its usage.
2017-05-25 02:58:33 +03:00
+h(2, "models") Install models and process text
2017-03-17 15:11:00 +03:00
+code(false, "bash").
2017-03-18 17:24:42 +03:00
python -m spacy download en
2017-05-25 02:58:33 +03:00
python -m spacy download de
2017-03-17 15:11:00 +03:00
+code.
import spacy
nlp = spacy.load('en')
2017-05-25 02:58:33 +03:00
doc = nlp(u'Hello, world. Here are two sentences.')
2017-03-17 15:11:00 +03:00
2017-05-25 02:58:33 +03:00
nlp_de = spacy.load('de')
doc_de = nlp_de(u'Ich bin ein Berliner.')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
| #[strong Usage:] #[+a("/docs/usage/models") Models],
| #[+a("/docs/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
+tag-model("dependency parse")
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
u"emoji. It's outranking eggplant 🍑 ")
2017-05-25 02:58:33 +03:00
assert doc[0].text == u'Peach'
assert doc[1].text == u'emoji'
assert doc[-1].text == u'🍑'
assert doc[17:19].text == u'outranking eggplant'
assert list(doc.noun_chunks)[0].text == u'Peach emoji'
2017-05-25 02:58:33 +03:00
sentences = list(doc.sents)
assert len(sentences) == 3
assert sentences[1].text == u'Peach is the superior emoji.'
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
+tag-model("tagger")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
apple = doc[0]
assert [apple.pos_, apple.pos] == [u'PROPN', 94]
assert [apple.tag_, apple.tag] == [u'NNP', 475]
assert [apple.shape_, apple.shape] == [u'Xxxxx', 684]
assert apple.is_alpha == True
assert apple.is_punct == False
billion = doc[10]
assert billion.is_digit == False
assert billion.like_num == True
assert billion.like_email == False
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
2016-10-31 21:04:15 +03:00
+h(2, "examples-integer-ids") Use integer IDs for any string
+code.
hello_id = nlp.vocab.strings['Hello']
hello_str = nlp.vocab.strings[hello_id]
2017-05-25 02:58:33 +03:00
assert hello_id == 3125
assert hello_str == 'Hello'
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-entities") Recognise and update named entities
+tag-model("NER")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(u'San Francisco', 0, 13, u'GPE')]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
from spacy.tokens import Span
doc = nlp(u'Netflix is hiring a new VP of global policy')
doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
assert ents == [(0, 7, u'ORG')]
+infobox
| #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
+tag-model("dependency parse", "NER")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy import displacy
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
doc_dep = nlp(u'This is a sentence.')
displacy.serve(doc_dep, style='dep')
doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at '
u'Google in 2007, few people outside of the company took him seriously.')
displacy.serve(doc_ent, style='ent')
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]
2016-10-31 21:04:15 +03:00
+h(2, "examples-word-vectors") Word vectors
2017-05-25 02:58:33 +03:00
+tag-model("word vectors")
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
apple = doc[0]
banana = doc[2]
pasta = doc[6]
hippo = doc[8]
assert apple.similarity(banana) > pasta.similarity(hippo)
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "examples-serialization") Simple and efficient serialization
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
import spacy
from spacy.tokens.doc import Doc
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
nlp = spacy.load('en')
moby_dick = open('moby_dick.txt', 'r').read()
doc = nlp(moby_dick)
doc.to_disk('/moby_dick.bin')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
new_doc = Doc(nlp.vocab).from_disk('/moby_dick.bin')
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+infobox
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+h(2, "multi-threaded") Multi-threaded generator
2016-10-31 21:04:15 +03:00
2017-05-25 02:58:33 +03:00
+code.
texts = [u'One document.', u'...', u'Lots of documents']
# .pipe streams input, and produces streaming output
iter_texts = (texts[i % 3] for i in xrange(100000000))
for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
assert doc.is_parsed
if i == 100:
break
+infobox
| #[strong API:] #[+api("doc") #[code Doc]]
| #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]
+h(2, "examples-dependencies") Get syntactic dependencies
+tag-model("dependency parse")
2016-10-31 21:04:15 +03:00
+code.
def dependency_labels_to_root(token):
2017-05-25 02:58:33 +03:00
"""Walk up the syntactic tree, collecting the arc labels."""
2016-10-31 21:04:15 +03:00
dep_labels = []
while token.head is not token:
dep_labels.append(token.dep)
token = token.head
return dep_labels
2017-05-25 02:58:33 +03:00
+infobox
| #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]
+h(2, "examples-numpy-arrays") Export to numpy arrays
2016-10-31 21:04:15 +03:00
+code.
2017-05-25 02:58:33 +03:00
from spacy.attrs import ORTH, LIKE_URL, IS_OOV
attr_ids = [ORTH, LIKE_URL, IS_OOV]
doc_array = doc.to_array(attr_ids)
assert doc_array.shape == (len(doc), len(attr_ids))
assert doc[0].orth == doc_array[0, 0]
assert doc[1].orth == doc_array[1, 0]
assert doc[0].like_url == doc_array[0, 1]
assert list(doc_array[:, 1]) == [t.like_url for t in doc]
2016-10-31 21:04:15 +03:00
+h(2, "examples-inline") Calculate inline mark-up on original string
+code.
def put_spans_around_tokens(doc, get_classes):
'''Given some function to compute class names, put each token in a
span element, with the appropriate classes computed.
All whitespace is preserved, outside of the spans. (Yes, I know HTML
won't display it. But the point is no information is lost, so you can
calculate what you need, e.g. <br /> tags, <p> tags, etc.)
'''
output = []
template = '<span classes="{classes}">{word}</span>{space}'
for token in doc:
if token.is_space:
output.append(token.orth_)
else:
output.append(
template.format(
classes=' '.join(get_classes(token)),
word=token.orth_,
space=token.whitespace_))
string = ''.join(output)
string = string.replace('\n', '')
string = string.replace('\t', ' ')
return string