//- 💫 DOCS > USAGE > LIGHTNING TOUR include ../../_includes/_mixins p | The following examples and code snippets give you an overview of spaCy's | functionality and its usage. +h(2, "models") Install models and process text +code(false, "bash"). python -m spacy download en python -m spacy download de +code. import spacy nlp = spacy.load('en') doc = nlp(u'Hello, world. Here are two sentences.') nlp_de = spacy.load('de') doc_de = nlp_de(u'Ich bin ein Berliner.') +infobox | #[strong API:] #[+api("spacy#load") #[code spacy.load()]] | #[strong Usage:] #[+a("/docs/usage/models") Models], | #[+a("/docs/usage/spacy-101") spaCy 101] +h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences +tag-model("dependency parse") +code. doc = nlp(u"Peach emoji is where it has always been. Peach is the superior " u"emoji. It's outranking eggplant 🍑 ") assert doc[0].text == u'Peach' assert doc[1].text == u'emoji' assert doc[-1].text == u'🍑' assert doc[17:19].text == u'outranking eggplant' assert list(doc.noun_chunks)[0].text == u'Peach emoji' sentences = list(doc.sents) assert len(sentences) == 3 assert sentences[1].text == u'Peach is the superior emoji.' +infobox | #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]] | #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101] +h(2, "examples-pos-tags") Get part-of-speech tags and flags +tag-model("tagger") +code. doc = nlp(u'Apple is looking at buying U.K. 
startup for $1 billion') apple = doc[0] assert [apple.pos_, apple.pos] == [u'PROPN', 94] assert [apple.tag_, apple.tag] == [u'NNP', 475] assert [apple.shape_, apple.shape] == [u'Xxxxx', 684] assert apple.is_alpha == True assert apple.is_punct == False billion = doc[10] assert billion.is_digit == False assert billion.like_num == True assert billion.like_email == False +infobox | #[strong API:] #[+api("token") #[code Token]] | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] +h(2, "examples-integer-ids") Use integer IDs for any string +code. hello_id = nlp.vocab.strings['Hello'] hello_str = nlp.vocab.strings[hello_id] assert token.orth == hello_id == 3125 assert token.text == hello_str == 'Hello' +h(2, "examples-entities") Recognise and update named entities +tag-model("NER") +code. doc = nlp(u'San Francisco considers banning sidewalk delivery robots') ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents] assert ents == [(u'San Francisco', 0, 13, u'GPE')] from spacy.tokens import Span doc = nlp(u'Netflix is hiring a new VP of global policy') doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])] ents = [(e.start_char, e.end_char, e.label_) for e in doc.ents] assert ents == [(0, 7, u'ORG')] +infobox | #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition] +h(2, "displacy") Visualize a dependency parse and named entities in your browser +tag-model("dependency parse", "NER") +code. from spacy import displacy doc_dep = nlp(u'This is a sentence.') displacy.serve(doc_dep, style='dep') doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google ' u'in 2007, few people outside of the company took him seriously.') displacy.serve(doc_ent, style='ent') +infobox | #[strong API:] #[+api("displacy") #[code displacy]] | #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers] +h(2, "examples-word-vectors") Get word vectors and similarity +tag-model("word vectors") +code. 
doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.") apple = doc[0] banana = doc[2] pasta = doc[6] hippo = doc[8] assert apple.similarity(banana) > pasta.similarity(hippo) assert apple.has_vector and banana.has_vector and pasta.has_vector and hippo.has_vector +infobox | #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity] +h(2, "examples-serialization") Simple and efficient serialization +code. import spacy from spacy.tokens.doc import Doc from spacy.vocab import Vocab nlp = spacy.load('en') moby_dick = open('moby_dick.txt', 'r').read() doc = nlp(moby_dick) doc.to_disk('/moby_dick.bin') new_doc = Doc(Vocab()).from_disk('/moby_dick.bin') +infobox | #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading] +h(2, "rule-matcher") Match text with token rules +code. import spacy from spacy.matcher import Matcher nlp = spacy.load('en') matcher = Matcher(nlp.vocab) # match "Google I/O" or "Google i/o" pattern = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}] matcher.add('GoogleIO', None, pattern) matches = matcher(nlp(LOTS_OF_TEXT)) +infobox | #[strong API:] #[+api("matcher") #[code Matcher]] | #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching] +h(2, "multi-threaded") Multi-threaded generator +code. texts = [u'One document.', u'...', u'Lots of documents'] # .pipe streams input, and produces streaming output iter_texts = (texts[i % 3] for i in xrange(100000000)) for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)): assert doc.is_parsed if i == 100: break +infobox | #[strong API:] #[+api("doc") #[code Doc]] | #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage] +h(2, "examples-dependencies") Get syntactic dependencies +tag-model("dependency parse") +code. 
def dependency_labels_to_root(token): """Walk up the syntactic tree, collecting the arc labels.""" dep_labels = [] while token.head is not token: dep_labels.append(token.dep) token = token.head return dep_labels +infobox | #[strong API:] #[+api("token") #[code Token]] | #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse] +h(2, "examples-numpy-arrays") Export to numpy arrays +code. from spacy.attrs import ORTH, LIKE_URL, IS_OOV attr_ids = [ORTH, LIKE_URL, IS_OOV] doc_array = doc.to_array(attr_ids) assert doc_array.shape == (len(doc), len(attr_ids)) assert doc[0].orth == doc_array[0, 0] assert doc[1].orth == doc_array[1, 0] assert doc[0].like_url == doc_array[0, 1] assert list(doc_array[:, 1]) == [t.like_url for t in doc] +h(2, "examples-inline") Calculate inline markup on original string +code. def put_spans_around_tokens(doc, get_classes): """Given some function to compute class names, put each token in a span element, with the appropriate classes computed. All whitespace is preserved, outside of the spans. (Of course, HTML won't display more than one whitespace character in a row – but the point is, no information is lost and you can calculate what you need, e.g. <br />, <p> etc.) """ output = [] html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: output.append(token.text) else: classes = ' '.join(get_classes(token)) output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') return string