//- 💫 DOCS > USAGE > SPACY 101 > LIGHTNING TOUR

p
    | The following examples and code snippets give you an overview of spaCy's
    | functionality and its usage.

+h(3, "lightning-tour-models") Install models and process text

+code(false, "bash").
    python -m spacy download en_core_web_sm
    python -m spacy download de_core_news_sm

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Hello, world. Here are two sentences.')
    print([t.text for t in doc])

    nlp_de = spacy.load('de_core_news_sm')
    doc_de = nlp_de(u'Ich bin ein Berliner.')
    print([t.text for t in doc_de])

+infobox
    | #[+label-inline API:] #[+api("spacy#load") #[code spacy.load()]]
    | #[+label-inline Usage:] #[+a("/usage/models") Models],
    | #[+a("/usage/spacy-101") spaCy 101]

+h(3, "lightning-tour-tokens-sentences") Get tokens, noun chunks & sentences
    +tag-model("dependency parse")

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
              u"emoji. It's outranking eggplant 🍑 ")
    print(doc[0].text)          # Peach
    print(doc[1].text)          # emoji
    print(doc[-1].text)         # 🍑
    print(doc[17:19].text)      # outranking eggplant

    noun_chunks = list(doc.noun_chunks)
    print(noun_chunks[0].text)  # Peach emoji

    sentences = list(doc.sents)
    assert len(sentences) == 3
    print(sentences[1].text)    # 'Peach is the superior emoji.'

+infobox
    | #[+label-inline API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
    | #[+label-inline Usage:] #[+a("/usage/spacy-101") spaCy 101]

+h(3, "lightning-tour-pos-tags") Get part-of-speech tags and flags
    +tag-model("tagger")

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    apple = doc[0]
    print('Fine-grained POS tag', apple.tag_, apple.tag)
    print('Coarse-grained POS tag', apple.pos_, apple.pos)
    print('Word shape', apple.shape_, apple.shape)
    print('Alphabetic characters?', apple.is_alpha)
    print('Punctuation mark?', apple.is_punct)

    billion = doc[10]
    print('Digit?', billion.is_digit)
    print('Like a number?', billion.like_num)
    print('Like an email address?', billion.like_email)

+infobox
    | #[+label-inline API:] #[+api("token") #[code Token]]
    | #[+label-inline Usage:] #[+a("/usage/linguistic-features#pos-tagging") Part-of-speech tagging]

+h(3, "lightning-tour-hashes") Use hash values for any string

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'I love coffee')

    coffee_hash = nlp.vocab.strings[u'coffee']    # 3197928453018144401
    coffee_text = nlp.vocab.strings[coffee_hash]  # 'coffee'
    print(coffee_hash, coffee_text)
    print(doc[2].orth, coffee_hash)  # 3197928453018144401
    print(doc[2].text, coffee_text)  # 'coffee'

    beer_hash = doc.vocab.strings.add(u'beer')  # 3073001599257881079
    beer_text = doc.vocab.strings[beer_hash]    # 'beer'
    print(beer_hash, beer_text)

    unicorn_hash = doc.vocab.strings.add(u'🦄 ')    # 18234233413267120783
    unicorn_text = doc.vocab.strings[unicorn_hash]  # '🦄 '
    print(unicorn_hash, unicorn_text)
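p
    | The hash is computed from the string itself, so the same string will
    | always map to the same hash, no matter which pipeline or vocabulary
    | you're using. A minimal sketch, assuming the English and German models
    | from the first example are installed:

+code.
    import spacy

    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    # StringStore.add returns the hash, which depends only on the string
    # itself – so two completely separate vocabularies agree on the value
    assert nlp_en.vocab.strings.add(u'coffee') == nlp_de.vocab.strings.add(u'coffee')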
+infobox
    | #[+label-inline API:] #[+api("stringstore") #[code StringStore]]
    | #[+label-inline Usage:] #[+a("/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]

+h(3, "lightning-tour-entities") Recognise and update named entities
    +tag-model("NER")

+code-exec.
    import spacy
    from spacy.tokens import Span

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

    doc = nlp(u'FB is hiring a new VP of global policy')
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

+infobox
    | #[+label-inline Usage:] #[+a("/usage/linguistic-features#named-entities") Named entity recognition]

+h(3, "lightning-tour-training") Train and update neural network models
    +tag-model

+code.
    import spacy
    import random

    nlp = spacy.load('en')
    train_data = [("Uber blew through $1 million", {'entities': [(0, 4, 'ORG')]})]

    with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'ner']):
        optimizer = nlp.begin_training()
        for i in range(10):
            random.shuffle(train_data)
            for text, annotations in train_data:
                nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk('/model')

+infobox
    | #[+label-inline API:] #[+api("language#update") #[code Language.update]]
    | #[+label-inline Usage:] #[+a("/usage/training") Training spaCy's statistical models]

+h(3, "lightning-tour-displacy") Visualize a dependency parse and named entities in your browser
    +tag-model("dependency parse", "NER")
    +tag-new(2)

+aside
    .u-text-center(style="overflow: auto")
        //- rendered displaCy visualization of the dependency parse of
        //- "This is a sentence." (This/DT is/VBZ a/DT sentence/NN; arcs: nsubj, det, attr)

+code.
    import spacy
    from spacy import displacy

    nlp = spacy.load('en_core_web_sm')

    doc_dep = nlp(u'This is a sentence.')
    displacy.serve(doc_dep, style='dep')

    doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
                  u'in 2007, few people outside of the company took him seriously.')
    displacy.serve(doc_ent, style='ent')

+infobox
    | #[+label-inline API:] #[+api("top-level#displacy") #[code displacy]]
    | #[+label-inline Usage:] #[+a("/usage/visualizers") Visualizers]

+h(3, "lightning-tour-word-vectors") Get word vectors and similarity
    +tag-model("word vectors")

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_md')
    doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")

    apple = doc[0]
    banana = doc[2]
    pasta = doc[6]
    hippo = doc[8]

    print('apple <-> banana', apple.similarity(banana))
    print('pasta <-> hippo', pasta.similarity(hippo))
    print(apple.has_vector, banana.has_vector, pasta.has_vector, hippo.has_vector)

p
    | For the best results, you should run this example using the
    | #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model
    | (currently not available in the live demo).

+infobox
    | #[+label-inline Usage:] #[+a("/usage/vectors-similarity") Word vectors and similarity]

+h(3, "lightning-tour-serialization") Simple and efficient serialization

+code.
    import spacy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    nlp = spacy.load('en')
    customer_feedback = open('customer_feedback_627.txt').read()
    doc = nlp(customer_feedback)
    doc.to_disk('/tmp/customer_feedback_627.bin')

    new_doc = Doc(Vocab()).from_disk('/tmp/customer_feedback_627.bin')
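p
    | If you don't want to go through the file system, you can serialize to a
    | byte string instead and load it back into an empty #[code Doc]. A minimal
    | sketch of the same round trip, using byte serialization:

+code.
    import spacy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Hello, world. Here are two sentences.')

    # serialize the processed Doc to a byte string instead of writing it to disk
    doc_bytes = doc.to_bytes()
    new_doc = Doc(Vocab()).from_bytes(doc_bytes)
    assert new_doc.text == doc.text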
+infobox
    | #[+label-inline API:] #[+api("language") #[code Language]],
    | #[+api("doc") #[code Doc]]
    | #[+label-inline Usage:] #[+a("/usage/models#saving-loading") Saving and loading models]

+h(3, "lightning-tour-rule-matcher") Match text with token rules

+code-exec.
    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)

    def set_sentiment(matcher, doc, i, matches):
        doc.sentiment += 0.1

    pattern1 = [{'ORTH': 'Google'}, {'ORTH': 'I'}, {'ORTH': '/'}, {'ORTH': 'O'}]
    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
    matcher.add('GoogleIO', None, pattern1)  # match "Google I/O"
    matcher.add('HAPPY', set_sentiment, *pattern2)  # match one or more happy emoji

    doc = nlp(u"A text about Google I/O 😀😀")
    matches = matcher(doc)

    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]
        span = doc[start:end]
        print(string_id, span.text)
    print('Sentiment', doc.sentiment)

+infobox
    | #[+label-inline API:] #[+api("matcher") #[code Matcher]]
    | #[+label-inline Usage:] #[+a("/usage/linguistic-features#rule-based-matching") Rule-based matching]

+h(3, "lightning-tour-multi-threaded") Multi-threaded generator

+code.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    texts = [u'One document.', u'...', u'Lots of documents']
    # .pipe streams input, and produces streaming output
    iter_texts = (texts[i % 3] for i in range(100000000))
    for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
        assert doc.is_parsed
        if i == 100:
            break

+infobox
    | #[+label-inline API:] #[+api("doc") #[code Doc]]
    | #[+label-inline Usage:] #[+a("/usage/processing-pipelines#multithreading") Processing pipelines]

+h(3, "lightning-tour-dependencies") Get syntactic dependencies
    +tag-model("dependency parse")

+code-exec.
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"When Sebastian Thrun started working on self-driving cars at Google "
              u"in 2007, few people outside of the company took him seriously.")

    dep_labels = []
    for token in doc:
        while token.head != token:
            dep_labels.append(token.dep_)
            token = token.head
    print(dep_labels)

+infobox
    | #[+label-inline API:] #[+api("token") #[code Token]]
    | #[+label-inline Usage:] #[+a("/usage/linguistic-features#dependency-parse") Using the dependency parse]

+h(3, "lightning-tour-numpy-arrays") Export to numpy arrays

+code-exec.
    import spacy
    from spacy.attrs import ORTH, LIKE_URL

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Check out https://spacy.io")
    for token in doc:
        print(token.text, token.orth, token.like_url)

    attr_ids = [ORTH, LIKE_URL]
    doc_array = doc.to_array(attr_ids)
    print(doc_array.shape)
    print(len(doc), len(attr_ids))

    assert doc[0].orth == doc_array[0, 0]
    assert doc[1].orth == doc_array[1, 0]
    assert doc[0].like_url == doc_array[0, 1]

    assert list(doc_array[:, 1]) == [t.like_url for t in doc]
    print(list(doc_array[:, 1]))
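p
    | The array only stores the attribute values, not the token texts – but if
    | you keep the words around, you can write exported annotations back to a
    | new #[code Doc]. A minimal sketch using the fine-grained tags, assuming
    | an installed English model:

+code.
    import spacy
    from spacy.tokens import Doc
    from spacy.attrs import TAG

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Check out https://spacy.io")
    tag_array = doc.to_array([TAG])  # one column with the fine-grained tags

    # create a new Doc with the same words but no annotations, then restore the tags
    doc2 = Doc(doc.vocab, words=[t.text for t in doc])
    doc2.from_array([TAG], tag_array)
    assert [t.tag_ for t in doc2] == [t.tag_ for t in doc]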
""" output = [] html = '<span class="{classes}">{word}</span>{space}' for token in doc: if token.is_space: output.append(token.text) else: classes = 'pos-{} dep-{}'.format(token.pos_, token.dep_) output.append(html.format(classes=classes, word=token.text, space=token.whitespace_)) string = ''.join(output) string = string.replace('\n', '') string = string.replace('\t', ' ') return '<pre>{}</pre>'.format(string) nlp = spacy.load('en_core_web_sm') doc = nlp(u"This is a test.\n\nHello world.") html = put_spans_around_tokens(doc) print(html)