//- 💫 DOCS > USAGE > LIGHTNING TOUR

include ../../_includes/_mixins

p
    |  The following examples and code snippets give you an overview of spaCy's
    |  functionality and its usage. If you're new to spaCy, make sure to check
    |  out the #[+a("/docs/usage/spacy-101") spaCy 101 guide].

+h(2, "models") Install models and process text

+code(false, "bash").
    python -m spacy download en
    python -m spacy download de

+code.
    import spacy
    nlp = spacy.load('en')
    doc = nlp(u'Hello, world. Here are two sentences.')

    nlp_de = spacy.load('de')
    doc_de = nlp_de(u'Ich bin ein Berliner.')

+infobox
    |  #[strong API:] #[+api("spacy#load") #[code spacy.load()]]
    |  #[strong Usage:] #[+a("/docs/usage/models") Models],
    |  #[+a("/docs/usage/spacy-101") spaCy 101]

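p
    |  Each call to #[code nlp] returns a processed #[code Doc] that you can
    |  iterate over. As a minimal sketch, printing the tokens of the English
    |  doc created above:

+code.
    # a Doc is a sequence of Token objects
    for token in doc:
        print(token.text)
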
+h(2, "examples-tokens-sentences") Get tokens, noun chunks & sentences
    +tag-model("dependency parse")

+code.
    doc = nlp(u"Peach emoji is where it has always been. Peach is the superior "
              u"emoji. It's outranking eggplant 🍑 ")

    assert doc[0].text == u'Peach'
    assert doc[1].text == u'emoji'
    assert doc[-1].text == u'🍑'
    assert doc[17:19].text == u'outranking eggplant'
    # doc.noun_chunks is a generator, so materialise it before indexing
    assert list(doc.noun_chunks)[0].text == u'Peach emoji'

    sentences = list(doc.sents)
    assert len(sentences) == 3
    assert sentences[1].text == u'Peach is the superior emoji.'

+infobox
    |  #[strong API:] #[+api("doc") #[code Doc]], #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/spacy-101") spaCy 101]

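p
    |  Because #[code doc.noun_chunks] and #[code doc.sents] are generators,
    |  you can also loop over them directly instead of materialising a list,
    |  for example:

+code.
    # both yield Span objects lazily
    for chunk in doc.noun_chunks:
        print(chunk.text)
    for sentence in doc.sents:
        print(sentence.text)
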
+h(2, "examples-pos-tags") Get part-of-speech tags and flags
    +tag-model("tagger")

+code.
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
    apple = doc[0]
    assert [apple.pos_, apple.pos] == [u'PROPN', 17049293600679659579]
    assert [apple.tag_, apple.tag] == [u'NNP', 15794550382381185553]
    assert [apple.shape_, apple.shape] == [u'Xxxxx', 16072095006890171862]
    assert apple.is_alpha == True
    assert apple.is_punct == False

    billion = doc[10]
    assert billion.is_digit == False
    assert billion.like_num == True
    assert billion.like_email == False

+infobox
    |  #[strong API:] #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]

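p
    |  The same attributes are available on every token, so a quick way to
    |  inspect a whole sentence is a simple loop, for example:

+code.
    for token in doc:
        print(token.text, token.pos_, token.tag_, token.shape_)
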
+h(2, "examples-hashes") Use hash values for any string

+code.
    doc = nlp(u'I love coffee')
    coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401
    coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'

    assert doc[2].orth == coffee_hash == 3197928453018144401
    assert doc[2].text == coffee_text == u'coffee'

    beer_hash = doc.vocab.strings.add(u'beer') # 3073001599257881079
    beer_text = doc.vocab.strings[beer_hash] # 'beer'

    unicorn_hash = doc.vocab.strings.add(u'🦄 ') # 18234233413267120783
    unicorn_text = doc.vocab.strings[unicorn_hash] # '🦄 '

+infobox
    |  #[strong API:] #[+api("stringstore") #[code StringStore]]
    |  #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]

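p
    |  A token's integer ID and the entry in the #[code StringStore] always
    |  stay in sync, so the round trip holds for every token in a doc:

+code.
    doc = nlp(u'I love coffee')
    for token in doc:
        # token.orth is the hash; looking it up returns the original text
        assert nlp.vocab.strings[token.orth] == token.text
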
+h(2, "examples-entities") Recognise and update named entities
    +tag-model("NER")

+code.
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(u'San Francisco', 0, 13, u'GPE')]

    from spacy.tokens import Span
    doc = nlp(u'Netflix is hiring a new VP of global policy')
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    assert ents == [(0, 7, u'ORG')]

+infobox
    |  #[strong Usage:] #[+a("/docs/usage/entity-recognition") Named entity recognition]

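p
    |  The same annotations are also exposed on each token via the
    |  #[code ent_iob_] and #[code ent_type_] attributes. Assuming the model
    |  labels "San Francisco" as above:

+code.
    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    # token-level view: IOB code plus entity type
    assert (doc[0].ent_iob_, doc[0].ent_type_) == (u'B', u'GPE')
    assert (doc[1].ent_iob_, doc[1].ent_type_) == (u'I', u'GPE')
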
+h(2, "displacy") Visualize a dependency parse and named entities in your browser
    +tag-model("dependency parse", "NER")

+aside
    .u-text-center(style="overflow: auto").
        <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" class="o-svg" viewBox="270 35 125 240" width="400" height="150" style="max-width: none; color: #fff; background: #1a1e23; font-family: inherit; font-size: 2rem">
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="50">This</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="50">DT</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="225">is</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="225">VBZ</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="400">a</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="400">DT</tspan>
            </text>
            <text fill="currentColor" text-anchor="middle" y="222.0">
                <tspan style="font-weight: bold" fill="currentColor" x="575">sentence.</tspan>
                <tspan dy="2em" class="u-color-theme" style="font-weight: bold" fill="currentColor" x="575">NN</tspan>
            </text>
            <path id="arrow-0-0" stroke-width="2px" d="M70,177.0 C70,89.5 220.0,89.5 220.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-0" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
            </text>
            <path d="M70,179.0 L62,167.0 78,167.0" fill="currentColor"/>
            <path id="arrow-0-1" stroke-width="2px" d="M420,177.0 C420,89.5 570.0,89.5 570.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-1" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
            </text>
            <path d="M420,179.0 L412,167.0 428,167.0" fill="currentColor"/>
            <path id="arrow-0-2" stroke-width="2px" d="M245,177.0 C245,2.0 575.0,2.0 575.0,177.0" fill="none" stroke="currentColor"/>
            <text dy="1.25em" style="font-size: 0.9em; letter-spacing: 2px">
                <textPath xlink:href="#arrow-0-2" startOffset="50%" fill="currentColor" text-anchor="middle">attr</textPath>
            </text>
            <path d="M575.0,179.0 L583.0,167.0 567.0,167.0" fill="currentColor"/>
        </svg>

+code.
    from spacy import displacy

    doc_dep = nlp(u'This is a sentence.')
    displacy.serve(doc_dep, style='dep')

    doc_ent = nlp(u'When Sebastian Thrun started working on self-driving cars at Google '
                  u'in 2007, few people outside of the company took him seriously.')
    displacy.serve(doc_ent, style='ent')

+infobox
    |  #[strong API:] #[+api("displacy") #[code displacy]]
    |  #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizers]

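p
    |  If you'd rather get the markup as a string, for example to save it to a
    |  file, #[code displacy.render] returns it instead of starting a server.
    |  A minimal sketch:

+code.
    # page=True wraps the visualization in a full HTML page
    html = displacy.render(doc_ent, style='ent', page=True)
    open('/tmp/entities.html', 'w').write(html)
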
+h(2, "examples-word-vectors") Get word vectors and similarity
    +tag-model("word vectors")

+code.
    doc = nlp(u"Apple and banana are similar. Pasta and hippo aren't.")
    apple = doc[0]
    banana = doc[2]
    pasta = doc[6]
    hippo = doc[8]
    assert apple.similarity(banana) > pasta.similarity(hippo)
    assert all(token.has_vector for token in (apple, banana, pasta, hippo))

+infobox
    |  #[strong Usage:] #[+a("/docs/usage/word-vectors-similarities") Word vectors and similarity]

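p
    |  The underlying vector is exposed as a numpy array on
    |  #[code token.vector], with its L2 norm on #[code token.vector_norm],
    |  for example:

+code.
    apple = doc[0]
    # a 1-d numpy array holding the model's word vector for "Apple"
    assert apple.vector.shape[0] > 0
    assert apple.vector_norm > 0.0
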
+h(2, "examples-serialization") Simple and efficient serialization

+code.
    import spacy
    from spacy.tokens.doc import Doc
    from spacy.vocab import Vocab

    nlp = spacy.load('en')
    moby_dick = open('moby_dick.txt', 'r').read()
    doc = nlp(moby_dick)
    doc.to_disk('/moby_dick.bin')

    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')

+infobox
    |  #[strong API:] #[+api("language") #[code Language]],
    |  #[+api("doc") #[code Doc]]
    |  #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]

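p
    |  The same round trip also works in memory, without touching the file
    |  system, via #[code to_bytes] and #[code from_bytes]. A minimal sketch:

+code.
    doc_bytes = doc.to_bytes()
    new_doc = Doc(Vocab()).from_bytes(doc_bytes)
    assert new_doc.text == doc.text
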
+h(2, "rule-matcher") Match text with token rules

+code.
    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.load('en')
    matcher = Matcher(nlp.vocab)

    def set_sentiment(matcher, doc, i, matches):
        doc.sentiment += 0.1

    pattern1 = [{'ORTH': 'Google'}, {'UPPER': 'I'}, {'ORTH': '/'}, {'UPPER': 'O'}]
    pattern2 = [[{'ORTH': emoji, 'OP': '+'}] for emoji in ['😀', '😂', '🤣', '😍']]
    matcher.add('GoogleIO', None, pattern1) # match "Google I/O" or "Google i/o"
    matcher.add('HAPPY', set_sentiment, *pattern2) # match one or more happy emoji
    doc = nlp(LOTS_OF_TEXT)
    matches = matcher(doc)

+infobox
    |  #[strong API:] #[+api("matcher") #[code Matcher]]
    |  #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]

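p
    |  Each match is a #[code (match_id, start, end)] triple, so the matched
    |  spans can be sliced straight out of the doc, for example:

+code.
    for match_id, start, end in matches:
        span = doc[start:end]
        # resolve the match ID back to its string name, e.g. 'GoogleIO'
        print(nlp.vocab.strings[match_id], span.text)
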
+h(2, "multi-threaded") Multi-threaded generator

+code.
    texts = [u'One document.', u'...', u'Lots of documents']
    # .pipe streams input, and produces streaming output
    iter_texts = (texts[i % 3] for i in xrange(100000000))
    for i, doc in enumerate(nlp.pipe(iter_texts, batch_size=50, n_threads=4)):
        assert doc.is_parsed
        if i == 100:
            break

+infobox
    |  #[strong API:] #[+api("doc") #[code Doc]]
    |  #[strong Usage:] #[+a("/docs/usage/production-usage") Production usage]

+h(2, "examples-dependencies") Get syntactic dependencies
    +tag-model("dependency parse")

+code.
    def dependency_labels_to_root(token):
        """Walk up the syntactic tree, collecting the arc labels."""
        dep_labels = []
        while token.head is not token:
            # .dep_ is the string label; .dep would be the integer ID
            dep_labels.append(token.dep_)
            token = token.head
        return dep_labels

+infobox
    |  #[strong API:] #[+api("token") #[code Token]]
    |  #[strong Usage:] #[+a("/docs/usage/dependency-parse") Using the dependency parse]

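p
    |  For example, applied to the subject of a parsed sentence, the helper
    |  above returns the labels on the path up to the root:

+code.
    doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')
    # if "cars" attaches to the root verb as subject, this should
    # yield [u'nsubj']
    print(dependency_labels_to_root(doc[1]))
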
+h(2, "examples-numpy-arrays") Export to numpy arrays

+code.
    from spacy.attrs import ORTH, LIKE_URL, IS_OOV

    attr_ids = [ORTH, LIKE_URL, IS_OOV]
    doc_array = doc.to_array(attr_ids)
    assert doc_array.shape == (len(doc), len(attr_ids))
    assert doc[0].orth == doc_array[0, 0]
    assert doc[1].orth == doc_array[1, 0]
    assert doc[0].like_url == doc_array[0, 1]
    assert list(doc_array[:, 1]) == [t.like_url for t in doc]

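p
    |  Because the export is a plain numpy array, the usual vectorised
    |  operations apply, for example counting the URL-like tokens:

+code.
    # column 1 holds the LIKE_URL flag as 0/1 values
    num_urls = int(doc_array[:, 1].sum())
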
+h(2, "examples-inline") Calculate inline markup on original string

+code.
    def put_spans_around_tokens(doc, get_classes):
        """Given some function to compute class names, put each token in a
        span element, with the appropriate classes computed. All whitespace is
        preserved, outside of the spans. (Of course, HTML won't display more
        than one whitespace character in a row – but the point is, no
        information is lost and you can calculate what you need, e.g.
        <br />, <p> etc.)
        """
        output = []
        html = '<span class="{classes}">{word}</span>{space}'
        for token in doc:
            if token.is_space:
                output.append(token.text)
            else:
                classes = ' '.join(get_classes(token))
                output.append(html.format(classes=classes, word=token.text,
                                          space=token.whitespace_))
        string = ''.join(output)
        string = string.replace('\n', '')
        string = string.replace('\t', '    ')
        return string

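p
    |  As a usage sketch, with a hypothetical #[code get_classes] that tags
    |  each token with its coarse-grained part-of-speech:

+code.
    def get_classes(token):
        # any function returning an iterable of class names works here
        return ['pos-' + token.pos_]

    doc = nlp(u'This is a test.')
    html = put_spans_around_tokens(doc, get_classes)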