//- 💫 DOCS > USAGE > SPACY 101 > POS TAGGING AND DEPENDENCY PARSING

p
    |  After tokenization, spaCy can also #[strong parse] and #[strong tag] a
    |  given #[code Doc]. This is where the statistical model comes in, which
    |  enables spaCy to #[strong make a prediction] of which tag or label most
    |  likely applies in this context. A model consists of binary data and is
    |  produced by showing a system enough examples for it to make predictions
    |  that generalise across the language – for example, a word following "the"
    |  in English is most likely a noun.

p
    |  Linguistic annotations are available as
    |  #[+api("token#attributes") #[code Token] attributes]. Like many NLP
    |  libraries, spaCy #[strong encodes all strings to hash values] to reduce
    |  memory usage and improve efficiency. So to get the readable string
    |  representation of an attribute, we need to add an underscore #[code _]
    |  to its name:

+code.
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
              token.shape_, token.is_alpha, token.is_stop)

+aside
    |  #[strong Text:] The original word text.#[br]
    |  #[strong Lemma:] The base form of the word.#[br]
    |  #[strong POS:] The simple part-of-speech tag.#[br]
    |  #[strong Tag:] The detailed part-of-speech tag.#[br]
    |  #[strong Dep:] Syntactic dependency, i.e. the relation between tokens.#[br]
    |  #[strong Shape:] The word shape – capitalisation, punctuation, digits.#[br]
    |  #[strong is alpha:] Is the token an alpha character?#[br]
    |  #[strong is stop:] Is the token part of a stop list, i.e. the most common
    |  words of the language?#[br]

+table(["Text", "Lemma", "POS", "Tag", "Dep", "Shape", "alpha", "stop"])
    - var style = [0, 0, 1, 1, 1, 1, 1, 1]
    +annotation-row(["Apple", "apple", "PROPN", "NNP", "nsubj", "Xxxxx", true, false], style)
    +annotation-row(["is", "be", "VERB", "VBZ", "aux", "xx", true, true], style)
    +annotation-row(["looking", "look", "VERB", "VBG", "ROOT", "xxxx", true, false], style)
    +annotation-row(["at", "at", "ADP", "IN", "prep", "xx", true, true], style)
    +annotation-row(["buying", "buy", "VERB", "VBG", "pcomp", "xxxx", true, false], style)
    +annotation-row(["U.K.", "u.k.", "PROPN", "NNP", "compound", "X.X.", false, false], style)
    +annotation-row(["startup", "startup", "NOUN", "NN", "dobj", "xxxx", true, false], style)
    +annotation-row(["for", "for", "ADP", "IN", "prep", "xxx", true, true], style)
    +annotation-row(["$", "$", "SYM", "$", "quantmod", "$", false, false], style)
    +annotation-row(["1", "1", "NUM", "CD", "compound", "d", false, false], style)
    +annotation-row(["billion", "billion", "NUM", "CD", "pobj", "xxxx", true, false], style)

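p
    |  To see how the hash values relate to their readable string
    |  representations, compare an attribute with and without the underscore
    |  suffix – a small sketch using the #[code doc] created above:

+code.
    apple = doc[0]
    print(apple.dep)                     # integer value – how the label is stored internally
    print(apple.dep_)                    # 'nsubj' – the readable string, via the underscore
    print(doc.vocab.strings[apple.dep])  # look the value up in the string store: 'nsubj'
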
| +aside("Tip: Understanding tags and labels")
 | ||
|     |  Most of the tags and labels look pretty abstract, and they vary between
 | ||
|     |  languages. #[code spacy.explain()] will show you a short description –
 | ||
|     |  for example, #[code spacy.explain("VBZ")] returns "verb, 3rd person
 | ||
|     |  singular present".
 | ||
| 
 | ||
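p
    |  For example, looking up a couple of the tags and labels from the table
    |  above – a quick sketch, assuming #[code spacy] has been imported as in
    |  the snippet above:

+code.
    spacy.explain('VBZ')    # 'verb, 3rd person singular present'
    spacy.explain('nsubj')  # 'nominal subject'
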
p
    |  Using spaCy's built-in #[+a("/docs/usage/visualizers") displaCy visualizer],
    |  here's what our example sentence and its dependencies look like:

+codepen("030d1e4dfa6256cad8fdd59e6aefecbe", 460)
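
p
    |  The same visualisation can also be generated locally – a minimal sketch,
    |  assuming the built-in #[code displacy] module and the #[code doc]
    |  created above:

+code.
    from spacy import displacy

    # starts a local web server and renders the dependency parse in the browser
    displacy.serve(doc, style='dep')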