mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			161 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			161 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > LEMMATIZER
 | |
| 
 | |
| include ../_includes/_mixins
 | |
| 
 | |
| p
 | |
|     |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
 | |
|     |  rules and lookup tables.
 | |
| 
 | |
| +h(2, "init") Lemmatizer.__init__
 | |
|     +tag method
 | |
| 
 | |
| p Create a #[code Lemmatizer].
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.lemmatizer import Lemmatizer
 | |
|     lemmatizer = Lemmatizer()
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code index]
 | |
|         +cell dict / #[code None]
 | |
|         +cell Inventory of lemmas in the language.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code exceptions]
 | |
|         +cell dict / #[code None]
 | |
|         +cell Mapping of string forms to lemmas that bypass the #[code rules].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code rules]
 | |
|         +cell dict / #[code None]
 | |
|         +cell List of suffix rewrite rules.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code lookup]
 | |
|         +cell dict / #[code None]
 | |
|         +cell Lookup table mapping strings to their lemmas.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell #[code Lemmatizer]
 | |
|         +cell The newly created object.
 | |
| 
 | |
| +h(2, "call") Lemmatizer.__call__
 | |
|     +tag method
 | |
| 
 | |
| p Lemmatize a string.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.lemmatizer import Lemmatizer
 | |
|     from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
 | |
|     lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
 | |
|     lemmas = lemmatizer(u'ducks', u'NOUN')
 | |
|     assert lemmas == [u'duck']
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to lemmatize, e.g. the token text.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code univ_pos]
 | |
|         +cell unicode / int
 | |
|         +cell The token's universal part-of-speech tag.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code morphology]
 | |
|         +cell dict / #[code None]
 | |
|         +cell
 | |
|             |  Morphological features following the
 | |
|             |  #[+a("http://universaldependencies.org/") Universal Dependencies]
 | |
|             |  scheme.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell list
 | |
|         +cell The available lemmas for the string.
 | |
| 
 | |
| +h(2, "lookup") Lemmatizer.lookup
 | |
|     +tag method
 | |
|     +tag-new(2)
 | |
| 
 | |
| p
 | |
|     |  Look up a lemma in the lookup table, if available. If no lemma is found,
 | |
|     |  the original string is returned. Languages can provide a
 | |
|     |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
 | |
|     |  #[code lemma_lookup] variable, set on the individual #[code Language]
 | |
|     |  class.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     lookup = {u'going': u'go'}
 | |
|     lemmatizer = Lemmatizer(lookup=lookup)
 | |
|     assert lemmatizer.lookup(u'going') == u'go'
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code string]
 | |
|         +cell unicode
 | |
|         +cell The string to look up.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell unicode
 | |
|         +cell The lemma if the string was found, otherwise the original string.
 | |
| 
 | |
| +h(2, "is_base_form") Lemmatizer.is_base_form
 | |
|     +tag method
 | |
| 
 | |
| p
 | |
|     |  Check whether we're dealing with an uninflected paradigm, so we can
 | |
|     |  avoid lemmatization entirely.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     pos = 'verb'
 | |
|     morph = {'VerbForm': 'inf'}
 | |
|     is_base_form = lemmatizer.is_base_form(pos, morph)
 | |
|     assert is_base_form == True
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code univ_pos]
 | |
|         +cell unicode / int
 | |
|         +cell The token's universal part-of-speech tag.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code morphology]
 | |
|         +cell dict
 | |
|         +cell The token's morphological features.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell bool
 | |
|         +cell
 | |
|             |  Whether the token's part-of-speech tag and morphological features
 | |
|             |  describe a base form.
 | |
| 
 | |
| +h(2, "attributes") Attributes
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code index]
 | |
|         +cell dict / #[code None]
 | |
|         +cell Inventory of lemmas in the language.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code exc]
 | |
|         +cell dict / #[code None]
 | |
|         +cell Mapping of string forms to lemmas that bypass the #[code rules].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code rules]
 | |
|         +cell dict / #[code None]
 | |
|         +cell List of suffix rewrite rules.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code lookup_table]
 | |
|             +tag-new(2)
 | |
|         +cell dict / #[code None]
 | |
|         +cell The lemma lookup table, if available.
 |