//-  Docs > API > Lexeme
//- ============================================================================

+section('lexeme')
    +h2('lexeme', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/lexeme.pyx#L31')
        | #[+label('tag') class] Lexeme

    p.
        The Lexeme object represents a lexical type, stored in the vocabulary,
        as opposed to a token, which occurs in a document.

    p.
        Each Token object receives a reference to a lexeme object (specifically,
        it receives a pointer to a #[code LexemeC] struct). This allows features
        to be computed and saved once per type, rather than once per token. As
        job sizes grow, this amounts to substantial efficiency improvements, as
        the vocabulary size (number of types) will be much smaller than the total
        number of words processed (number of tokens).

    p.
        All Lexeme attributes are therefore context-independent, as a single lexeme
        is reused for all usages of that word. Lexemes are keyed by the #[code orth]
        attribute.

    p.
        Most Lexeme attributes can be set, with the exception of the primary key,
        #[code orth]. Assigning to an attribute of the #[code Lexeme] object writes
        to the underlying struct, so all tokens that are backed by that
        #[code Lexeme] will inherit the new value.

    +code('python', 'Overview').
        class Lexeme:
            def __init__(self, vocab, key):
                return self

            int rank

            int orth, lower, shape, prefix, suffix

            unicode orth_, lower_, shape_, prefix_, suffix_

            bool is_alpha, is_ascii, is_digit, is_lower, is_title, is_punct, is_space, like_url, like_num, like_email, is_oov, is_stop

            float prob
            int cluster
            numpy.ndarray[float64] vector
            bool has_vector

            def set_flag(self, flag_id, value):
                return None

            def check_flag(self, flag_id):
                return bool

            def similarity(self, other):
                return float

    +table(['Example', 'Description'], 'code')
        +row
            +cell #[code.lang-python lexeme = nlp.vocab[string]]
            +cell Lookup by string
        +row
            +cell #[code.lang-python lexeme = nlp.vocab[i]]
            +cell Lookup by integer
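
    p.
        The snippet below is a minimal usage sketch, assuming an English model is
        installed and loaded as #[code nlp]; the example words and the
        #[code spacy.load] call are illustrative rather than part of this API. It
        looks a lexeme up by string and by integer key, and shows that writing to
        a lexeme attribute is visible on every token backed by that lexeme.

    +code('python', 'Usage example').
        import spacy

        nlp = spacy.load('en')          # assumption: an English model is installed
        doc = nlp(u'I was running and she was running too')

        # Lookup by string, then again by integer key (the orth attribute)
        lexeme = nlp.vocab[u'running']
        same_type = nlp.vocab[lexeme.orth]
        assert same_type.orth_ == lexeme.orth_ == u'running'

        # Writing to a lexeme attribute updates the underlying struct,
        # so every token of that type sees the new value.
        lexeme.is_stop = True
        assert all(token.is_stop for token in doc if token.orth_ == u'running')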

    +section('lexeme-stringfeatures')
        +h3('lexeme-stringfeatures').
            String Features

        +table(['Name', 'Description'], 'params')
            +row
                +cell orth / orth_
                +cell.
                    The form of the word with no string normalization or processing,
                    as it appears in the string, without trailing whitespace.

            +row
                +cell lower / lower_
                +cell.
                    The form of the word, but forced to lower-case, i.e.
                    #[code lower = word.orth_.lower()]

            +row
                +cell shape / shape_
                +cell.
                    A transform of the word's string, to show orthographic features.
                    The characters a-z are mapped to x, A-Z is mapped to X, 0-9
                    is mapped to d. After these mappings, sequences of 4 or more
                    of the same character are truncated to length 4. Examples:
                    C3Po --> XdXx, favorite --> xxxx, :) --> :)

            +row
                +cell prefix / prefix_
                +cell.
                    A length-N substring from the start of the word. Length may
                    vary by language; currently for English N=1, i.e.
                    #[code prefix = word.orth_[:1]]

            +row
                +cell suffix / suffix_
                +cell.
                    A length-N substring from the end of the word. Length may vary
                    by language; currently for English N=3, i.e.
                    #[code suffix = word.orth_[-3:]]
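
        p.
            As a rough illustration of these string features (assuming a loaded
            pipeline #[code nlp]; the word is arbitrary), the values can be read
            straight off a lexeme:

        +code('python', 'String features example').
            lexeme = nlp.vocab[u'Apple123']

            print(lexeme.orth_)      # 'Apple123', the exact string
            print(lexeme.lower_)     # 'apple123', lower-cased form
            print(lexeme.shape_)     # e.g. 'Xxxxxddd': a-z -> x, A-Z -> X, 0-9 -> d
            print(lexeme.prefix_)    # 'A', first character (N=1 for English)
            print(lexeme.suffix_)    # '123', last three characters (N=3 for English)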

    +section('lexeme-booleanflags')
        +h3('lexeme-booleanflags')
            | Boolean Flags

        +table(['Name', 'Description'], 'params')
            +row
                +cell is_alpha
                +cell.
                    Equivalent to #[code word.orth_.isalpha()]

            +row
                +cell is_ascii
                +cell.
                    Equivalent to #[code all(ord(c) < 128 for c in word.orth_)]

            +row
                +cell is_digit
                +cell.
                    Equivalent to #[code word.orth_.isdigit()]

            +row
                +cell is_lower
                +cell.
                    Equivalent to #[code word.orth_.islower()]

            +row
                +cell is_title
                +cell.
                    Equivalent to #[code word.orth_.istitle()]

            +row
                +cell is_punct
                +cell.
                    Is the word punctuation? Roughly equivalent to
                    #[code all(unicodedata.category(c).startswith('P') for c in word.orth_)]

            +row
                +cell is_space
                +cell.
                    Equivalent to #[code word.orth_.isspace()]

            +row
                +cell like_url
                +cell.
                    Does the word resemble a URL?

            +row
                +cell like_num
                +cell.
                    Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc.

            +row
                +cell like_email
                +cell.
                    Does the word resemble an email address?

            +row
                +cell is_oov
                +cell.
                    Is the word out-of-vocabulary?

            +row
                +cell is_stop
                +cell.
                    Is the word part of a "stop list"? Stop lists are used to
                    improve the quality of topic models, by filtering out common,
                    domain-general words.
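
        p.
            A small sketch of reading these flags, assuming a loaded pipeline
            #[code nlp]. The flags can be read as attributes, or through
            #[code check_flag] with an attribute ID such as those in
            #[code spacy.attrs]:

        +code('python', 'Boolean flags example').
            from spacy.attrs import IS_PUNCT

            apple = nlp.vocab[u'apple']
            assert apple.is_alpha and apple.is_lower
            assert not apple.like_num

            ten = nlp.vocab[u'ten']
            assert ten.like_num                 # 'ten' spells out a number

            comma = nlp.vocab[u',']
            assert comma.is_punct
            assert comma.check_flag(IS_PUNCT)   # same flag, looked up by ID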

    +section('lexeme-distributional')
        +h3('lexeme-distributional')
            | Distributional Features

        +table(['Name', 'Description'], 'params')
            +row
                +cell prob
                +cell.
                    The unigram log-probability of the word, estimated from
                    counts from a large corpus, smoothed using Simple Good-Turing
                    estimation.

            +row
                +cell cluster
                +cell.
                    The Brown cluster ID of the word. These are often useful features
                    for linear models. If you’re using a non-linear model, particularly
                    a neural net or random forest, consider using the real-valued
                    word representation vector, in #[code Token.repvec], instead.

            +row
                +cell vector
                +cell.
                    A "word embedding" representation: a dense real-valued vector
                    that supports similarity queries between words. By default,
                    spaCy currently loads vectors produced by the Levy and
                    Goldberg (2014) dependency-based word2vec model.

            +row
                +cell has_vector
                +cell.
                    A boolean value indicating whether a word vector is associated
                    with the word.
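
        p.
            Putting the distributional features together, the sketch below assumes
            a model with word vectors (and with frequency and cluster data, for
            #[code prob] and #[code cluster]) has been loaded as #[code nlp]:

        +code('python', 'Distributional features example').
            apple = nlp.vocab[u'apple']
            orange = nlp.vocab[u'orange']

            if apple.has_vector and orange.has_vector:
                print(apple.vector.shape)         # dense real-valued embedding
                print(apple.similarity(orange))   # similarity between the two types

            print(apple.prob)                     # smoothed unigram log-probability
            print(apple.cluster)                  # Brown cluster ID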