mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	* Add Word classes
This commit is contained in:
		
							parent
							
								
									3b793cf4f7
								
							
						
					
					
						commit
						ce59526011
					
				
							
								
								
									
										58
									
								
								spacy/word.pxd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								spacy/word.pxd
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,58 @@ | ||||||
|  | from libc.stdint cimport uint32_t | ||||||
|  | from libc.stdint cimport uint64_t | ||||||
|  | 
 | ||||||
|  | ctypedef int ClusterID | ||||||
|  | ctypedef uint32_t StringHash | ||||||
|  | ctypedef size_t LexID | ||||||
|  | ctypedef char OrthFlags | ||||||
|  | ctypedef char DistFlags | ||||||
|  | ctypedef uint64_t TagFlags | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef enum OrthFlag: | ||||||
|  |     IS_ALPHA | ||||||
|  |     IS_DIGIT | ||||||
|  |     IS_PUNCT | ||||||
|  |     IS_SPACE | ||||||
|  |     IS_LOWER | ||||||
|  |     IS_UPPER | ||||||
|  |     IS_TITLE | ||||||
|  |     IS_ASCII | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef enum: | ||||||
|  |     NORM | ||||||
|  |     SHAPE | ||||||
|  |     LAST3 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef class Word: | ||||||
|  |     # NB: the readonly keyword refers to _Python_ access. The attributes are | ||||||
|  |     # writeable from Cython. | ||||||
|  |     cdef readonly StringHash lex | ||||||
|  |     cdef readonly char* string | ||||||
|  |     cdef readonly size_t length | ||||||
|  |     cdef readonly double prob | ||||||
|  |     cdef readonly ClusterID cluster | ||||||
|  |     cdef readonly TagFlags possible_tags | ||||||
|  |     cdef readonly DistFlags dist_flags | ||||||
|  |     cdef readonly OrthFlags orth_flags | ||||||
|  |     cdef StringHash* string_views | ||||||
|  | 
 | ||||||
|  |     cpdef StringHash get_view(self, size_t i) except 0 | ||||||
|  | 
 | ||||||
|  |     cpdef bint can_tag(self, TagFlags flag) except * | ||||||
|  |     cpdef bint check_dist_flag(self, DistFlags flag) except * | ||||||
|  |     cpdef bint check_orth_flag(self, OrthFlags flag) except * | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_often_titled(self) except * | ||||||
|  |     cpdef bint is_often_uppered(self) except * | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_alpha(self) except * | ||||||
|  |     cpdef bint is_digit(self) except * | ||||||
|  |     cpdef bint is_punct(self) except * | ||||||
|  |     cpdef bint is_space(self) except * | ||||||
|  |     cpdef bint is_lower(self) except * | ||||||
|  |     cpdef bint is_upper(self) except * | ||||||
|  |     cpdef bint is_title(self) except * | ||||||
|  |     cpdef bint is_ascii(self) except * | ||||||
							
								
								
									
										239
									
								
								spacy/word.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										239
									
								
								spacy/word.pyx
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,239 @@ | ||||||
|  | # cython: profile=True | ||||||
|  | # cython: embedsignature=True | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | from libc.stdlib cimport calloc, free | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | # Python-visible enum for POS tags | ||||||
|  | PUNCT = 0 | ||||||
|  | CONJ = 1 | ||||||
|  | NUM = 2 | ||||||
|  | X = 3 | ||||||
|  | DET = 4 | ||||||
|  | ADP = 5 | ||||||
|  | ADJ = 6 | ||||||
|  | ADV = 7 | ||||||
|  | VERB = 8 | ||||||
|  | NOUN = 9 | ||||||
|  | PDT = 10 | ||||||
|  | POS = 11 | ||||||
|  | PRON = 12 | ||||||
|  | PRT = 13 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | DEF OFT_UPPER = 1 | ||||||
|  | DEF OFT_TITLE = 2 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef class Word: | ||||||
|  |     """A lexical type. | ||||||
|  | 
 | ||||||
|  |     Attributes: | ||||||
|  |         string (bytes): | ||||||
|  |             A utf8-encoded byte-string for the word. | ||||||
|  |          | ||||||
|  |         lex (StringHash): | ||||||
|  |             A hash of the word. | ||||||
|  |         length (size_t): | ||||||
|  |             The (unicode) length of the word. | ||||||
|  |          | ||||||
|  |         prob (double): | ||||||
|  |             An estimate of the word's unigram log probability. | ||||||
|  | 
 | ||||||
|  |             Probabilities are calculated from a large text corpus, and smoothed using | ||||||
|  |             simple Good-Turing.  Estimates are read from data/en/probabilities, and | ||||||
|  |             can be replaced using spacy.en.load_probabilities. | ||||||
|  |          | ||||||
|  |         cluster (int): | ||||||
|  |             An integer representation of the word's Brown cluster. | ||||||
|  | 
 | ||||||
|  |             A Brown cluster is an address into a binary tree, which gives some (noisy) | ||||||
|  |             information about the word's distributional context. | ||||||
|  |      | ||||||
|  |             >>> strings = (u'pineapple', u'apple', u'dapple', u'scalable') | ||||||
|  |             >>> print ["{0:b"} % lookup(s).cluster for s in strings] | ||||||
|  |             ["100111110110", "100111100100", "01010111011001", "100111110110"] | ||||||
|  | 
 | ||||||
|  |             The clusterings are unideal, but often slightly useful. | ||||||
|  |             "pineapple" and "apple" share a long prefix, indicating a similar meaning, | ||||||
|  |             while "dapple" is totally different. On the other hand, "scalable" receives | ||||||
|  |             the same cluster ID as "pineapple", which is not what we'd like. | ||||||
|  |     """ | ||||||
|  |     def __cinit__(self, bytes string, list string_views): | ||||||
|  |         self.string = <char*>string | ||||||
|  |         self.length = len(string) | ||||||
|  |         self.lex = hash(string) | ||||||
|  |         self.string_views = <StringHash*>calloc(len(string_views), sizeof(StringHash)) | ||||||
|  |         cdef unicode view | ||||||
|  |         for i in range(len(string_views)): | ||||||
|  |             view = string_views[i] | ||||||
|  |             self.string_views[i] = hash(view) | ||||||
|  | 
 | ||||||
|  |     def __dealloc__(self): | ||||||
|  |         free(self.string_views) | ||||||
|  | 
 | ||||||
|  |     cpdef StringHash get_view(self, size_t i) except 0: | ||||||
|  |         return self.string_views[i] | ||||||
|  | 
 | ||||||
|  |     cpdef bint check_orth_flag(self, OrthFlags flag) except *: | ||||||
|  |         """Access the value of one of the pre-computed boolean orthographic features. | ||||||
|  | 
 | ||||||
|  |         Meanings depend on the language-specific orthographic features being loaded. | ||||||
|  |         The suggested features for latin-alphabet languages are: TODO | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & (1 << flag) | ||||||
|  | 
 | ||||||
|  |     cpdef bint check_dist_flag(self, DistFlags flag) except *: | ||||||
|  |         """Access the value of one of the pre-computed boolean distribution features. | ||||||
|  | 
 | ||||||
|  |         Meanings depend on the language-specific distributional features being loaded. | ||||||
|  |         The suggested features for latin-alphabet languages are: TODO | ||||||
|  |         """ | ||||||
|  |   | ||||||
|  |         return self.dist_flags & (1 << flag) | ||||||
|  | 
 | ||||||
|  |     cpdef bint can_tag(self, TagFlags flag) except *: | ||||||
|  |         """Check whether the word often receives a particular tag in a large text | ||||||
|  |         corpus. "Often" is chosen by heuristic. | ||||||
|  |         """ | ||||||
|  |         return self.possible_tags & (1 << flag) | ||||||
|  |      | ||||||
|  |     cpdef bint is_often_uppered(self) except *: | ||||||
|  |         '''Check the OFT_UPPER distributional flag for the word. | ||||||
|  |      | ||||||
|  |         The OFT_UPPER flag records whether a lower-cased version of the word | ||||||
|  |         is found in all-upper case frequently in a large sample of text, where | ||||||
|  |         "frequently" is defined as P >= 0.95 (chosen for high mutual information for | ||||||
|  |         POS tagging). | ||||||
|  |      | ||||||
|  |         Case statistics are estimated from a large text corpus. Estimates are read | ||||||
|  |         from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. | ||||||
|  |      | ||||||
|  |         >>> is_often_uppered(lookup(u'nato')) | ||||||
|  |         True | ||||||
|  |         >>> is_often_uppered(lookup(u'the'))  | ||||||
|  |         False | ||||||
|  |         ''' | ||||||
|  |         return self.dist_flags & (1 << OFT_UPPER) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_often_titled(self) except *: | ||||||
|  |         '''Check the OFT_TITLE distributional flag for the word. | ||||||
|  |      | ||||||
|  |         The OFT_TITLE flag records whether a lower-cased version of the word | ||||||
|  |         is found title-cased (see string.istitle) frequently in a large sample of text, | ||||||
|  |         where "frequently" is defined as P >= 0.3 (chosen for high mutual information for | ||||||
|  |         POS tagging). | ||||||
|  |      | ||||||
|  |         Case statistics are estimated from a large text corpus. Estimates are read | ||||||
|  |         from data/en/case_stats, and can be replaced using spacy.en.load_case_stats. | ||||||
|  |      | ||||||
|  |         >>> is_oft_upper(lookup(u'john')) | ||||||
|  |         True | ||||||
|  |         >>> is_oft_upper(lookup(u'Bill'))  | ||||||
|  |         False | ||||||
|  |         ''' | ||||||
|  |         return self.dist_flags & (1 << OFT_TITLE) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_alpha(self) except *: | ||||||
|  |         """Check whether all characters in the word's string are alphabetic. | ||||||
|  |          | ||||||
|  |         Should match the :py:func:`unicode.isalpha()` function. | ||||||
|  | 
 | ||||||
|  |         >>> is_alpha(lookup(u'Hello')) | ||||||
|  |         True | ||||||
|  |         >>> is_alpha(lookup(u'العرب')) | ||||||
|  |         True | ||||||
|  |         >>> is_alpha(lookup(u'10')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_ALPHA | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_digit(self) except *: | ||||||
|  |         """Check whether all characters in the word's string are numeric. | ||||||
|  |      | ||||||
|  |         Should match the :py:func:`unicode.isdigit()` function. | ||||||
|  | 
 | ||||||
|  |         >>> is_digit(lookup(u'10')) | ||||||
|  |         True | ||||||
|  |         >>> is_digit(lookup(u'๐')) | ||||||
|  |         True | ||||||
|  |         >>> is_digit(lookup(u'one')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_DIGIT | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_punct(self) except *: | ||||||
|  |         """Check whether all characters belong to a punctuation unicode data category | ||||||
|  |         for a Lexeme ID. | ||||||
|  | 
 | ||||||
|  |         >>> is_punct(lookup(u'.')) | ||||||
|  |         True | ||||||
|  |         >>> is_punct(lookup(u'⁒')) | ||||||
|  |         True | ||||||
|  |         >>> is_punct(lookup(u' ')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_PUNCT | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_space(self) except *: | ||||||
|  |         """Give the result of unicode.isspace() for a Lexeme ID. | ||||||
|  | 
 | ||||||
|  |         >>> is_space(lookup(u'\\t')) | ||||||
|  |         True | ||||||
|  |         >>> is_space(lookup(u'<unicode space>')) | ||||||
|  |         True | ||||||
|  |         >>> is_space(lookup(u'Hi\\n')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_SPACE | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_lower(self) except *: | ||||||
|  |         """Give the result of unicode.islower() for a Lexeme ID. | ||||||
|  | 
 | ||||||
|  |         >>> is_lower(lookup(u'hi')) | ||||||
|  |         True | ||||||
|  |         >>> is_lower(lookup(<unicode>)) | ||||||
|  |         True | ||||||
|  |         >>> is_lower(lookup(u'10')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_LOWER | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_upper(self) except *: | ||||||
|  |         """Give the result of unicode.isupper() for a Lexeme ID. | ||||||
|  | 
 | ||||||
|  |         >>> is_upper(lookup(u'HI')) | ||||||
|  |         True | ||||||
|  |         >>> is_upper(lookup(u'H10')) | ||||||
|  |         True | ||||||
|  |         >>> is_upper(lookup(u'10')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_UPPER | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_title(self) except *: | ||||||
|  |         """Give the result of unicode.istitle() for a Lexeme ID. | ||||||
|  | 
 | ||||||
|  |         >>> is_title(lookup(u'Hi')) | ||||||
|  |         True | ||||||
|  |         >>> is_title(lookup(u'Hi1')) | ||||||
|  |         True | ||||||
|  |         >>> is_title(lookup(u'1')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_TITLE | ||||||
|  | 
 | ||||||
|  |     cpdef bint is_ascii(self) except *: | ||||||
|  |         """Give the result of checking whether all characters in the string are ascii. | ||||||
|  | 
 | ||||||
|  |         >>> is_ascii(lookup(u'Hi')) | ||||||
|  |         True | ||||||
|  |         >>> is_ascii(lookup(u' ')) | ||||||
|  |         True | ||||||
|  |         >>> is_title(lookup(u'<unicode>')) | ||||||
|  |         False | ||||||
|  |         """ | ||||||
|  |         return self.orth_flags & 1 << IS_ASCII | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user