mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-22 19:54:18 +03:00 
			
		
		
		
	* Fixed group_by, removed idea of general attr_of function.
This commit is contained in:
		
							parent
							
								
									811b7a6b91
								
							
						
					
					
						commit
						07ecf5d2f4
					
				
							
								
								
									
										20
									
								
								spacy/en.pxd
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								spacy/en.pxd
									
									
									
									
									
								
							|  | @ -10,30 +10,14 @@ from spacy.tokens cimport Tokens | |||
| cimport cython | ||||
| 
 | ||||
| 
 | ||||
| ctypedef fused AttrType: | ||||
|     ClusterID | ||||
|     StringHash | ||||
|     cython.char | ||||
| 
 | ||||
| 
 | ||||
| cdef enum AttrName: | ||||
|     LEX | ||||
|     FIRST | ||||
|     LENGTH | ||||
|     CLUSTER | ||||
|     NORM | ||||
|     SHAPE | ||||
|     LAST3 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef class English(spacy.Language): | ||||
|     cdef int find_split(self, unicode word) | ||||
|     cdef int set_orth(self, unicode word, Lexeme* lex) except -1 | ||||
|     cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except * | ||||
| 
 | ||||
| 
 | ||||
| cdef English EN | ||||
| 
 | ||||
| 
 | ||||
| cpdef LexID lookup(unicode word) except 0 | ||||
| cpdef Tokens tokenize(unicode string) | ||||
| cpdef unicode unhash(StringHash hash_value) | ||||
|  |  | |||
							
								
								
									
										21
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								spacy/en.pyx
									
									
									
									
									
								
							|  | @ -76,27 +76,6 @@ cdef class English(spacy.Language): | |||
|                 i += 1 | ||||
|         return i | ||||
| 
 | ||||
|     cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *: | ||||
|         cdef Lexeme* w = <Lexeme*>lex_id | ||||
|         if attr == LEX: | ||||
|             return <AttrType>w.lex | ||||
|         elif attr == FIRST: | ||||
|             return w.string[0] | ||||
|         elif attr == LENGTH: | ||||
|             return w.length | ||||
|         elif attr == CLUSTER: | ||||
|             return w.cluster | ||||
|         elif attr == NORM: | ||||
|             return w.string_views[0] | ||||
|         elif attr == SHAPE: | ||||
|             return w.string_views[1] | ||||
|         elif attr == LAST3: | ||||
|             return w.string_views[2] | ||||
|         else: | ||||
|             raise AttributeError(attr) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef bint check_punct(unicode word, size_t i, size_t length): | ||||
|     # Don't count appostrophes as punct if the next char is a letter | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from libcpp.vector cimport vector | ||||
| from spacy.spacy cimport Lexeme_addr | ||||
| from spacy.lexeme cimport LexID | ||||
| from spacy.lexeme cimport Lexeme | ||||
| 
 | ||||
| from cython.operator cimport dereference as deref | ||||
|  | @ -8,10 +8,10 @@ from spacy.spacy cimport Language | |||
| 
 | ||||
| cdef class Tokens: | ||||
|     cdef Language lang | ||||
|     cdef vector[Lexeme_addr]* vctr | ||||
|     cdef vector[LexID]* vctr | ||||
|     cdef size_t length | ||||
|      | ||||
|     cpdef int append(self, Lexeme_addr token) | ||||
|     cpdef int append(self, LexID token) | ||||
|     cpdef int extend(self, Tokens other) except -1 | ||||
|      | ||||
|     cpdef object group_by(self, size_t attr) | ||||
|  |  | |||
|  | @ -9,14 +9,14 @@ from spacy.spacy cimport StringHash | |||
| cdef class Tokens: | ||||
|     def __cinit__(self, Language lang): | ||||
|         self.lang = lang | ||||
|         self.vctr = new vector[Lexeme_addr]() | ||||
|         self.vctr = new vector[LexID]() | ||||
|         self.length = 0 | ||||
| 
 | ||||
|     def __dealloc__(self): | ||||
|         del self.vctr | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin() | ||||
|         cdef vector[LexID].iterator it = self.vctr[0].begin() | ||||
|         while it != self.vctr[0].end(): | ||||
|             yield deref(it) | ||||
|             inc(it) | ||||
|  | @ -27,16 +27,16 @@ cdef class Tokens: | |||
|     def __len__(self): | ||||
|         return self.length | ||||
| 
 | ||||
|     cpdef int append(self, Lexeme_addr token): | ||||
|     cpdef int append(self, LexID token): | ||||
|         self.vctr[0].push_back(token) | ||||
|         self.length += 1 | ||||
| 
 | ||||
|     cpdef int extend(self, Tokens other) except -1: | ||||
|         cdef Lexeme_addr el | ||||
|         cdef LexID el | ||||
|         for el in other: | ||||
|             self.append(el) | ||||
| 
 | ||||
|     cpdef object group_by(self, size_t attr): | ||||
|     cpdef object group_by(self, size_t view_idx): | ||||
|         '''Group tokens that share the property attr into Tokens instances, and | ||||
|         return a list of them. Returns a tuple of three lists: | ||||
|          | ||||
|  | @ -63,9 +63,12 @@ cdef class Tokens: | |||
|         cdef list hashes = [] | ||||
| 
 | ||||
|         cdef StringHash key | ||||
|         cdef Lexeme_addr t | ||||
|         cdef LexID t | ||||
|         for t in self.vctr[0]: | ||||
|             key = self.lang.attr_of(t, attr) | ||||
|             if view_idx == 0: | ||||
|                 key = (<Lexeme*>t).lex | ||||
|             else: | ||||
|                 key = (<Lexeme*>t).string_views[view_idx - 1] | ||||
|             if key in indices: | ||||
|                 groups[indices[key]].append(t) | ||||
|             else: | ||||
|  | @ -78,7 +81,7 @@ cdef class Tokens: | |||
| 
 | ||||
|     cpdef dict count_by(self, size_t attr): | ||||
|         counts = {} | ||||
|         cdef Lexeme_addr t | ||||
|         cdef LexID t | ||||
|         cdef StringHash key | ||||
|         for t in self.vctr[0]: | ||||
|             #key = attr_of(t, attr) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user