mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Fixed group_by, removed idea of general attr_of function.
This commit is contained in:
		
							parent
							
								
									811b7a6b91
								
							
						
					
					
						commit
						07ecf5d2f4
					
				
							
								
								
									
										20
									
								
								spacy/en.pxd
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								spacy/en.pxd
									
									
									
									
									
								
							|  | @ -10,30 +10,14 @@ from spacy.tokens cimport Tokens | ||||||
| cimport cython | cimport cython | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ctypedef fused AttrType: |  | ||||||
|     ClusterID |  | ||||||
|     StringHash |  | ||||||
|     cython.char |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef enum AttrName: |  | ||||||
|     LEX |  | ||||||
|     FIRST |  | ||||||
|     LENGTH |  | ||||||
|     CLUSTER |  | ||||||
|     NORM |  | ||||||
|     SHAPE |  | ||||||
|     LAST3 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class English(spacy.Language): | cdef class English(spacy.Language): | ||||||
|     cdef int find_split(self, unicode word) |     cdef int find_split(self, unicode word) | ||||||
|     cdef int set_orth(self, unicode word, Lexeme* lex) except -1 |     cdef int set_orth(self, unicode word, Lexeme* lex) except -1 | ||||||
|     cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except * | 
 | ||||||
| 
 | 
 | ||||||
| cdef English EN | cdef English EN | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| cpdef LexID lookup(unicode word) except 0 | cpdef LexID lookup(unicode word) except 0 | ||||||
| cpdef Tokens tokenize(unicode string) | cpdef Tokens tokenize(unicode string) | ||||||
| cpdef unicode unhash(StringHash hash_value) | cpdef unicode unhash(StringHash hash_value) | ||||||
|  |  | ||||||
							
								
								
									
										21
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								spacy/en.pyx
									
									
									
									
									
								
							|  | @ -76,27 +76,6 @@ cdef class English(spacy.Language): | ||||||
|                 i += 1 |                 i += 1 | ||||||
|         return i |         return i | ||||||
| 
 | 
 | ||||||
|     cdef AttrType attr_of(self, LexID lex_id, AttrName attr) except *: |  | ||||||
|         cdef Lexeme* w = <Lexeme*>lex_id |  | ||||||
|         if attr == LEX: |  | ||||||
|             return <AttrType>w.lex |  | ||||||
|         elif attr == FIRST: |  | ||||||
|             return w.string[0] |  | ||||||
|         elif attr == LENGTH: |  | ||||||
|             return w.length |  | ||||||
|         elif attr == CLUSTER: |  | ||||||
|             return w.cluster |  | ||||||
|         elif attr == NORM: |  | ||||||
|             return w.string_views[0] |  | ||||||
|         elif attr == SHAPE: |  | ||||||
|             return w.string_views[1] |  | ||||||
|         elif attr == LAST3: |  | ||||||
|             return w.string_views[2] |  | ||||||
|         else: |  | ||||||
|             raise AttributeError(attr) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| cdef bint check_punct(unicode word, size_t i, size_t length): | cdef bint check_punct(unicode word, size_t i, size_t length): | ||||||
|     # Don't count appostrophes as punct if the next char is a letter |     # Don't count appostrophes as punct if the next char is a letter | ||||||
|  |  | ||||||
|  | @ -1,5 +1,5 @@ | ||||||
| from libcpp.vector cimport vector | from libcpp.vector cimport vector | ||||||
| from spacy.spacy cimport Lexeme_addr | from spacy.lexeme cimport LexID | ||||||
| from spacy.lexeme cimport Lexeme | from spacy.lexeme cimport Lexeme | ||||||
| 
 | 
 | ||||||
| from cython.operator cimport dereference as deref | from cython.operator cimport dereference as deref | ||||||
|  | @ -8,10 +8,10 @@ from spacy.spacy cimport Language | ||||||
| 
 | 
 | ||||||
| cdef class Tokens: | cdef class Tokens: | ||||||
|     cdef Language lang |     cdef Language lang | ||||||
|     cdef vector[Lexeme_addr]* vctr |     cdef vector[LexID]* vctr | ||||||
|     cdef size_t length |     cdef size_t length | ||||||
|      |      | ||||||
|     cpdef int append(self, Lexeme_addr token) |     cpdef int append(self, LexID token) | ||||||
|     cpdef int extend(self, Tokens other) except -1 |     cpdef int extend(self, Tokens other) except -1 | ||||||
|      |      | ||||||
|     cpdef object group_by(self, size_t attr) |     cpdef object group_by(self, size_t attr) | ||||||
|  |  | ||||||
|  | @ -9,14 +9,14 @@ from spacy.spacy cimport StringHash | ||||||
| cdef class Tokens: | cdef class Tokens: | ||||||
|     def __cinit__(self, Language lang): |     def __cinit__(self, Language lang): | ||||||
|         self.lang = lang |         self.lang = lang | ||||||
|         self.vctr = new vector[Lexeme_addr]() |         self.vctr = new vector[LexID]() | ||||||
|         self.length = 0 |         self.length = 0 | ||||||
| 
 | 
 | ||||||
|     def __dealloc__(self): |     def __dealloc__(self): | ||||||
|         del self.vctr |         del self.vctr | ||||||
| 
 | 
 | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|         cdef vector[Lexeme_addr].iterator it = self.vctr[0].begin() |         cdef vector[LexID].iterator it = self.vctr[0].begin() | ||||||
|         while it != self.vctr[0].end(): |         while it != self.vctr[0].end(): | ||||||
|             yield deref(it) |             yield deref(it) | ||||||
|             inc(it) |             inc(it) | ||||||
|  | @ -27,16 +27,16 @@ cdef class Tokens: | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|         return self.length |         return self.length | ||||||
| 
 | 
 | ||||||
|     cpdef int append(self, Lexeme_addr token): |     cpdef int append(self, LexID token): | ||||||
|         self.vctr[0].push_back(token) |         self.vctr[0].push_back(token) | ||||||
|         self.length += 1 |         self.length += 1 | ||||||
| 
 | 
 | ||||||
|     cpdef int extend(self, Tokens other) except -1: |     cpdef int extend(self, Tokens other) except -1: | ||||||
|         cdef Lexeme_addr el |         cdef LexID el | ||||||
|         for el in other: |         for el in other: | ||||||
|             self.append(el) |             self.append(el) | ||||||
| 
 | 
 | ||||||
|     cpdef object group_by(self, size_t attr): |     cpdef object group_by(self, size_t view_idx): | ||||||
|         '''Group tokens that share the property attr into Tokens instances, and |         '''Group tokens that share the property attr into Tokens instances, and | ||||||
|         return a list of them. Returns a tuple of three lists: |         return a list of them. Returns a tuple of three lists: | ||||||
|          |          | ||||||
|  | @ -63,9 +63,12 @@ cdef class Tokens: | ||||||
|         cdef list hashes = [] |         cdef list hashes = [] | ||||||
| 
 | 
 | ||||||
|         cdef StringHash key |         cdef StringHash key | ||||||
|         cdef Lexeme_addr t |         cdef LexID t | ||||||
|         for t in self.vctr[0]: |         for t in self.vctr[0]: | ||||||
|             key = self.lang.attr_of(t, attr) |             if view_idx == 0: | ||||||
|  |                 key = (<Lexeme*>t).lex | ||||||
|  |             else: | ||||||
|  |                 key = (<Lexeme*>t).string_views[view_idx - 1] | ||||||
|             if key in indices: |             if key in indices: | ||||||
|                 groups[indices[key]].append(t) |                 groups[indices[key]].append(t) | ||||||
|             else: |             else: | ||||||
|  | @ -78,7 +81,7 @@ cdef class Tokens: | ||||||
| 
 | 
 | ||||||
|     cpdef dict count_by(self, size_t attr): |     cpdef dict count_by(self, size_t attr): | ||||||
|         counts = {} |         counts = {} | ||||||
|         cdef Lexeme_addr t |         cdef LexID t | ||||||
|         cdef StringHash key |         cdef StringHash key | ||||||
|         for t in self.vctr[0]: |         for t in self.vctr[0]: | ||||||
|             #key = attr_of(t, attr) |             #key = attr_of(t, attr) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user