mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Upd from spacy
This commit is contained in:
		
							parent
							
								
									87bf205b82
								
							
						
					
					
						commit
						a895fe5ddb
					
				|  | @ -1,8 +1,17 @@ | |||
| from .lexeme import lex_of | ||||
| from .lexeme import sic_of | ||||
| 
 | ||||
| from .tokens import Tokens | ||||
| 
 | ||||
| __all__ = [lex_of, sic_of] | ||||
| # Don't know how to make the enum visible to Python :( | ||||
| 
 | ||||
| SIC = 0 | ||||
| LEX = 1 | ||||
| NORM = 2 | ||||
| SHAPE = 3 | ||||
| LAST3 = 4 | ||||
| 
 | ||||
| __all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3] | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
|  |  | |||
|  | @ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length): | |||
|     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): | ||||
|         # ...Unless we're at 0 | ||||
|         return i == 0 | ||||
|     if word[i] == "-" and i < (length - 1) and word[i+1] == '-': | ||||
|         return False | ||||
|     # Don't count commas as punct if the next char is a number | ||||
|     if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): | ||||
|         return False | ||||
|     # Don't count periods as punct if the next char is a number | ||||
|     if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): | ||||
|     # Don't count periods as punct if the next char is not whitespace | ||||
|     if word[i] == "." and i < (length - 1) and not word[i+1].isspace(): | ||||
|         return False | ||||
|     return not word[i].isalnum() | ||||
| 
 | ||||
|  |  | |||
|  | @ -16,12 +16,12 @@ from . import util | |||
| from os import path | ||||
| cimport cython | ||||
| 
 | ||||
| 
 | ||||
| def get_normalized(unicode lex, size_t length): | ||||
|     return lex.lower() | ||||
|     #if lex.isdigit(): | ||||
|     #    return '!YEAR' if length == 4 else '!DIGIT' | ||||
|     #else: | ||||
|     #    return lex.lower() | ||||
|     if lex.isalpha() and lex.islower(): | ||||
|         return lex | ||||
|     else: | ||||
|         return get_word_shape(lex, length) | ||||
| 
 | ||||
| 
 | ||||
| def get_word_shape(lex, length): | ||||
|  | @ -55,7 +55,6 @@ def set_orth_flags(lex, length): | |||
|     return 0 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef class Language: | ||||
|     def __cinit__(self, name): | ||||
|         self.name = name | ||||
|  |  | |||
|  | @ -14,5 +14,5 @@ cdef class Tokens: | |||
|     cpdef int append(self, Lexeme_addr token) | ||||
|     cpdef int extend(self, Tokens other) except -1 | ||||
|      | ||||
|     cpdef list group_by(self, StringAttr attr) | ||||
|     cpdef object group_by(self, StringAttr attr) | ||||
|     cpdef dict count_by(self, StringAttr attr) | ||||
|  |  | |||
|  | @ -37,21 +37,45 @@ cdef class Tokens: | |||
|         for el in other: | ||||
|             self.append(el) | ||||
| 
 | ||||
|     cpdef list group_by(self, StringAttr attr): | ||||
|     cpdef object group_by(self, StringAttr attr): | ||||
|         '''Group tokens that share the property attr into Tokens instances, and | ||||
| return them. Specifically, returns a tuple of three lists: | ||||
|          | ||||
|         (string names, hashes, tokens) | ||||
| 
 | ||||
|         The lists are aligned, so the ith entry in string names is the string | ||||
|         that the ith entry in hashes unhashes to, which the Tokens instance | ||||
|         is grouped by. | ||||
|          | ||||
|         You can then use count_by or group_by on the Tokens | ||||
|         for further processing. Calling group_by and then asking the length | ||||
|         of the Tokens objects is equivalent to count_by, but somewhat slower. | ||||
|         ''' | ||||
|         # Implementation here is working around some of the constraints in | ||||
|         # Cython about what type of thing can go in what type of container. | ||||
|         # Long story short, it's pretty hard to get a Python object like | ||||
|         # Tokens into a vector or array. If we really need this to run faster, | ||||
|         # we can be tricky and get the Python list access out of the loop. What | ||||
|         # we'd do is store pointers to the underlying vectors. | ||||
|         # So far, speed isn't mattering here. | ||||
|         cdef dict indices = {} | ||||
|         cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]() | ||||
|         cdef list groups = [] | ||||
|         cdef list names = [] | ||||
|         cdef list hashes = [] | ||||
| 
 | ||||
|         cdef StringHash key | ||||
|         cdef Lexeme_addr t | ||||
|         for t in self.vctr[0]: | ||||
|             key = attr_of(t, attr) | ||||
|             if key in indices: | ||||
|                 groups[indices[key]].push_back(t) | ||||
|                 groups[indices[key]].append(t) | ||||
|             else: | ||||
|                 indices[key] = groups.size() | ||||
|                 groups.push_back(vector[Lexeme_addr]()) | ||||
|                 groups.back().push_back(t) | ||||
|         return groups | ||||
|                 indices[key] = len(groups) | ||||
|                 groups.append(Tokens(self.lang)) | ||||
|                 names.append(self.lang.unhash(key)) | ||||
|                 hashes.append(key) | ||||
|                 groups[-1].append(t) | ||||
|         return names, hashes, groups | ||||
| 
 | ||||
|     cpdef dict count_by(self, StringAttr attr): | ||||
|         counts = {} | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user