Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-31 07:57:35 +03:00.
			
		
		
		
	* Update from spaCy
This commit is contained in:
		
							parent
							
								
									87bf205b82
								
							
						
					
					
						commit
						a895fe5ddb
					
				|  | @ -1,8 +1,17 @@ | ||||||
| from .lexeme import lex_of | from .lexeme import lex_of | ||||||
| from .lexeme import sic_of | from .lexeme import sic_of | ||||||
| 
 | 
 | ||||||
|  | from .tokens import Tokens | ||||||
| 
 | 
 | ||||||
| __all__ = [lex_of, sic_of] | # Don't know how to get the enum Python visible :( | ||||||
|  | 
 | ||||||
|  | SIC = 0 | ||||||
|  | LEX = 1 | ||||||
|  | NORM = 2 | ||||||
|  | SHAPE = 3 | ||||||
|  | LAST3 = 4 | ||||||
|  | 
 | ||||||
|  | __all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | @ -38,11 +38,13 @@ cdef bint is_punct(unicode word, size_t i, size_t length): | ||||||
|     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): |     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha(): | ||||||
|         # ...Unless we're at 0 |         # ...Unless we're at 0 | ||||||
|         return i == 0 |         return i == 0 | ||||||
|  |     if word[i] == "-" and i < (length - 1) and word[i+1] == '-': | ||||||
|  |         return False | ||||||
|     # Don't count commas as punct if the next char is a number |     # Don't count commas as punct if the next char is a number | ||||||
|     if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): |     if word[i] == "," and i < (length - 1) and word[i+1].isdigit(): | ||||||
|         return False |         return False | ||||||
|     # Don't count periods as punct if the next char is a number |     # Don't count periods as punct if the next char is not whitespace | ||||||
|     if word[i] == "." and i < (length - 1) and word[i+1].isdigit(): |     if word[i] == "." and i < (length - 1) and not word[i+1].isspace(): | ||||||
|         return False |         return False | ||||||
|     return not word[i].isalnum() |     return not word[i].isalnum() | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -16,12 +16,12 @@ from . import util | ||||||
| from os import path | from os import path | ||||||
| cimport cython | cimport cython | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def get_normalized(unicode lex, size_t length): | def get_normalized(unicode lex, size_t length): | ||||||
|     return lex.lower() |     if lex.isalpha() and lex.islower(): | ||||||
|     #if lex.isdigit(): |         return lex | ||||||
|     #    return '!YEAR' if length == 4 else '!DIGIT' |     else: | ||||||
|     #else: |         return get_word_shape(lex, length) | ||||||
|     #    return lex.lower() |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_word_shape(lex, length): | def get_word_shape(lex, length): | ||||||
|  | @ -55,7 +55,6 @@ def set_orth_flags(lex, length): | ||||||
|     return 0 |     return 0 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| cdef class Language: | cdef class Language: | ||||||
|     def __cinit__(self, name): |     def __cinit__(self, name): | ||||||
|         self.name = name |         self.name = name | ||||||
|  |  | ||||||
|  | @ -14,5 +14,5 @@ cdef class Tokens: | ||||||
|     cpdef int append(self, Lexeme_addr token) |     cpdef int append(self, Lexeme_addr token) | ||||||
|     cpdef int extend(self, Tokens other) except -1 |     cpdef int extend(self, Tokens other) except -1 | ||||||
|      |      | ||||||
|     cpdef list group_by(self, StringAttr attr) |     cpdef object group_by(self, StringAttr attr) | ||||||
|     cpdef dict count_by(self, StringAttr attr) |     cpdef dict count_by(self, StringAttr attr) | ||||||
|  |  | ||||||
|  | @ -37,21 +37,45 @@ cdef class Tokens: | ||||||
|         for el in other: |         for el in other: | ||||||
|             self.append(el) |             self.append(el) | ||||||
| 
 | 
 | ||||||
|     cpdef list group_by(self, StringAttr attr): |     cpdef object group_by(self, StringAttr attr): | ||||||
|  |         '''Group tokens that share the property attr into Tokens instances, and | ||||||
|  |         return a list of them. Returns a tuple of three lists: | ||||||
|  |          | ||||||
|  |         (string names, hashes, tokens) | ||||||
|  | 
 | ||||||
|  |         The lists are aligned, so the ith entry in string names is the string | ||||||
|  |         that the ith entry in hashes unhashes to, which the Tokens instance | ||||||
|  |         is grouped by. | ||||||
|  |          | ||||||
|  |         You can then use count_by or group_by on the Tokens | ||||||
|  |         for further processing. Calling group_by and then asking the length | ||||||
|  |         of the Tokens objects is equivalent to count_by, but somewhat slower. | ||||||
|  |         ''' | ||||||
|  |         # Implementation here is working around some of the constraints in | ||||||
|  |         # Cython about what type of thing can go in what type of container. | ||||||
|  |         # Long story short, it's pretty hard to get a Python object like | ||||||
|  |         # Tokens into a vector or array. If we really need this to run faster, | ||||||
|  |         # we can be tricky and get the Python list access out of the loop. What | ||||||
|  |         # we'd do is store pointers to the underlying vectors. | ||||||
|  |         # So far, speed isn't mattering here. | ||||||
|         cdef dict indices = {} |         cdef dict indices = {} | ||||||
|         cdef vector[vector[Lexeme_addr]] groups = vector[vector[Lexeme_addr]]() |         cdef list groups = [] | ||||||
|  |         cdef list names = [] | ||||||
|  |         cdef list hashes = [] | ||||||
| 
 | 
 | ||||||
|         cdef StringHash key |         cdef StringHash key | ||||||
|         cdef Lexeme_addr t |         cdef Lexeme_addr t | ||||||
|         for t in self.vctr[0]: |         for t in self.vctr[0]: | ||||||
|             key = attr_of(t, attr) |             key = attr_of(t, attr) | ||||||
|             if key in indices: |             if key in indices: | ||||||
|                 groups[indices[key]].push_back(t) |                 groups[indices[key]].append(t) | ||||||
|             else: |             else: | ||||||
|                 indices[key] = groups.size() |                 indices[key] = len(groups) | ||||||
|                 groups.push_back(vector[Lexeme_addr]()) |                 groups.append(Tokens(self.lang)) | ||||||
|                 groups.back().push_back(t) |                 names.append(self.lang.unhash(key)) | ||||||
|         return groups |                 hashes.append(key) | ||||||
|  |                 groups[-1].append(t) | ||||||
|  |         return names, hashes, groups | ||||||
| 
 | 
 | ||||||
|     cpdef dict count_by(self, StringAttr attr): |     cpdef dict count_by(self, StringAttr attr): | ||||||
|         counts = {} |         counts = {} | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user