mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	* Fiddle with data types on Lexeme, to compress them to a much smaller size.
This commit is contained in:
		
							parent
							
								
									ac88893232
								
							
						
					
					
						commit
						87c2418a89
					
				|  | @ -32,9 +32,6 @@ cdef class Lexicon: | |||
|      | ||||
|     cdef PreshMap _dict | ||||
|      | ||||
|     cdef list _string_features | ||||
|     cdef list _flag_features | ||||
| 
 | ||||
| 
 | ||||
| cdef class Language: | ||||
|     cdef Pool _mem | ||||
|  |  | |||
|  | @ -266,10 +266,10 @@ cdef class Lexicon: | |||
|     cpdef Lexeme lookup(self, unicode uni_string): | ||||
|         """Retrieve (or create, if not found) a Lexeme for a string, and return it. | ||||
|      | ||||
|         Args | ||||
|        Args | ||||
|             string (unicode):  The string to be looked up. Must be unicode, not bytes. | ||||
| 
 | ||||
|         Returns: | ||||
|        Returns: | ||||
|             lexeme (Lexeme): A reference to a lexical type. | ||||
|         """ | ||||
|         cdef String string | ||||
|  |  | |||
|  | @ -1,8 +1,7 @@ | |||
| from .typedefs cimport hash_t, utf8_t, flag_t, id_t | ||||
| 
 | ||||
| from thinc.typedefs cimport atom_t | ||||
| from .typedefs cimport hash_t, utf8_t, flag_t, id_t, len_t, tag_t | ||||
| 
 | ||||
| from .utf8string cimport StringStore | ||||
| from libc.stdint cimport uint16_t | ||||
| 
 | ||||
| cpdef flag_t OOV_DIST_FLAGS | ||||
| 
 | ||||
|  | @ -23,23 +22,24 @@ cpdef enum: | |||
| 
 | ||||
| 
 | ||||
| cdef struct Lexeme: | ||||
|     atom_t length | ||||
|     flag_t flags | ||||
|     | ||||
|     atom_t sic | ||||
|     atom_t norm | ||||
|     atom_t shape | ||||
|     atom_t vocab10k | ||||
|     atom_t asciied | ||||
|     atom_t prefix | ||||
|     atom_t suffix | ||||
| 
 | ||||
|     atom_t cluster | ||||
|     atom_t pos | ||||
|     atom_t supersense | ||||
|     id_t sic | ||||
|     id_t norm | ||||
|     id_t shape | ||||
|     id_t vocab10k | ||||
|     id_t asciied | ||||
|     id_t prefix | ||||
|     id_t suffix | ||||
| 
 | ||||
|     float prob | ||||
|      | ||||
|     len_t length | ||||
|     tag_t cluster | ||||
|     tag_t pos | ||||
|     tag_t supersense | ||||
| 
 | ||||
| 
 | ||||
|     flag_t flags | ||||
| 
 | ||||
| 
 | ||||
| cdef Lexeme EMPTY_LEXEME | ||||
|  |  | |||
|  | @ -52,7 +52,7 @@ cpdef Lexeme init(unicode string, hash_t hashed, | |||
|     lex.flags = get_flags(string, upper_pc, title_pc, lower_pc) | ||||
|     return lex | ||||
| 
 | ||||
| cdef atom_t get_string_id(unicode string, StringStore store) except 0: | ||||
| cdef id_t get_string_id(unicode string, StringStore store) except 0: | ||||
|     cdef bytes byte_string = string.encode('utf8') | ||||
|     cdef Utf8Str* orig_str = store.intern(<char*>byte_string, len(byte_string)) | ||||
|     return orig_str.i | ||||
|  |  | |||
|  | @ -1,8 +1,10 @@ | |||
| from libc.stdint cimport uint64_t, uintptr_t | ||||
| from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t | ||||
| 
 | ||||
| ctypedef uint64_t hash_t | ||||
| ctypedef char* utf8_t | ||||
| ctypedef uint64_t flag_t | ||||
| ctypedef uintptr_t id_t | ||||
| ctypedef uint32_t id_t | ||||
| ctypedef uint16_t len_t | ||||
| ctypedef uint16_t tag_t | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user