mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Revising data model of lexeme. Compiles.
This commit is contained in:
		
							parent
							
								
									e40caae51f
								
							
						
					
					
						commit
						1b0e01d3d8
					
				|  | @ -2,24 +2,86 @@ from .typedefs cimport hash_t, utf8_t, flag_t, id_t | |||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| 
 | ||||
| cpdef flag_t OOV_DIST_FLAGS | ||||
| 
 | ||||
| 
 | ||||
# Slot indices for the fixed-size ``ints`` array of LexemeC: each member
# names one position in that array.
cpdef enum LexInts:
    LexInt_i
    LexInt_length
    LexInt_cluster
    LexInt_pos
    LexInt_supersense
    # Sentinel, not a real slot: because members are numbered in declaration
    # order, the last member equals the number of slots. It is used as the
    # declared length of LexemeC.ints, so it must stay last.
    LexInt_N
| 
 | ||||
| 
 | ||||
# Slot indices for the fixed-size ``floats`` array of LexemeC.
cpdef enum LexFloats:
    LexFloat_prob
    LexFloat_sentiment
    # Sentinel: number of float slots; used as the declared length of
    # LexemeC.floats, so it must stay last.
    LexFloat_N
| 
 | ||||
| 
 | ||||
# Slot indices for the fixed-size ``strings`` array of LexemeC.
cpdef enum LexStrs:
    LexStr_key
    LexStr_casefix
    LexStr_shape
    LexStr_unsparse
    LexStr_asciied
    # Sentinel: number of string slots; used as the declared length of
    # LexemeC.strings, so it must stay last.
    LexStr_N
| 
 | ||||
| 
 | ||||
# Flag ids associated with the ``orth_flags`` field of LexemeC.
# NOTE(review): presumably these are the flag_id values handed to
# lexeme_check_orth_flag, i.e. bit positions in orth_flags -- confirm
# against the callers.
cpdef enum LexOrthFlags:
    LexOrth_alpha
    LexOrth_ascii
    LexOrth_digit
    LexOrth_lower
    LexOrth_punct
    LexOrth_space
    LexOrth_title
    LexOrth_upper
    # Sentinel: number of flags declared above; keep last.
    LexOrth_N
| 
 | ||||
| 
 | ||||
# Flag ids associated with the ``dist_flags`` field of LexemeC.
# NOTE(review): presumably these are the flag_id values handed to
# lexeme_check_dist_flag, i.e. bit positions in dist_flags -- confirm
# against the callers.
cpdef enum LexDistFlags:
    LexDist_adj
    LexDist_adp
    LexDist_adv
    LexDist_conj
    LexDist_det
    LexDist_noun
    LexDist_num
    LexDist_pdt
    LexDist_pos
    LexDist_pron
    LexDist_prt
    LexDist_punct
    LexDist_verb

    LexDist_lower
    LexDist_title
    LexDist_upper

    # Sentinel: number of flags declared above; keep last.
    LexDist_N
| 
 | ||||
| 
 | ||||
| cdef struct LexemeC: | ||||
|     size_t i | ||||
|     size_t length | ||||
|     double prob | ||||
|     size_t cluster | ||||
| 
 | ||||
|     char* string | ||||
|      | ||||
|     char** views | ||||
|     flag_t flags | ||||
|     int[<int>LexInt_N] ints | ||||
|     float[<int>LexFloat_N] floats | ||||
|     utf8_t[<int>LexStr_N] strings | ||||
|     flag_t orth_flags | ||||
|     flag_t dist_flags | ||||
| 
 | ||||
| 
 | ||||
| cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, size_t cluster, | ||||
|                      list views, set flags) | ||||
| cdef char* intern_and_encode(unicode string, size_t* length) except NULL | ||||
| 
 | ||||
| cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id) | ||||
| cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id) | ||||
| cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except * | ||||
| 
 | ||||
| cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except * | ||||
| 
 | ||||
| cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i) | ||||
| 
 | ||||
| cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except * | ||||
| 
 | ||||
| cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except * | ||||
| 
 | ||||
| cdef dict lexeme_pack(LexemeC* lexeme) | ||||
| cdef int lexeme_unpack(LexemeC* lexeme, dict p) except -1 | ||||
|  |  | |||
							
								
								
									
										113
									
								
								spacy/lexeme.pyx
									
									
									
									
									
								
							
							
						
						
									
										113
									
								
								spacy/lexeme.pyx
									
									
									
									
									
								
							|  | @ -1,25 +1,46 @@ | |||
| from cpython.ref cimport Py_INCREF | ||||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| import orth | ||||
| 
 | ||||
| cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob, | ||||
|                           size_t cluster, list views, set flags): | ||||
|     cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC)) | ||||
|     lexeme.i = i | ||||
|     lexeme.cluster = cluster | ||||
|     lexeme.prob = prob | ||||
|     lexeme.string = intern_and_encode(string, &lexeme.length) | ||||
|     lexeme.views = <char**>mem.alloc(len(views), sizeof(char*)) | ||||
|     cdef size_t length = 0 | ||||
|     for i, string in enumerate(views): | ||||
|         lexeme.views[i] = intern_and_encode(string, &length) | ||||
| 
 | ||||
|     for active_flag in flags: | ||||
|         lexeme.flags |= (1 << active_flag) | ||||
|     return lexeme | ||||
| OOV_DIST_FLAGS = 0 | ||||
| 
 | ||||
| 
 | ||||
| cdef char* intern_and_encode(unicode string, size_t* length): | ||||
def get_lexeme_dict(size_t i, unicode string):
    """Build the default attribute dict for the lexeme ``string`` with id ``i``.

    The result has the keys 'ints', 'floats', 'strings', 'orth_flags' and
    'dist_flags'. The three lists are indexed by the members of the LexInts,
    LexFloats and LexStrs enums respectively.

    Only the id, the length and the string slots are derived from the
    arguments. Cluster, pos, supersense, prob and sentiment all start at 0,
    and dist_flags starts at OOV_DIST_FLAGS.
    """
    # Out-parameter required by intern_and_encode; its value is not used here.
    cdef size_t encoded_len
    # Declared as bytes so each char* result is turned into a Python object
    # once and the same object can be stored in two slots.
    cdef bytes key_bytes
    cdef bytes shape_bytes
    cdef bytes asciied_bytes

    int_slots = [None] * <int>LexInt_N
    int_slots[<int>LexInt_i] = i
    int_slots[<int>LexInt_length] = len(string)
    int_slots[<int>LexInt_cluster] = 0
    int_slots[<int>LexInt_pos] = 0
    int_slots[<int>LexInt_supersense] = 0

    float_slots = [None] * <int>LexFloat_N
    float_slots[<int>LexFloat_prob] = 0
    float_slots[<int>LexFloat_sentiment] = 0

    # NOTE(review): these slots hold the *encoded* results of
    # intern_and_encode, whereas lexeme_pack emits decoded text under
    # 'strings' and lexeme_unpack passes every entry to
    # intern_and_encode(unicode ...). Confirm which element type is intended.
    key_bytes = intern_and_encode(string, &encoded_len)
    shape_bytes = intern_and_encode(orth.word_shape(string), &encoded_len)
    asciied_bytes = intern_and_encode(orth.asciied(string), &encoded_len)

    string_slots = [None] * <int>LexStr_N
    string_slots[<int>LexStr_key] = key_bytes
    # casefix and unsparse currently reuse the key and the shape unchanged.
    string_slots[<int>LexStr_casefix] = key_bytes
    string_slots[<int>LexStr_shape] = shape_bytes
    string_slots[<int>LexStr_unsparse] = shape_bytes
    string_slots[<int>LexStr_asciied] = asciied_bytes

    lexeme_dict = {}
    lexeme_dict['ints'] = int_slots
    lexeme_dict['floats'] = float_slots
    lexeme_dict['strings'] = string_slots
    lexeme_dict['orth_flags'] = get_orth_flags(string)
    lexeme_dict['dist_flags'] = OOV_DIST_FLAGS
    return lexeme_dict
| 
 | ||||
def get_orth_flags(unicode string):
    """Return the orth-flag bit field for ``string``.

    Placeholder implementation: the argument is ignored and 0 is always
    returned. get_lexeme_dict stores the result under 'orth_flags'.
    """
    return 0
| 
 | ||||
| 
 | ||||
def get_dist_flags(unicode string):
    """Return the dist-flag bit field for ``string``.

    Placeholder implementation: the argument is ignored and 0 is always
    returned. get_lexeme_dict does not call this function; it uses the
    module-level OOV_DIST_FLAGS instead.
    """
    return 0
| 
 | ||||
| 
 | ||||
| cdef char* intern_and_encode(unicode string, size_t* length) except NULL: | ||||
|     cdef bytes byte_string = string.encode('utf8') | ||||
|     cdef bytes utf8_string = intern(byte_string) | ||||
|     Py_INCREF(utf8_string) | ||||
|  | @ -27,38 +48,48 @@ cdef char* intern_and_encode(unicode string, size_t* length): | |||
|     return <char*>utf8_string | ||||
| 
 | ||||
| 
 | ||||
| cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id): | ||||
|     return lexeme.flags & (1 << flag_id) | ||||
cdef int lexeme_get_int(LexemeC* lexeme, size_t i) except *:
    # Return slot ``i`` of the lexeme's ``ints`` array.
    # ``except *`` declares no reserved error value, so Cython checks for a
    # pending exception after every call.
    # The index goes straight into a C array and is not range-checked here.
    # NOTE(review): callers presumably pass a LexInts member below LexInt_N
    # -- confirm.
    return lexeme.ints[i]
| 
 | ||||
| 
 | ||||
| cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id): | ||||
|     cdef bytes byte_string = lexeme.views[view_id] | ||||
cdef float lexeme_get_float(LexemeC* lexeme, size_t i) except *:
    # Return slot ``i`` of the lexeme's ``floats`` array.
    # ``except *`` declares no reserved error value, so Cython checks for a
    # pending exception after every call.
    # The index goes straight into a C array and is not range-checked here.
    # NOTE(review): callers presumably pass a LexFloats member below
    # LexFloat_N -- confirm.
    return lexeme.floats[i]
| 
 | ||||
| 
 | ||||
cdef unicode lexeme_get_string(LexemeC* lexeme, size_t i):
    # Return slot ``i`` of the lexeme's ``strings`` array as text.
    # The slot is first converted to a Python bytes object and then decoded
    # as UTF-8. The index is not range-checked here.
    # NOTE(review): utf8_t is declared in typedefs (not visible here);
    # presumably a C string type -- confirm.
    cdef bytes byte_string = lexeme.strings[i]
    return byte_string.decode('utf8')
| 
 | ||||
| 
 | ||||
| cdef dict lexeme_pack(LexemeC* lexeme): | ||||
cdef bint lexeme_check_orth_flag(LexemeC* lexeme, size_t flag_id) except *:
    # Report whether bit ``flag_id`` is set in the lexeme's ``orth_flags``.
    # NOTE(review): the mask starts from the plain literal ``1``; if flag_t
    # (declared in typedefs, not visible here) is wider than a C int, the
    # shift may need ``<flag_t>1`` to reach the upper bits -- confirm.
    return lexeme.orth_flags & (1 << flag_id)
| 
 | ||||
| 
 | ||||
cdef bint lexeme_check_dist_flag(LexemeC* lexeme, size_t flag_id) except *:
    # Report whether bit ``flag_id`` is set in the lexeme's ``dist_flags``.
    # NOTE(review): the mask starts from the plain literal ``1``; if flag_t
    # (declared in typedefs, not visible here) is wider than a C int, the
    # shift may need ``<flag_t>1`` to reach the upper bits -- confirm.
    return lexeme.dist_flags & (1 << flag_id)
| 
 | ||||
| 
 | ||||
cdef dict lexeme_pack(LexemeC* lex):
    """Serialise a LexemeC struct into a plain Python dict.

    The result has the keys 'ints', 'floats', 'strings', 'orth_flags' and
    'dist_flags', which is the layout lexeme_unpack reads back. The three
    lists hold one entry per slot, in enum order; the strings are decoded
    from UTF-8.
    """
    cdef dict packed = {}
    packed['ints'] = [lex.ints[i] for i in range(LexInt_N)]
    packed['floats'] = [lex.floats[i] for i in range(LexFloat_N)]
    packed['strings'] = [lex.strings[i].decode('utf8') for i in range(LexStr_N)]
    packed['orth_flags'] = lex.orth_flags
    # Fixed: this slot was previously filled from lex.orth_flags, so the
    # dist flags were lost on every pack.
    packed['dist_flags'] = lex.dist_flags
    return packed
| 
 | ||||
| 
 | ||||
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
    """Fill ``lex`` from a dict in the layout produced by lexeme_pack.

    ``p`` must provide the keys 'ints', 'floats', 'strings', 'orth_flags'
    and 'dist_flags'. Each 'strings' entry is passed through
    intern_and_encode before being stored.

    Raises ValueError if one of the three lists has more entries than the
    corresponding array in LexemeC has slots, and KeyError if a key is
    missing.
    """
    cdef size_t i
    cdef int lex_int
    cdef float lex_float
    # Out-parameter required by intern_and_encode; its value is not used here.
    cdef size_t encoded_len

    # The destination arrays have a fixed length, so an over-long list would
    # write past the end of the struct. Reject it before touching ``lex``.
    if len(p['ints']) > LexInt_N:
        raise ValueError("too many 'ints' entries for LexemeC")
    if len(p['floats']) > LexFloat_N:
        raise ValueError("too many 'floats' entries for LexemeC")
    if len(p['strings']) > LexStr_N:
        raise ValueError("too many 'strings' entries for LexemeC")

    for i, lex_int in enumerate(p['ints']):
        lex.ints[i] = lex_int
    # Fixed: this loop previously wrote ``lex.ints[i] = lex_int``, so the
    # floats were never restored and the ints were overwritten.
    for i, lex_float in enumerate(p['floats']):
        lex.floats[i] = lex_float
    for i, lex_string in enumerate(p['strings']):
        lex.strings[i] = intern_and_encode(lex_string, &encoded_len)
    lex.orth_flags = p['orth_flags']
    # Fixed: orth_flags was previously assigned twice and dist_flags never.
    lex.dist_flags = p['dist_flags']
    return 0
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user