mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	* Restore happax. commit uncommitted work
This commit is contained in:
		
							parent
							
								
									6319ff0f22
								
							
						
					
					
						commit
						365a2af756
					
				
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -48,6 +48,7 @@ exts = [ | |||
|     Extension("spacy.en_ptb", ["spacy/en_ptb.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.lexeme", ["spacy/lexeme.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.spacy", ["spacy/spacy.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy._hashing", ["spacy/_hashing.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.tokens", ["spacy/tokens.pyx"], language="c++", include_dirs=includes), | ||||
|     Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", | ||||
|               include_dirs=includes), | ||||
|  |  | |||
|  | @ -1,5 +1,6 @@ | |||
| from .lexeme import lex_of | ||||
| from .lexeme import sic_of | ||||
| from .lexeme import length_of | ||||
| 
 | ||||
| from .tokens import Tokens | ||||
| 
 | ||||
|  | @ -10,28 +11,6 @@ LEX = 1 | |||
| NORM = 2 | ||||
| SHAPE = 3 | ||||
| LAST3 = 4 | ||||
| LENGTH = 5 | ||||
| 
 | ||||
| __all__ = [Tokens, lex_of, sic_of, SIC, LEX, NORM, SHAPE, LAST3] | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| from .tokens import ids_from_string | ||||
| from .tokens import group_by | ||||
| 
 | ||||
| from .lex import sic_of | ||||
| from .lex import lex_of | ||||
| from .lex import normed_of | ||||
| from .lex import first_of | ||||
| from .lex import last_three_of | ||||
| 
 | ||||
| from .lex import cluster_of | ||||
| from .lex import prob_of | ||||
| 
 | ||||
| from .lex import is_oft_upper | ||||
| from .lex import is_oft_title | ||||
| 
 | ||||
| from .lex import can_noun | ||||
| from .lex import can_verb | ||||
| from .lex import can_adj | ||||
| from .lex import can_adv | ||||
| """ | ||||
| __all__ = [Tokens, lex_of, sic_of, length_of, SIC, LEX, NORM, SHAPE, LAST3, LENGTH] | ||||
|  |  | |||
|  | @ -51,5 +51,3 @@ cdef class FixedTable: | |||
| @cython.cdivision | ||||
| cdef inline size_t _find(uint64_t key, size_t size) nogil: | ||||
|     return key % size | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ from libcpp.vector cimport vector | |||
| from libc.stdint cimport uint64_t | ||||
| 
 | ||||
| from sparsehash.dense_hash_map cimport dense_hash_map | ||||
| from _hashing cimport FixedTable | ||||
| 
 | ||||
| # Circular import problems here | ||||
| ctypedef size_t Lexeme_addr | ||||
|  | @ -24,6 +25,7 @@ from spacy.lexeme cimport Orthography | |||
| 
 | ||||
| cdef class Language: | ||||
|     cdef object name | ||||
|     cdef FixedTable happax | ||||
|     cdef Vocab* vocab | ||||
|     cdef Vocab* distri | ||||
|     cdef Vocab* ortho | ||||
|  | @ -39,3 +41,5 @@ cdef class Language: | |||
|     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed, | ||||
|                              int split, size_t length) | ||||
|     cdef Orthography* init_orth(self, StringHash hashed, unicode lex) | ||||
| 
 | ||||
|     cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr addr) | ||||
|  |  | |||
|  | @ -50,15 +50,18 @@ def get_word_shape(lex, length): | |||
|     return shape | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def set_orth_flags(lex, length): | ||||
|     return 0 | ||||
| 
 | ||||
| 
 | ||||
| DEF MAX_HAPPAX = 1048576 | ||||
| 
 | ||||
| 
 | ||||
| cdef class Language: | ||||
|     def __cinit__(self, name): | ||||
|         self.name = name | ||||
|         self.bacov = {} | ||||
|         self.happax = FixedTable(MAX_HAPPAX) | ||||
|         self.vocab = new Vocab() | ||||
|         self.ortho = new Vocab() | ||||
|         self.distri = new Vocab() | ||||
|  | @ -81,6 +84,7 @@ cdef class Language: | |||
|                 length = len(token_string) | ||||
|                 hashed = self.hash_string(token_string, length) | ||||
|                 word.tail = self._add(hashed, lex, 0, len(lex)) | ||||
|                 self._happax_to_vocab(hashed, <Lexeme_addr>word.tail) | ||||
|                 word = word.tail | ||||
| 
 | ||||
|     def load_clusters(self): | ||||
|  | @ -122,14 +126,27 @@ cdef class Language: | |||
|         # First, check words seen 2+ times | ||||
|         cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed] | ||||
|         if word_ptr == NULL: | ||||
|             start = self.find_split(string, length) if start == -1 else start | ||||
|             word_ptr = self._add(hashed, string, start, length) | ||||
|             # Now check words seen exactly once | ||||
|             word_ptr = <Lexeme*>self.happax.get(hashed) | ||||
|             if word_ptr == NULL: | ||||
|                 start = self.find_split(string, length) if start == -1 else start | ||||
|                 word_ptr = self._add(hashed, string, start, length) | ||||
|             else: | ||||
|                 # Second time word seen, move to vocab | ||||
|                 self._happax_to_vocab(hashed, <Lexeme_addr>word_ptr) | ||||
|         return <Lexeme_addr>word_ptr | ||||
| 
 | ||||
|     cdef int _happax_to_vocab(self, StringHash hashed, Lexeme_addr word_ptr): | ||||
|         self.vocab[0][hashed] = word_ptr | ||||
|         self.happax.erase(hashed) | ||||
| 
 | ||||
|     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length): | ||||
|         cdef size_t i | ||||
|         word = self.init_lexeme(string, hashed, split, length) | ||||
|         self.vocab[0][hashed] = <Lexeme_addr>word | ||||
|         if self.happax.keys[hashed % self.happax.size] != 0: | ||||
|             self._happax_to_vocab(self.happax.keys[hashed % self.happax.size], | ||||
|                                   self.happax.values[hashed % self.happax.size]) | ||||
|         self.happax.insert(hashed, <size_t>word) | ||||
|         self.bacov[hashed] = string | ||||
|         return word    | ||||
| 
 | ||||
|  | @ -194,6 +211,7 @@ cdef class Language: | |||
|         # Now recurse, and deal with the tail | ||||
|         if tail_string: | ||||
|             word.tail = <Lexeme*>self.lookup(-1, tail_string, len(tail_string)) | ||||
|             self._happax_to_vocab(word.tail.sic, <Lexeme_addr>word.tail) | ||||
|         return word | ||||
| 
 | ||||
|     cdef Orthography* init_orth(self, StringHash hashed, unicode lex): | ||||
|  |  | |||
|  | @ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc | |||
| 
 | ||||
| 
 | ||||
| from spacy.lexeme cimport Lexeme | ||||
| from spacy.lexeme cimport attr_of, norm_of, shape_of | ||||
| from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of | ||||
| from spacy.spacy cimport StringHash | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user