mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Preparations in place to handle hyphenation etc
This commit is contained in:
		
							parent
							
								
									ff79dbac2e
								
							
						
					
					
						commit
						868e558037
					
				| 
						 | 
				
			
			@ -52,6 +52,8 @@ cdef class Language:
 | 
			
		|||
    cpdef Lexeme lookup(self, unicode text)
 | 
			
		||||
 | 
			
		||||
    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1
 | 
			
		||||
 | 
			
		||||
    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1
 | 
			
		||||
    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length)
 | 
			
		||||
    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length)
 | 
			
		||||
    
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -155,16 +155,14 @@ cdef class Language:
 | 
			
		|||
        cdef LexemeC* lexeme
 | 
			
		||||
        for lexeme in deref(prefixes):
 | 
			
		||||
            tokens.push_back(lexeme)
 | 
			
		||||
        if string.n != 0:
 | 
			
		||||
        if not _extend_from_map(tokens, string, self.specials):
 | 
			
		||||
                tokens.push_back(self.lexicon.get(string))
 | 
			
		||||
            self._split_body_token(tokens, string)
 | 
			
		||||
        cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
 | 
			
		||||
        while it != suffixes.rend():
 | 
			
		||||
            tokens.push_back(deref(it))
 | 
			
		||||
            preinc(it)
 | 
			
		||||
 | 
			
		||||
    cdef int _save_cached(self, vector[LexemeC*] *tokens,
 | 
			
		||||
                          uint64_t key, size_t n) except -1:
 | 
			
		||||
    cdef int _save_cached(self, vector[LexemeC*] *tokens, uint64_t key, size_t n) except -1:
 | 
			
		||||
        assert tokens.size() > n
 | 
			
		||||
        lexemes = <LexemeC**>self._mem.alloc((tokens.size() - n) + 1, sizeof(LexemeC**))
 | 
			
		||||
        cdef size_t i, j
 | 
			
		||||
| 
						 | 
				
			
			@ -173,6 +171,9 @@ cdef class Language:
 | 
			
		|||
        lexemes[i + 1] = NULL
 | 
			
		||||
        self.cache.set(key, lexemes)
 | 
			
		||||
 | 
			
		||||
    cdef int _split_body_token(self, vector[LexemeC*] *tokens, String* string) except -1:
 | 
			
		||||
        tokens.push_back(self.lexicon.get(string))
 | 
			
		||||
    
 | 
			
		||||
    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
 | 
			
		||||
        cdef unicode string = chars[:length]
 | 
			
		||||
        match = self.prefix_re.search(string)
 | 
			
		||||
| 
						 | 
				
			
			@ -255,6 +256,8 @@ cdef class Lexicon:
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
cdef int _extend_from_map(vector[LexemeC*] *tokens, String* string, PreshMap map_) except -1:
 | 
			
		||||
    if string.n == 0:
 | 
			
		||||
        return 1
 | 
			
		||||
    lexemes = <LexemeC**>map_.get(string.key)
 | 
			
		||||
    if lexemes == NULL:
 | 
			
		||||
        return 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user