mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	load_vectors should accept arbitrary space characters as word tokens
Fix bug #834
This commit is contained in:
		
							parent
							
								
									813989940e
								
							
						
					
					
						commit
						3fd2742649
					
				|  | @ -12,6 +12,7 @@ import io | |||
| import math | ||||
| import ujson as json | ||||
| import tempfile | ||||
|+import re | ||||
| 
 | ||||
| from .lexeme cimport EMPTY_LEXEME | ||||
| from .lexeme cimport Lexeme | ||||
|  | @ -477,9 +478,12 @@ cdef class Vocab: | |||
|         cdef attr_t orth | ||||
|         cdef int32_t vec_len = -1 | ||||
|         cdef double norm = 0.0 | ||||
| 
 | ||||
|+        whitespace_pattern = re.compile(r'\s') | ||||
| 
 | ||||
|         for line_num, line in enumerate(file_): | ||||
|             pieces = line.split() | ||||
|-            word_str = " " if line.startswith(" ") else pieces.pop(0) | ||||
|+            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0) | ||||
|             if vec_len == -1: | ||||
|                 vec_len = len(pieces) | ||||
|             elif vec_len != len(pieces): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user