mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
load_vectors should accept arbitrary whitespace characters as word tokens
Fixes bug #834
This commit is contained in:
parent
813989940e
commit
3fd2742649
|
@ -12,6 +12,7 @@ import io
|
|||
import math
|
||||
import ujson as json
|
||||
import tempfile
|
||||
import re
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport Lexeme
|
||||
|
@ -477,9 +478,12 @@ cdef class Vocab:
|
|||
cdef attr_t orth
|
||||
cdef int32_t vec_len = -1
|
||||
cdef double norm = 0.0
|
||||
|
||||
whitespace_pattern = re.compile(r'\s')
|
||||
|
||||
for line_num, line in enumerate(file_):
|
||||
pieces = line.split()
|
||||
word_str = " " if line.startswith(" ") else pieces.pop(0)
|
||||
word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
||||
if vec_len == -1:
|
||||
vec_len = len(pieces)
|
||||
elif vec_len != len(pieces):
|
||||
|
|
Loading…
Reference in New Issue
Block a user