load_vectors should accept arbitrary space characters as word tokens

Fix bug #834
2025-08-01 19:00:20 +03:00 · 2017-02-16 12:08:07 +01:00 · 2017-02-16 12:08:07 +01:00 · 3fd2742649
commit 3fd2742649
parent 813989940e
1 changed file with 5 additions and 1 deletion
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -12,6 +12,7 @@ import io
 import math
 import ujson as json
 import tempfile
+import re

 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@@ -477,9 +478,12 @@ cdef class Vocab:
        cdef attr_t orth
        cdef int32_t vec_len = -1
        cdef double norm = 0.0
+
+        whitespace_pattern = re.compile(r'\s')
+
        for line_num, line in enumerate(file_):
            pieces = line.split()
-            word_str = " " if line.startswith(" ") else pieces.pop(0)
+            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
            if vec_len == -1:
                vec_len = len(pieces)
            elif vec_len != len(pieces):