diff --git a/spacy/tests/regression/test_issue834.py b/spacy/tests/regression/test_issue834.py
new file mode 100644
index 000000000..00b227f28
--- /dev/null
+++ b/spacy/tests/regression/test_issue834.py
@@ -0,0 +1,18 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+from io import StringIO
+
+# The last row's "word" is a no-break space, written as an explicit escape so
+# that editors and diff tools cannot silently normalize it to a plain space.
+word2vec_str = """, -0.046107 -0.035951 -0.560418
+de -0.648927 -0.400976 -0.527124
+. 0.113685 0.439990 -0.634510
+\u00a0 -1.499184 -0.184280 -0.598371"""
+
+
+def test_issue834(en_vocab):
+    """Vectors whose word is a whitespace character load with the right width."""
+    f = StringIO(word2vec_str)
+    vector_length = en_vocab.load_vectors(f)
+    assert vector_length == 3
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index cd2b18f81..ab023c3b4 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,22 +1,17 @@
 from __future__ import unicode_literals
 
-from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memset
 from libc.stdint cimport int32_t
-from libc.stdint cimport uint64_t
 from libc.math cimport sqrt
 
 from pathlib import Path
 import bz2
-import io
-import math
 import ujson as json
-import tempfile
+import re
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
-from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
@@ -29,7 +24,6 @@ from . import symbols
 from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
-from . import deprecated
 from . import util
 
 
@@ -477,9 +471,14 @@ cdef class Vocab:
         cdef attr_t orth
         cdef int32_t vec_len = -1
         cdef double norm = 0.0
+
+        # re.UNICODE is required on Python 2, where a bare \s only matches
+        # ASCII whitespace and would miss e.g. a no-break space (U+00A0).
+        whitespace_pattern = re.compile(r'\s', re.UNICODE)
+
         for line_num, line in enumerate(file_):
             pieces = line.split()
-            word_str = " " if line.startswith(" ") else pieces.pop(0)
+            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
             if vec_len == -1:
                 vec_len = len(pieces)
             elif vec_len != len(pieces):