Revert "Merge pull request #836 from raphael0202/load_vectors (closes #834)"

This reverts commit 7d8c9eee7f, reversing
changes made to f6b69babcc.
This commit is contained in:
ines 2017-02-16 15:27:12 +01:00
parent 0836cbe064
commit ea05f78660
2 changed files with 8 additions and 20 deletions

View File

@ -1,15 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from io import StringIO
word2vec_str = """, -0.046107 -0.035951 -0.560418
de -0.648927 -0.400976 -0.527124
. 0.113685 0.439990 -0.634510
  -1.499184 -0.184280 -0.598371"""
def test_issue834(en_vocab):
    """Regression test for issue #834.

    Wraps the module-level ``word2vec_str`` fixture in an in-memory text
    stream, passes it to ``en_vocab.load_vectors`` and checks that the
    returned vector length is 3.
    """
    vectors_stream = StringIO(word2vec_str)
    n_dimensions = en_vocab.load_vectors(vectors_stream)
    assert n_dimensions == 3

View File

@ -1,17 +1,22 @@
from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from libc.math cimport sqrt
from pathlib import Path
import bz2
import io
import math
import ujson as json
import re
import tempfile
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
@ -24,6 +29,7 @@ from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG
from . import deprecated
from . import util
@ -471,12 +477,9 @@ cdef class Vocab:
cdef attr_t orth
cdef int32_t vec_len = -1
cdef double norm = 0.0
whitespace_pattern = re.compile(r'\s')
for line_num, line in enumerate(file_):
pieces = line.split()
word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
word_str = " " if line.startswith(" ") else pieces.pop(0)
if vec_len == -1:
vec_len = len(pieces)
elif vec_len != len(pieces):