From 3fd274264966b394953466ec76cb86104ee43124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Thu, 16 Feb 2017 12:08:07 +0100 Subject: [PATCH] load_vectors should accept arbitrary space characters as word tokens Fix bug #834 --- spacy/vocab.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index cd2b18f81..bff3b5595 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,6 +12,7 @@ import io import math import ujson as json import tempfile +import re from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme @@ -477,9 +478,12 @@ cdef class Vocab: cdef attr_t orth cdef int32_t vec_len = -1 cdef double norm = 0.0 + + whitespace_pattern = re.compile(r'\s') + for line_num, line in enumerate(file_): pieces = line.split() - word_str = " " if line.startswith(" ") else pieces.pop(0) + word_str = " " if whitespace_pattern.match(line) else pieces.pop(0) if vec_len == -1: vec_len = len(pieces) elif vec_len != len(pieces):