From 7a519ea5afa0d5f0e68278888afbe061a0cf749b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 22 Feb 2016 00:13:20 +0100
Subject: [PATCH] * Add cautionary note to vocab about encoding

---
 spacy/vocab.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index a0a07f305..349f45b0a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -418,6 +418,8 @@ def write_binary_vectors(in_loc, out_loc):
     with bz2.BZ2File(in_loc, 'r') as file_:
         for line in file_:
             pieces = line.split()
+            # TODO: This should be explicit about the encoding to utf8,
+            # and the fact that len() refers to number of utf8 characters
             word = pieces.pop(0)
             mem = Address(len(pieces), sizeof(float))
             vec = <float*>mem.ptr