mirror of
https://github.com/explosion/spaCy.git
synced 2025-10-24 12:41:23 +03:00
* Add cautionary note to vocab about encoding
This commit is contained in:
parent
92f62bcb84
commit
7a519ea5af
|
@ -418,6 +418,8 @@ def write_binary_vectors(in_loc, out_loc):
|
|||
with bz2.BZ2File(in_loc, 'r') as file_:
|
||||
for line in file_:
|
||||
pieces = line.split()
|
||||
# TODO: This should be explicit about the encoding to UTF-8,
|
||||
# and the fact that len() refers to the number of UTF-8 characters
|
||||
word = pieces.pop(0)
|
||||
mem = Address(len(pieces), sizeof(float))
|
||||
vec = <float*>mem.ptr
|
||||
|
|
Loading…
Reference in New Issue
Block a user