mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 13:47:13 +03:00
* Add cautionary note to vocab about encoding
This commit is contained in:
parent
92f62bcb84
commit
7a519ea5af
|
@@ -418,6 +418,8 @@ def write_binary_vectors(in_loc, out_loc):
|
||||||
with bz2.BZ2File(in_loc, 'r') as file_:
|
with bz2.BZ2File(in_loc, 'r') as file_:
|
||||||
for line in file_:
|
for line in file_:
|
||||||
pieces = line.split()
|
pieces = line.split()
|
||||||
|
# TODO: This should be explicit about the encoding to utf8,
|
||||||
|
# and the fact that len() refers to number of utf8 characters
|
||||||
word = pieces.pop(0)
|
word = pieces.pop(0)
|
||||||
mem = Address(len(pieces), sizeof(float))
|
mem = Address(len(pieces), sizeof(float))
|
||||||
vec = <float*>mem.ptr
|
vec = <float*>mem.ptr
|
||||||
|
|
Loading…
Reference in New Issue
Block a user