* Fix strings I/O by replacing the ujson library with a plain text file, which allows better control of codecs.

2026-03-02 19:01:29 +03:00 · 2014-11-02 13:20:37 +11:00 · 2014-11-02 13:20:37 +11:00 · 437cd2217d
commit 437cd2217d
parent 3352e89e21
1 changed files with 11 additions and 8 deletions
--- a/spacy/utf8string.pyx
+++ b/spacy/utf8string.pyx
@ -1,9 +1,9 @@
 from libc.string cimport memcpy

 from murmurhash.mrmr cimport hash64
+import codecs

-import ujson
-
+SEPARATOR = '\n|-SEP-|\n'

 cdef class StringStore:
    def __init__(self):
@ -61,12 +61,15 @@ cdef class StringStore:
        for i in range(self.size):
            string = &self.strings[i]
            py_string = string.chars[:string.length]
-            strings.append(py_string)
-        with open(loc, 'w') as file_:
-            ujson.dump(strings, file_, ensure_ascii=False)
+            strings.append(py_string.decode('utf8'))
+        with codecs.open(loc, 'w', 'utf8') as file_:
+            file_.write(SEPARATOR.join(strings))

    def load(self, loc):
-        with open(loc) as file_:
-            strings = ujson.load(file_)
+        with codecs.open(loc, 'r', 'utf8') as file_:
+            strings = file_.read().split(SEPARATOR)
+        cdef unicode string
+        cdef bytes byte_string
        for string in strings[1:]:
-            self.intern(string, len(string))
+            byte_string = string.encode('utf8')
+            self.intern(byte_string, len(byte_string))