diff --git a/bin/init_model.py b/bin/init_model.py
index 599cd3083..72d7a3aae 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -27,8 +27,8 @@
 from pathlib import Path
 from shutil import copyfile
 from shutil import copytree
-import codecs
 from collections import defaultdict
+import io
 
 from spacy.vocab import Vocab
 from spacy.vocab import write_binary_vectors
@@ -61,7 +61,7 @@ def _read_clusters(loc):
         print("Warning: Clusters file not found")
         return {}
     clusters = {}
-    for line in codecs.open(str(loc), 'r', 'utf8'):
+    for line in io.open(str(loc), 'r', encoding='utf8'):
         try:
             cluster, word, freq = line.split()
         except ValueError:
@@ -88,7 +88,7 @@ def _read_probs(loc):
         print("Probabilities file not found. Trying freqs.")
         return {}, 0.0
     probs = {}
-    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
+    for i, line in enumerate(io.open(str(loc), 'r', encoding='utf8')):
         prob, word = line.split()
         prob = float(prob)
         probs[word] = prob
diff --git a/bin/ner_tag.py b/bin/ner_tag.py
index 34588bd12..f990f21a1 100644
--- a/bin/ner_tag.py
+++ b/bin/ner_tag.py
@@ -1,11 +1,11 @@
-import codecs
+import io
 import plac
 
 from spacy.en import English
 
 
 def main(text_loc):
-    with codecs.open(text_loc, 'r', 'utf8') as file_:
+    with io.open(text_loc, 'r', encoding='utf8') as file_:
         text = file_.read()
     NLU = English()
     for paragraph in text.split('\n\n'):
diff --git a/bin/parser/train.py b/bin/parser/train.py
index c1f81af33..0a9d34ffc 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import os
 from os import path
 import shutil
-import codecs
+import io
 import random
 
 import plac
@@ -177,7 +177,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
     nlp = Language(data_dir=model_dir)
     gold_tuples = read_json_file(dev_loc)
     scorer = Scorer()
-    out_file = codecs.open(out_loc, 'w', 'utf8')
+    out_file = io.open(out_loc, 'w', encoding='utf8')
     for raw_text, sents in gold_tuples:
         sents = _merge_sents(sents)
         for annot_tuples, brackets in sents:
diff --git a/bin/prepare_treebank.py b/bin/prepare_treebank.py
index d13ef7130..f9f4eec21 100644
--- a/bin/prepare_treebank.py
+++ b/bin/prepare_treebank.py
@@ -27,7 +27,7 @@ import json
 from os import path
 import os
 import re
-import codecs
+import io
 from collections import defaultdict
 
 from spacy.munge import read_ptb
@@ -122,7 +122,7 @@ def read_file(*pieces):
     if not path.exists(loc):
         return None
     else:
-        return codecs.open(loc, 'r', 'utf8').read().strip()
+        return io.open(loc, 'r', encoding='utf8').read().strip()
 
 
 def get_file_names(section_dir, subsection):
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 5d0ad36c0..d8b100744 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,5 +1,7 @@
 import numpy
-import codecs
+import io
+import json
+import ujson
 import random
 import re
 import os
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a4a470158..a247fa6a8 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -1,5 +1,5 @@
 from __future__ import unicode_literals
-import codecs
+import io
 
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash64
@@ -129,17 +129,16 @@ cdef class StringStore:
     def dump(self, loc):
         cdef Utf8Str* string
-        cdef unicode py_string
-        cdef int i
-        with codecs.open(loc, 'w', 'utf8') as file_:
-            for i in range(self.size):
-                string = &self.strings[i]
-                py_string = string.chars[:string.length]
-                file_.write(py_string)
-                if (i+1) != self.size:
-                    file_.write(SEPARATOR)
+        cdef bytes py_string
+        strings = []
+        for i in range(self.size):
+            string = &self.strings[i]
+            py_string = string.chars[:string.length]
+            strings.append(py_string.decode('utf8'))
+        with io.open(loc, 'w', encoding='utf8') as file_:
+            file_.write(SEPARATOR.join(strings))
 
     def load(self, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             strings = file_.read().split(SEPARATOR)
         if strings == ['']:
             return None
diff --git a/spacy/util.py b/spacy/util.py
index 9f5b4fe04..93a67c66e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,5 +1,5 @@
 from os import path
-import codecs
+import io
 import json
 import re
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
@@ -8,7 +8,7 @@ DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
 
 
 def utf8open(loc, mode='r'):
-    return codecs.open(loc, mode, 'utf8')
+    return io.open(loc, mode, encoding='utf8')
 
 
 def read_lang_data(data_dir):
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 2cc9094eb..d79da8a79 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -7,7 +7,7 @@ from libc.stdint cimport uint64_t
 
 import bz2
 from os import path
-import codecs
+import io
 import math
 import json
 
diff --git a/tests/parser/test_parse_navigate.py b/tests/parser/test_parse_navigate.py
index a1c8b1a87..8c76199f4 100644
--- a/tests/parser/test_parse_navigate.py
+++ b/tests/parser/test_parse_navigate.py
@@ -1,13 +1,13 @@
 from __future__ import unicode_literals
 from os import path
-import codecs
+import io
 
 import pytest
 
 
 @pytest.fixture
 def sun_text():
-    with codecs.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', 'utf8') as file_:
+    with io.open(path.join(path.dirname(__file__), '..', 'sun.txt'), 'r', encoding='utf8') as file_:
         text = file_.read()
     return text
 
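
Note on the API difference this patch leans on: codecs.open(filename, mode, encoding) takes the encoding as its third positional argument, whereas io.open(file, mode, buffering, encoding=None, ...) reserves the third positional slot for buffering, so the encoding has to be passed as a keyword. A minimal sketch of the translation, using a placeholder example.txt path that is not part of this repository:

    import codecs
    import io

    # Old style: the third positional argument is the encoding.
    with codecs.open('example.txt', 'w', 'utf8') as file_:
        file_.write(u'caf\xe9')

    # New style: the encoding must be a keyword argument.
    # io.open('example.txt', 'w', 'utf8') would raise TypeError,
    # because 'utf8' is not a valid buffering value.
    with io.open('example.txt', 'w', encoding='utf8') as file_:
        file_.write(u'caf\xe9')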
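
The rewritten StringStore.dump/load pair serializes the whole string table as one SEPARATOR-joined text file and splits it back apart on load. A rough pure-Python sketch of that round trip, assuming a placeholder SEPARATOR value (the real delimiter is defined elsewhere in strings.pyx and is not shown in this patch):

    import io

    SEPARATOR = '\n'  # placeholder for illustration only

    def dump(strings, loc):
        # Write every entry in a single call, delimiter-joined.
        with io.open(loc, 'w', encoding='utf8') as file_:
            file_.write(SEPARATOR.join(strings))

    def load(loc):
        # Split the file back into entries; an empty file yields no strings.
        with io.open(loc, 'r', encoding='utf8') as file_:
            strings = file_.read().split(SEPARATOR)
        return None if strings == [''] else strings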