diff --git a/spacy/gold.pyx b/spacy/gold.pyx index aea055ead..985887630 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -10,10 +10,7 @@ from os import path from libc.string cimport memset -try: - import ujson as json -except ImportError: - import json +import ujson as json from .syntax import nonproj @@ -149,7 +146,7 @@ def read_json_file(loc, docs_filter=None): for filename in os.listdir(loc): yield from read_json_file(path.join(loc, filename)) else: - with open(loc) as file_: + with io.open(loc, 'r', encoding='utf8') as file_: docs = json.load(file_) for doc in docs: if docs_filter is not None and not docs_filter(doc): diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 78cd744cb..363e99fc5 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,10 +2,7 @@ from __future__ import unicode_literals, print_function import codecs import pathlib -try: - import ujson as json -except ImportError: - import json +import ujson as json from .parts_of_speech import NOUN, VERB, ADJ, PUNCT diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 493338079..999f9608c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -57,24 +57,24 @@ cdef class Vocab: "vectors argument to Vocab.load() deprecated. " "Install vectors after loading.") if tag_map is True and (path / 'vocab' / 'tag_map.json').exists(): - with (path / 'vocab' / 'tag_map.json').open() as file_: + with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_: tag_map = json.load(file_) if lex_attr_getters is not None \ and oov_prob is True \ and (path / 'vocab' / 'oov_prob').exists(): - with (path / 'vocab' / 'oov_prob').open() as file_: + with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_: oov_prob = float(file_.read()) lex_attr_getters[PROB] = lambda text: oov_prob if lemmatizer is True: lemmatizer = Lemmatizer.load(path) if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists(): - with (path / 'vocab' / 'serializer.json').open() as file_: + with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_: serializer_freqs = json.load(file_) cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - with (path / 'vocab' / 'strings.json').open() as file_: + with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: self.strings.load(file_) self.load_lexemes(path / 'vocab' / 'lexemes.bin') return self