commit 6d301fa06f
Author: Matthew Honnibal
Date:   2016-10-20 21:32:34 +02:00

6 changed files with 11 additions and 17 deletions

setup.py

@@ -183,7 +183,7 @@ def setup_package():
         name=about['__title__'],
         zip_safe=False,
         packages=PACKAGES,
-        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens', 'data']},
+        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']},
         description=about['__summary__'],
         long_description=readme,
         author=about['__author__'],

spacy/about.py

@@ -4,7 +4,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 __title__ = 'spacy'
-__version__ = '1.0.4'
+__version__ = '1.0.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'

spacy/gold.pyx

@@ -10,10 +10,7 @@ from os import path
 from libc.string cimport memset
-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json
 from .syntax import nonproj
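
Dropping the ImportError fallback makes ujson a hard dependency rather than an optional speedup. A minimal sketch of why that matters (illustrative, not from this commit):

import ujson as json  # now a hard requirement, not a best-effort speedup

# ujson implements only a subset of the stdlib json API (no cls or
# object_pairs_hook keywords, for instance), so code written against the
# old fallback could behave differently depending on which module
# happened to be installed. Pinning one parser removes that variable.
data = json.loads('{"text": "caf\\u00e9"}')
assert data['text'] == u'caf\u00e9'
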
@@ -149,7 +146,7 @@ def read_json_file(loc, docs_filter=None):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
-        with open(loc) as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             docs = json.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
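
The io.open() change matters because, without an explicit encoding, Python 3 decodes files with the platform locale. A hedged sketch of the pattern this hunk adopts (the helper name is hypothetical):

import io
import ujson as json

def read_docs(loc):
    # On Python 3 a bare open(loc) decodes with
    # locale.getpreferredencoding() -- cp1252 on many Windows setups --
    # so UTF-8 training data could raise UnicodeDecodeError or silently
    # mis-decode. io.open() with an explicit encoding behaves the same
    # way on Python 2 and 3.
    with io.open(loc, 'r', encoding='utf8') as file_:
        return json.load(file_)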

spacy/lemmatizer.py

@@ -2,10 +2,7 @@ from __future__ import unicode_literals, print_function
 import codecs
 import pathlib
-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
@@ -28,7 +25,7 @@ class Lemmatizer(object):
                     exc[pos] = read_exc(file_)
             else:
                 exc[pos] = {}
-        with (path / 'vocab' / 'lemma_rules.json').open('rb') as file_:
+        with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
             rules = json.load(file_)
         return cls(index, exc, rules)
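
The same fix from the pathlib side: Path.open() forwards its arguments to io.open(), so the old 'rb' mode handed the parser raw bytes. A short sketch (the data path is illustrative):

import pathlib
import ujson as json

path = pathlib.Path('data')  # hypothetical data directory
rules_loc = path / 'vocab' / 'lemma_rules.json'

# 'r' plus encoding='utf8' yields decoded text. The previous 'rb' mode
# handed json.load() raw bytes, leaving the UTF-8 decoding step
# implicit and parser-dependent.
with rules_loc.open('r', encoding='utf8') as file_:
    rules = json.load(file_)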

spacy/matcher.pyx

@@ -175,7 +175,7 @@ cdef class Matcher:
     @classmethod
     def load(cls, path, vocab):
         if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('rb') as file_:
+            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
         else:
             patterns = {}

spacy/vocab.pyx

@@ -57,24 +57,24 @@ cdef class Vocab:
                 "vectors argument to Vocab.load() deprecated. "
                 "Install vectors after loading.")
         if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
-            with (path / 'vocab' / 'tag_map.json').open() as file_:
+            with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
                 tag_map = json.load(file_)
         if lex_attr_getters is not None \
         and oov_prob is True \
         and (path / 'vocab' / 'oov_prob').exists():
-            with (path / 'vocab' / 'oov_prob').open() as file_:
+            with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
                 oov_prob = float(file_.read())
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
         if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open() as file_:
+            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
                 serializer_freqs = json.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
-        with (path / 'vocab' / 'strings.json').open() as file_:
+        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             self.strings.load(file_)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
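
A quick way to see what the bare .open() calls were leaving to chance (illustrative only, not part of the commit):

import locale

# The codec a no-argument open() falls back to on Python 3. Wherever
# this is not UTF-8 (cp1252, latin-1, ...), the bare .open() calls
# above could raise UnicodeDecodeError on non-ASCII vocab entries;
# encoding='utf8' makes Vocab.load() deterministic across platforms.
print(locale.getpreferredencoding())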