commit 6d301fa06f
Author: Matthew Honnibal
Date:   2016-10-20 21:32:34 +02:00

6 changed files with 11 additions and 17 deletions

setup.py

@@ -183,7 +183,7 @@ def setup_package():
         name=about['__title__'],
         zip_safe=False,
         packages=PACKAGES,
-        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens', 'data']},
+        package_data={'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']},
         description=about['__summary__'],
         long_description=readme,
         author=about['__author__'],

spacy/about.py

@@ -4,7 +4,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 __title__ = 'spacy'
-__version__ = '1.0.4'
+__version__ = '1.0.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'

spacy/gold.pyx

@@ -10,10 +10,7 @@ from os import path
 from libc.string cimport memset
-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json
 from .syntax import nonproj
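
Dropping the ImportError fallback makes ujson a hard dependency rather than an optional speedup. A minimal sketch of why that matters (illustrative, not from this commit):

import ujson as json  # now a hard requirement, not a best-effort speedup

# ujson implements only a subset of the stdlib json API (no cls or
# object_pairs_hook keywords, for instance), so code written against the
# old fallback could behave differently depending on which module
# happened to be installed. Pinning one parser removes that variable.
data = json.loads('{"text": "caf\\u00e9"}')
assert data['text'] == u'caf\u00e9'
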
@@ -149,7 +146,7 @@ def read_json_file(loc, docs_filter=None):
         for filename in os.listdir(loc):
             yield from read_json_file(path.join(loc, filename))
     else:
-        with open(loc) as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             docs = json.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):
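
The io.open() change matters because, without an explicit encoding, Python 3 decodes files with the platform locale. A hedged sketch of the pattern this hunk adopts (the helper name is hypothetical):

import io
import ujson as json

def read_docs(loc):
    # On Python 3 a bare open(loc) decodes with
    # locale.getpreferredencoding() -- cp1252 on many Windows setups --
    # so UTF-8 training data could raise UnicodeDecodeError or silently
    # mis-decode. io.open() with an explicit encoding behaves the same
    # way on Python 2 and 3.
    with io.open(loc, 'r', encoding='utf8') as file_:
        return json.load(file_)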

spacy/lemmatizer.py

@@ -2,10 +2,7 @@ from __future__ import unicode_literals, print_function
 import codecs
 import pathlib
-try:
-    import ujson as json
-except ImportError:
-    import json
+import ujson as json
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
@@ -28,7 +25,7 @@ class Lemmatizer(object):
                     exc[pos] = read_exc(file_)
             else:
                 exc[pos] = {}
-        with (path / 'vocab' / 'lemma_rules.json').open('rb') as file_:
+        with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
             rules = json.load(file_)
         return cls(index, exc, rules)
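
The same fix from the pathlib side: Path.open() forwards its arguments to io.open(), so the old 'rb' mode handed the parser raw bytes. A short sketch (the data path is illustrative):

import pathlib
import ujson as json

path = pathlib.Path('data')  # hypothetical data directory
rules_loc = path / 'vocab' / 'lemma_rules.json'

# 'r' plus encoding='utf8' yields decoded text. The previous 'rb' mode
# handed json.load() raw bytes, leaving the UTF-8 decoding step
# implicit and parser-dependent.
with rules_loc.open('r', encoding='utf8') as file_:
    rules = json.load(file_)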

spacy/matcher.pyx

@@ -175,7 +175,7 @@ cdef class Matcher:
     @classmethod
     def load(cls, path, vocab):
         if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('rb') as file_:
+            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
         else:
             patterns = {}

spacy/vocab.pyx

@@ -57,24 +57,24 @@ cdef class Vocab:
                 "vectors argument to Vocab.load() deprecated. "
                 "Install vectors after loading.")
         if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
-            with (path / 'vocab' / 'tag_map.json').open() as file_:
+            with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
                 tag_map = json.load(file_)
         if lex_attr_getters is not None \
         and oov_prob is True \
         and (path / 'vocab' / 'oov_prob').exists():
-            with (path / 'vocab' / 'oov_prob').open() as file_:
+            with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
                 oov_prob = float(file_.read())
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
         if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open() as file_:
+            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
                 serializer_freqs = json.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
-        with (path / 'vocab' / 'strings.json').open() as file_:
+        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             self.strings.load(file_)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
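
A quick way to see what the bare .open() calls were leaving to chance (illustrative only, not part of the commit):

import locale

# The codec a no-argument open() falls back to on Python 3. Wherever
# this is not UTF-8 (cp1252, latin-1, ...), the bare .open() calls
# above could raise UnicodeDecodeError on non-ASCII vocab entries;
# encoding='utf8' makes Vocab.load() deterministic across platforms.
print(locale.getpreferredencoding())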