access model via sputnik

Henning Peters 2015-12-07 06:01:28 +01:00
parent a9fc35d3bf
commit 9027cef3bc
20 changed files with 161 additions and 199 deletions
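In short: model data is no longer loaded from a hard-coded directory (data_dir / LOCAL_DATA_DIR) but resolved through a sputnik package pool (the requirement pin moves to sputnik == 0.6.0), and the component constructors change from from_dir(...) to from_package(...). A minimal sketch of the user-facing effect, assuming a spaCy install with the en_default model package available:

    # Before this commit, callers passed an explicit path:
    #     nlp = English(data_dir=os.environ.get('SPACY_DATA', LOCAL_DATA_DIR))
    # Afterwards the model is located through sputnik instead:
    from spacy.en import English

    nlp = English()   # util.default_package() resolves 'en_default'; SPACY_DATA still overrides the search path
    doc = nlp('Hello, world. Here are two sentences.')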

View File

@@ -10,4 +10,4 @@ plac
six
ujson
cloudpickle
sputnik == 0.5.2
sputnik == 0.6.0

View File

@@ -179,7 +179,7 @@ def run_setup(exts):
license="MIT",
install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
'ujson', 'cloudpickle', 'sputnik == 0.6.0'],
setup_requires=["headers_workaround"],
cmdclass = {'build_ext': build_ext_subclass },
)

View File

@@ -6,6 +6,4 @@ from ..language import Language
class German(Language):
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
pass

View File

@@ -4,8 +4,6 @@ from os import path
from ..language import Language
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
# improved list from Stone, Denis, Kwantes (2010)
STOPWORDS = """
@@ -35,10 +33,6 @@ your yours yourself yourselves
STOPWORDS = set(w for w in STOPWORDS.split() if w)
class English(Language):
@classmethod
def default_data_dir(cls):
return LOCAL_DATA_DIR
@staticmethod
def is_stop(string):
return 1 if string.lower() in STOPWORDS else 0

View File

@@ -6,6 +6,4 @@ from ..language import Language
class Finnish(Language):
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
pass

View File

@@ -6,6 +6,4 @@ from ..language import Language
class Italian(Language):
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
pass

View File

@@ -20,6 +20,7 @@ from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import default_package
class Language(object):
@@ -100,7 +101,7 @@ class Language(object):
return 0
@classmethod
def default_lex_attrs(cls, data_dir=None):
def default_lex_attrs(cls):
return {
attrs.LOWER: cls.lower,
attrs.NORM: cls.norm,
@@ -134,73 +135,42 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_data_dir(cls):
return path.join(path.dirname(__file__), 'data')
@classmethod
def default_vocab(cls, data_dir=None, get_lex_attr=None):
if data_dir is None:
data_dir = cls.default_data_dir()
def default_vocab(cls, package=None, get_lex_attr=None):
if package is None:
package = default_package()
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs(data_dir)
return Vocab.from_dir(
path.join(data_dir, 'vocab'),
get_lex_attr=get_lex_attr)
get_lex_attr = cls.default_lex_attrs()
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
@classmethod
def default_tokenizer(cls, vocab, data_dir):
if path.exists(data_dir):
return Tokenizer.from_dir(vocab, data_dir)
else:
return Tokenizer(vocab, {}, None, None, None)
@classmethod
def default_tagger(cls, vocab, data_dir):
if path.exists(data_dir):
return Tagger.from_dir(data_dir, vocab)
else:
return None
@classmethod
def default_parser(cls, vocab, data_dir):
if path.exists(data_dir):
def default_parser(cls, package, vocab):
data_dir = package.dir_path('data', 'deps')
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
else:
return None
@classmethod
def default_entity(cls, vocab, data_dir):
if path.exists(data_dir):
def default_entity(cls, package, vocab):
data_dir = package.dir_path('data', 'ner')
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
else:
return None
@classmethod
def default_matcher(cls, vocab, data_dir):
if path.exists(data_dir):
return Matcher.from_dir(data_dir, vocab)
else:
return None
def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None,
parser=None, entity=None, matcher=None, serializer=None,
load_vectors=True):
if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning)
if data_dir in (None, True):
data_dir = self.default_data_dir()
if package in (None, True):
package = default_package()
if vocab in (None, True):
vocab = self.default_vocab(data_dir)
vocab = self.default_vocab(package)
if tokenizer in (None, True):
tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
tokenizer = Tokenizer.from_package(package, vocab)
if tagger in (None, True):
tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
tagger = Tagger.from_package(package, vocab)
if entity in (None, True):
entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
entity = self.default_entity(package, vocab)
if parser in (None, True):
parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
parser = self.default_parser(package, vocab)
if matcher in (None, True):
matcher = self.default_matcher(vocab, data_dir=data_dir)
matcher = Matcher.from_package(package, vocab)
self.vocab = vocab
self.tokenizer = tokenizer
self.tagger = tagger
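Taken together, Language.__init__ now threads a single sputnik package object through every component instead of joining sub-directories onto data_dir. A rough sketch of the resulting wiring, using only the classmethod signatures shown above (not the exact call sites):

    from spacy.util import default_package
    from spacy.language import Language

    package = default_package()
    vocab = Language.default_vocab(package)            # Vocab.from_package(package, ...)
    parser = Language.default_parser(package, vocab)   # Parser over package.dir_path('data', 'deps')
    entity = Language.default_entity(package, vocab)   # Parser over package.dir_path('data', 'ner')
    nlp = Language(package=package, vocab=vocab, parser=parser, entity=entity)
    # tokenizer, tagger and matcher are filled in from the same package by __init__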

View File

@@ -12,16 +12,21 @@ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object):
@classmethod
def from_dir(cls, data_dir):
def from_package(cls, package):
index = {}
exc = {}
for pos in ['adj', 'noun', 'verb']:
index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')):
rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8'))
else:
rules = {}
index[pos] = package.load_utf8(read_index,
'data', 'wordnet', 'index.%s' % pos,
default=set()) # TODO: really optional?
exc[pos] = package.load_utf8(read_exc,
'data', 'wordnet', '%s.exc' % pos,
default={}) # TODO: really optional?
rules = package.load_utf8(json.load,
'data', 'vocab', 'lemma_rules.json',
default={}) # TODO: really optional?
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):
@@ -70,11 +75,9 @@ def lemmatize(string, index, exceptions, rules):
return set(forms)
def read_index(loc):
def read_index(fileobj):
index = set()
if not path.exists(loc):
return index
for line in codecs.open(loc, 'r', 'utf8'):
for line in fileobj:
if line.startswith(' '):
continue
pieces = line.split()
@@ -84,11 +87,9 @@ def read_index(loc):
return index
def read_exc(loc):
def read_exc(fileobj):
exceptions = {}
if not path.exists(loc):
return exceptions
for line in codecs.open(loc, 'r', 'utf8'):
for line in fileobj:
if line.startswith(' '):
continue
pieces = line.split()
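The pattern introduced here recurs through the rest of the commit: package.load_utf8(reader, *path_parts, default=...) appears to open the named resource as a UTF-8 file object, pass it to the reader callback, and fall back to the default when the file is missing. Under that assumption, the new lemmatizer tables can be exercised roughly like this (a sketch, not spaCy's own test code):

    from spacy.lemmatizer import Lemmatizer, read_index, read_exc
    from spacy.util import default_package

    package = default_package()
    lemmatizer = Lemmatizer.from_package(package)
    # read_index/read_exc now take an open file object rather than a path:
    index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')
    exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
    assert 'plant' in index and exc['was'] == ('be',)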

View File

@@ -169,14 +169,11 @@ cdef class Matcher:
cdef object _patterns
@classmethod
def from_dir(cls, data_dir, Vocab vocab):
patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
if path.exists(patterns_loc):
patterns_data = open(patterns_loc).read()
patterns = json.loads(patterns_data)
def from_package(cls, package, Vocab vocab):
patterns = package.load_utf8(json.load,
'data', 'vocab', 'gazetteer.json',
default={}) # TODO: really optional?
return cls(vocab, patterns)
else:
return cls(vocab, {})
def __init__(self, vocab, patterns):
self.vocab = vocab

View File

@@ -146,15 +146,17 @@ cdef class Tagger:
return cls(vocab, model)
@classmethod
def from_dir(cls, data_dir, vocab):
if path.exists(path.join(data_dir, 'templates.json')):
templates = json.loads(open(path.join(data_dir, 'templates.json')))
else:
templates = cls.default_templates()
def from_package(cls, package, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = package.load_utf8(json.load,
'data', 'pos', 'templates.json',
default=cls.default_templates())
model = TaggerModel(vocab.morphology.n_tags,
ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
if path.exists(path.join(data_dir, 'model')):
model.load(path.join(data_dir, 'model'))
model.load(package.file_path('data', 'pos', 'model', require=False)) # TODO: really optional?
return cls(vocab, model)
def __init__(self, Vocab vocab, TaggerModel model):

View File

@@ -1,12 +1,11 @@
from spacy.en import English
import pytest
from spacy.en import English, LOCAL_DATA_DIR
import os
@pytest.fixture(scope="session")
def EN():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
return English()
def pytest_addoption(parser):

View File

@@ -10,7 +10,6 @@ from spacy.en import English
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
from spacy.en import LOCAL_DATA_DIR
from os import path
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD

View File

@@ -1,9 +1,8 @@
import pytest
from spacy.en import English, LOCAL_DATA_DIR
from spacy.en import English
import os
@pytest.fixture(scope="session")
def en_nlp():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
return English()

View File

@@ -4,31 +4,33 @@ import io
import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.en import LOCAL_DATA_DIR
from os import path
from spacy.util import default_package
import pytest
def test_read_index():
wn = path.join(LOCAL_DATA_DIR, 'wordnet')
index = read_index(path.join(wn, 'index.noun'))
@pytest.fixture
def package():
return default_package()
@pytest.fixture
def lemmatizer(package):
return Lemmatizer.from_package(package)
def test_read_index(package):
index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')
assert 'man' in index
assert 'plantes' not in index
assert 'plant' in index
def test_read_exc():
wn = path.join(LOCAL_DATA_DIR, 'wordnet')
exc = read_exc(path.join(wn, 'verb.exc'))
def test_read_exc(package):
exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
assert exc['was'] == ('be',)
@pytest.fixture
def lemmatizer():
return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
def test_noun_lemmas(lemmatizer):
do = lemmatizer.noun

View File

@@ -2,16 +2,15 @@ from __future__ import unicode_literals
import pytest
import gc
from spacy.en import English, LOCAL_DATA_DIR
from spacy.en import English
import os
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
# Let this have its own instances, as we have to be careful about memory here
# that's the point, after all
@pytest.mark.models
def get_orphan_token(text, i):
nlp = English(data_dir=data_dir)
nlp = English()
tokens = nlp(text)
gc.collect()
token = tokens[i]
@@ -41,7 +40,7 @@ def _orphan_from_list(toks):
@pytest.mark.models
def test_list_orphans():
# Test case from NSchrading
nlp = English(data_dir=data_dir)
nlp = English()
samples = ["a", "test blah wat okay"]
lst = []
for sample in samples:

View File

@@ -5,9 +5,8 @@ import os
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
from spacy.en import English
return English()
@pytest.fixture()

View File

@@ -10,9 +10,8 @@ def token(doc):
def test_load_resources_and_process_text():
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(data_dir=data_dir)
from spacy.en import English
nlp = English()
doc = nlp('Hello, world. Here are two sentences.')

View File

@@ -41,8 +41,8 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
def from_package(cls, package, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)

View File

@@ -1,10 +1,23 @@
from os import path
import os
import io
import json
import re
from sputnik import Sputnik
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
def default_package():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), 'data'))
sputnik = Sputnik('spacy', '0.99.0') # TODO: retrieve version
pool = sputnik.pool(data_path)
return pool.get('en_default')
def normalize_slice(length, start, stop, step=None):
@@ -31,67 +44,63 @@ def utf8open(loc, mode='r'):
return io.open(loc, mode, encoding='utf8')
def read_lang_data(data_dir):
with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = json.load(file_)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
def read_lang_data(package):
tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json')
prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt')
suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt')
infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt')
return tokenization, prefix, suffix, infix
def read_prefix(data_dir):
with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
entries = file_.read().split('\n')
def read_prefix(fileobj):
entries = fileobj.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
entries = file_.read().split('\n')
def read_suffix(fileobj):
entries = fileobj.read().split('\n')
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix.txt')) as file_:
entries = file_.read().split('\n')
def read_infix(fileobj):
entries = fileobj.read().split('\n')
expression = '|'.join([piece for piece in entries if piece.strip()])
return expression
def read_tokenization(lang):
loc = path.join(DATA_DIR, lang, 'tokenization')
entries = []
seen = set()
with utf8open(loc) as file_:
for line in file_:
line = line.strip()
if line.startswith('#'):
continue
if not line:
continue
pieces = line.split()
chunk = pieces.pop(0)
assert chunk not in seen, chunk
seen.add(chunk)
entries.append((chunk, list(pieces)))
if chunk[0].isalpha() and chunk[0].islower():
chunk = chunk[0].title() + chunk[1:]
pieces[0] = pieces[0][0].title() + pieces[0][1:]
seen.add(chunk)
entries.append((chunk, pieces))
return entries
# def read_tokenization(lang):
# loc = path.join(DATA_DIR, lang, 'tokenization')
# entries = []
# seen = set()
# with utf8open(loc) as file_:
# for line in file_:
# line = line.strip()
# if line.startswith('#'):
# continue
# if not line:
# continue
# pieces = line.split()
# chunk = pieces.pop(0)
# assert chunk not in seen, chunk
# seen.add(chunk)
# entries.append((chunk, list(pieces)))
# if chunk[0].isalpha() and chunk[0].islower():
# chunk = chunk[0].title() + chunk[1:]
# pieces[0] = pieces[0][0].title() + pieces[0][1:]
# seen.add(chunk)
# entries.append((chunk, pieces))
# return entries
def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize')
entries = []
with utf8open(loc) as file_:
for line in file_:
entries.append(line.strip())
return entries
# def read_detoken_rules(lang): # Deprecated?
# loc = path.join(DATA_DIR, lang, 'detokenize')
# entries = []
# with utf8open(loc) as file_:
# for line in file_:
# entries.append(line.strip())
# return entries
def align_tokens(ref, indices): # Deprecated, surely?
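util.default_package() is the piece that ties everything together: it builds a sputnik pool over either $SPACY_DATA or the data directory shipped next to the spacy package, then pulls the installed en_default package out of that pool. A hedged sketch of reading the tokenizer data through it, with the path layout taken from read_lang_data above:

    from spacy.util import default_package, read_lang_data

    package = default_package()   # honours $SPACY_DATA, otherwise uses spacy/data
    # specials.json, prefix.txt, suffix.txt and infix.txt are all read via the package now:
    rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
    print(len(rules), 'special-case tokenization rules loaded')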

View File

@@ -47,28 +47,27 @@ cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@classmethod
def from_dir(cls, data_dir, get_lex_attr=None):
if not path.exists(data_dir):
raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
def from_package(cls, package, get_lex_attr=None):
tag_map = package.load_utf8(json.load,
'data', 'vocab', 'tag_map.json')
lemmatizer = Lemmatizer.from_package(package)
serializer_freqs = package.load_utf8(json.load,
'data', 'vocab', 'serializer.json',
require=False) # TODO: really optional?
tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..'))
if path.exists(path.join(data_dir, 'serializer.json')):
serializer_freqs = json.load(open(path.join(data_dir, 'serializer.json')))
else:
serializer_freqs = None
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
if path.exists(path.join(data_dir, 'strings.json')):
with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_:
self.strings.load(file_)
self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
if package.has_file('data', 'vocab', 'strings.json'): # TODO: really optional?
package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json')
self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin'))
if package.has_file('data', 'vocab', 'vec.bin'): # TODO: really optional?
self.vectors_length = self.load_vectors_from_bin_loc(
package.file_path('data', 'vocab', 'vec.bin'))
if path.exists(path.join(data_dir, 'vec.bin')):
self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin'))
return self
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
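As with the other components, the Vocab is now assembled from package resources (tag_map.json, strings.json, lexemes.bin and, when present, vec.bin) instead of a directory path. A minimal sketch of building one directly, assuming the en_default package is installed:

    from spacy.util import default_package
    from spacy.vocab import Vocab

    package = default_package()
    vocab = Vocab.from_package(package)   # tag map, lemmatizer, strings, lexemes, vectors
    lexeme = vocab[u'hello']              # look up a Lexeme through the loaded string store
    print(vocab.vectors_length)           # vector width, set while loading vec.bin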