Use util.Package class for io

The previous Sputnik integration caused an API change: Vocab, Tagger, etc.
were loaded via a from_package classmethod, which required a
sputnik.Package instance. This forced users to first create a
sputnik.Sputnik() instance in order to acquire a Package via
sp.pool().

Instead I've created a small file-system shim, util.Package, which
allows classes to have a .load() classmethod that accepts either
util.Package objects or strings. We can later gut its internals
and make it a proxy for Sputnik if we need more functionality that
should live in the Sputnik library.

Sputnik is now used only to download and install the data, in
spacy.en.download.
Matthew Honnibal 2015-12-29 18:00:48 +01:00
parent 0e2498da00
commit aec130af56
14 changed files with 97 additions and 67 deletions

View File

@@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
+from .util import get_package, MockPackage

 class Language(object):
@@ -142,7 +142,7 @@ class Language(object):
         package = get_package()
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
-        return Vocab.from_package(package, get_lex_attr=get_lex_attr)
+        return Vocab.load(package, get_lex_attr=get_lex_attr)

     @classmethod
     def default_parser(cls, package, vocab):
@@ -182,40 +182,17 @@ class Language(object):
         - Language(model='en_default ==1.0.0')
         - Language(model='en_default <1.1.0, data_dir='spacy/data')
         """
-        # support non-package data dirs
-        if data_dir and path.exists(path.join(data_dir, 'vocab')):
-            class Package(object):
-                def __init__(self, root):
-                    self.root = root
-
-                def has_file(self, *path_parts):
-                    return path.exists(path.join(self.root, *path_parts))
-
-                def file_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def dir_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def load_utf8(self, func, *path_parts, **kwargs):
-                    with io.open(self.file_path(path.join(*path_parts)),
-                                 mode='r', encoding='utf8') as f:
-                        return func(f)
-
-            warn("using non-package data_dir", DeprecationWarning)
-            package = Package(data_dir)
-        else:
-            package = get_package(name=model, data_path=data_dir)
+        package = MockPackage(data_dir)
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
             vocab = self.default_vocab(package)
         self.vocab = vocab
         if tokenizer in (None, True):
-            tokenizer = Tokenizer.from_package(package, self.vocab)
+            tokenizer = Tokenizer.load(package, self.vocab)
         self.tokenizer = tokenizer
         if tagger in (None, True):
-            tagger = Tagger.from_package(package, self.vocab)
+            tagger = Tagger.load(package, self.vocab)
         self.tagger = tagger
         if entity in (None, True):
             entity = self.default_entity(package, self.vocab)
@@ -224,7 +201,7 @@ class Language(object):
             parser = self.default_parser(package, self.vocab)
         self.parser = parser
         if matcher in (None, True):
-            matcher = Matcher.from_package(package, self.vocab)
+            matcher = Matcher.load(package, self.vocab)
         self.matcher = matcher

     def __reduce__(self):
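With the nested shim class gone, Language.__init__ now funnels every
data_dir through MockPackage (a data_dir of None falls back to the
default data path), so callers follow the same pattern as the updated
test fixtures below (a sketch):

    import os
    from spacy.en import English

    # Use $SPACY_DATA when set, otherwise the default packaged data.
    data_path = os.environ.get('SPACY_DATA') or None
    nlp = English(data_dir=data_path)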

View File

@@ -8,13 +8,13 @@ except ImportError:
     import json

 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import MockPackage
+from .util import MockPackage as Package

 class Lemmatizer(object):
     @classmethod
     def load(cls, pkg_or_str_or_file):
-        pkg = MockPackage.create_or_return(pkg_or_str_or_file)
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:

View File

@@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 from .attrs import FLAG61 as U_ENT
+from .util import MockPackage

 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT

@@ -169,7 +170,8 @@ cdef class Matcher:
     cdef object _patterns

     @classmethod
-    def from_package(cls, package, Vocab vocab):
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        package = MockPackage.create_or_return(pkg_or_str_or_file)
         patterns = package.load_utf8(json.load,
                                      'vocab', 'gazetteer.json',
                                      default={})  # TODO: really optional?

View File

@@ -88,6 +88,13 @@ cdef class Parser:
         model.load(path.join(model_dir, 'model'))
         return cls(strings, moves, model)

+    @classmethod
+    def load(cls, pkg_or_str_or_file, vocab):
+        # TODO
+        raise NotImplementedError(
+                "This should be here, but isn't yet =/. Use Parser.from_dir")
+
     def __reduce__(self):
         return (Parser, (self.moves.strings, self.moves, self.model), None, None)

View File

@@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .attrs cimport *
+from .util import Package
+

 cpdef enum:
     P2_orth

@@ -146,7 +148,8 @@ cdef class Tagger:
         return cls(vocab, model)

     @classmethod
-    def from_package(cls, package, vocab):
+    def load(cls, pkg_or_str_or_file, vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,

@@ -156,8 +159,9 @@ cdef class Tagger:
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if package.has_file('pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('pos', 'model'))
+        if pkg.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(pkg.file_path('pos', 'model'))
         return cls(vocab, model)

View File

@@ -1,11 +1,17 @@
 from spacy.en import English
 import pytest
+import os

 @pytest.fixture(scope="session")
 def EN():
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
+    return English(data_dir=data_path)

 def pytest_addoption(parser):

View File

@@ -11,6 +11,7 @@ from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
 from os import path
+import os

 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer

@@ -20,7 +21,11 @@ from spacy.serialize.bits import BitArray
 @pytest.fixture
 def vocab():
-    vocab = English.default_vocab()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    vocab = English.default_vocab(package=data_path)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']

View File

@@ -1,22 +1,27 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import os

 import io
 import pickle

 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy.util import get_package, MockPackage

 import pytest

 @pytest.fixture
 def package():
-    return get_package()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return get_package(data_path=data_path)

 @pytest.fixture
 def lemmatizer(package):
-    return Lemmatizer.from_package(package)
+    return Lemmatizer.load(package)

 def test_read_index(package):

View File

@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"

View File

@@ -6,7 +6,11 @@ import os

 @pytest.fixture(scope='session')
 def nlp():
     from spacy.en import English
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return English(data_dir=data_path)

 @pytest.fixture()

View File

@@ -10,8 +10,14 @@ def token(doc):

 def test_load_resources_and_process_text():
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
     from spacy.en import English
-    nlp = English()
+    nlp = English(data_dir=data_path)
     doc = nlp('Hello, world. Here are two sentences.')

View File

@@ -15,8 +15,9 @@ from .strings cimport hash_string
 cimport cython

 from . import util
-from .util import read_lang_data
 from .tokens.doc cimport Doc
+from .util import read_lang_data
+from .util import MockPackage as Package

 cdef class Tokenizer:

@@ -41,8 +42,9 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)

     @classmethod
-    def from_package(cls, package, Vocab vocab):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)

View File

@@ -4,6 +4,7 @@ import json
 import re
 import os.path
 from contextlib import contextmanager
+import types

 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

@@ -12,10 +13,10 @@ def local_path(subdir):
     return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))

-class MockPackage(object):
+class Package(object):
     @classmethod
     def create_or_return(cls, me_or_arg):
-        return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg
+        return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

     def __init__(self, data_path=None):
         if data_path is None:

@@ -46,15 +47,20 @@ class MockPackage(object):
     @contextmanager
     def open(self, path_parts, default=IOError):
-        if isinstance(default, Exception):
-            raise default
-        # Enter
-        file_ = io.open(self.file_path(os.path.join(*path_parts)),
-                        mode='r', encoding='utf8')
-        yield file_
-        # Exit
-        file_.close()
+        if not self.has_file(*path_parts):
+            if isinstance(default, types.TypeType) and issubclass(default, Exception):
+                raise default(self.file_path(*path_parts))
+            elif isinstance(default, Exception):
+                raise default
+            else:
+                yield default
+        else:
+            # Enter
+            file_ = io.open(self.file_path(os.path.join(*path_parts)),
+                            mode='r', encoding='utf8')
+            yield file_
+            # Exit
+            file_.close()
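The rewritten open() keys three behaviours for a missing file off the
type of default: an Exception subclass (the default, IOError) is
instantiated with the file path and raised, an Exception instance is
raised as-is, and any other value is yielded to the with-block in
place of a file handle. (types.TypeType is the Python 2 spelling of
type.) A sketch of the intended usage, with illustrative paths:

    import json
    from spacy.util import Package

    pkg = Package('/path/to/data/en_default')

    # Required file: a missing path raises IOError(path).
    with pkg.open(('vocab', 'strings.json')) as file_:
        strings_data = file_.read()

    # Optional file: a missing path yields the sentinel instead.
    with pkg.open(('vocab', 'serializer.json'), default=None) as file_:
        freqs = json.load(file_) if file_ is not None else {}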

View File

@@ -19,7 +19,7 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
-from .util import MockPackage
+from .util import Package

 from . import attrs
 from . import symbols

@@ -49,24 +49,23 @@ cdef class Vocab:
     '''
     @classmethod
     def load(cls, pkg_or_str_or_file, get_lex_attr=None):
-        package = MockPackage.create_or_return(pkg_or_str_or_file)
-        tag_map = package.load_utf8(json.load,
-                                    'vocab', 'tag_map.json')
+        package = Package.create_or_return(pkg_or_str_or_file)
+        with package.open(('vocab', 'tag_map.json'), default=None) as file_:
+            tag_map = json.load(file_) if file_ is not None else {}

         lemmatizer = Lemmatizer.load(package)

-        serializer_freqs = package.load_utf8(json.load,
-                                             'vocab', 'serializer.json',
-                                             require=False)  # TODO: really optional?
+        with package.open(('vocab', 'serializer.json'), default=None) as file_:
+            serializer_freqs = json.load(file_) if file_ is not None else {}

         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)

-        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
-            self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
+        with package.open(('vocab', 'strings.json')) as file_:
+            self.strings.load(file_)
+        self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))

-        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):
             self.vectors_length = self.load_vectors_from_bin_loc(
                 package.file_path('vocab', 'vec.bin'))
         return self
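Putting it together, components can now be loaded individually from a
plain directory path (a minimal sketch; the path is illustrative, and
Parser.load is still the stub shown above):

    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    data_dir = '/path/to/data/en_default'
    vocab = Vocab.load(data_dir)             # str -> Package via create_or_return
    tokenizer = Tokenizer.load(data_dir, vocab)
    doc = tokenizer(u"there'll be an example")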