Use util.Package class for io

The previous Sputnik integration caused an API change: Vocab, Tagger, etc.
were loaded via a from_package classmethod that required a
sputnik.Package instance. This forced users to first create a
sputnik.Sputnik() instance in order to acquire a Package via
sp.pool().

Instead, I've created a small file-system shim, util.Package, which
allows classes to have a .load() classmethod that accepts either
util.Package objects or strings. We can later gut the internals
of this and make it a proxy for Sputnik if we need more functionality
that should live in the Sputnik library.

Sputnik is now only used to download and install the data in
spacy.en.download.
This commit is contained in:
Matthew Honnibal 2015-12-29 18:00:48 +01:00
parent 0e2498da00
commit aec130af56
14 changed files with 97 additions and 67 deletions

View File

@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package from .util import get_package, MockPackage
class Language(object): class Language(object):
@ -142,7 +142,7 @@ class Language(object):
package = get_package() package = get_package()
if get_lex_attr is None: if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs() get_lex_attr = cls.default_lex_attrs()
return Vocab.from_package(package, get_lex_attr=get_lex_attr) return Vocab.load(package, get_lex_attr=get_lex_attr)
@classmethod @classmethod
def default_parser(cls, package, vocab): def default_parser(cls, package, vocab):
@ -182,40 +182,17 @@ class Language(object):
- Language(model='en_default ==1.0.0') - Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0, data_dir='spacy/data') - Language(model='en_default <1.1.0, data_dir='spacy/data')
""" """
# support non-package data dirs package = MockPackage(data_dir)
if data_dir and path.exists(path.join(data_dir, 'vocab')):
class Package(object):
def __init__(self, root):
self.root = root
def has_file(self, *path_parts):
return path.exists(path.join(self.root, *path_parts))
def file_path(self, *path_parts, **kwargs):
return path.join(self.root, *path_parts)
def dir_path(self, *path_parts, **kwargs):
return path.join(self.root, *path_parts)
def load_utf8(self, func, *path_parts, **kwargs):
with io.open(self.file_path(path.join(*path_parts)),
mode='r', encoding='utf8') as f:
return func(f)
warn("using non-package data_dir", DeprecationWarning)
package = Package(data_dir)
else:
package = get_package(name=model, data_path=data_dir)
if load_vectors is not True: if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning) warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True): if vocab in (None, True):
vocab = self.default_vocab(package) vocab = self.default_vocab(package)
self.vocab = vocab self.vocab = vocab
if tokenizer in (None, True): if tokenizer in (None, True):
tokenizer = Tokenizer.from_package(package, self.vocab) tokenizer = Tokenizer.load(package, self.vocab)
self.tokenizer = tokenizer self.tokenizer = tokenizer
if tagger in (None, True): if tagger in (None, True):
tagger = Tagger.from_package(package, self.vocab) tagger = Tagger.load(package, self.vocab)
self.tagger = tagger self.tagger = tagger
if entity in (None, True): if entity in (None, True):
entity = self.default_entity(package, self.vocab) entity = self.default_entity(package, self.vocab)
@ -224,7 +201,7 @@ class Language(object):
parser = self.default_parser(package, self.vocab) parser = self.default_parser(package, self.vocab)
self.parser = parser self.parser = parser
if matcher in (None, True): if matcher in (None, True):
matcher = Matcher.from_package(package, self.vocab) matcher = Matcher.load(package, self.vocab)
self.matcher = matcher self.matcher = matcher
def __reduce__(self): def __reduce__(self):

View File

@ -8,13 +8,13 @@ except ImportError:
import json import json
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
from .util import MockPackage from .util import MockPackage as Package
class Lemmatizer(object): class Lemmatizer(object):
@classmethod @classmethod
def load(cls, pkg_or_str_or_file): def load(cls, pkg_or_str_or_file):
pkg = MockPackage.create_or_return(pkg_or_str_or_file) pkg = Package.create_or_return(pkg_or_str_or_file)
index = {} index = {}
exc = {} exc = {}
for pos in ['adj', 'noun', 'verb']: for pos in ['adj', 'noun', 'verb']:

View File

@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
from .vocab cimport Vocab from .vocab cimport Vocab
from .attrs import FLAG61 as U_ENT from .attrs import FLAG61 as U_ENT
from .util import MockPackage
from .attrs import FLAG60 as B2_ENT from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT from .attrs import FLAG59 as B3_ENT
@ -169,7 +170,8 @@ cdef class Matcher:
cdef object _patterns cdef object _patterns
@classmethod @classmethod
def from_package(cls, package, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
package = MockPackage.create_or_return(pkg_or_str_or_file)
patterns = package.load_utf8(json.load, patterns = package.load_utf8(json.load,
'vocab', 'gazetteer.json', 'vocab', 'gazetteer.json',
default={}) # TODO: really optional? default={}) # TODO: really optional?

View File

@ -88,6 +88,13 @@ cdef class Parser:
model.load(path.join(model_dir, 'model')) model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model) return cls(strings, moves, model)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
# TODO
raise NotImplementedError(
"This should be here, but isn't yet =/. Use Parser.from_dir")
def __reduce__(self): def __reduce__(self):
return (Parser, (self.moves.strings, self.moves, self.model), None, None) return (Parser, (self.moves.strings, self.moves, self.model), None, None)

View File

@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport * from .attrs cimport *
from .util import Package
cpdef enum: cpdef enum:
P2_orth P2_orth
@ -146,7 +148,8 @@ cdef class Tagger:
return cls(vocab, model) return cls(vocab, model)
@classmethod @classmethod
def from_package(cls, package, vocab): def load(cls, pkg_or_str_or_file, vocab):
pkg = Package.create_or_return(pkg_or_str_or_file)
# TODO: templates.json deprecated? not present in latest package # TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates() templates = cls.default_templates()
# templates = package.load_utf8(json.load, # templates = package.load_utf8(json.load,
@ -156,8 +159,9 @@ cdef class Tagger:
model = TaggerModel(vocab.morphology.n_tags, model = TaggerModel(vocab.morphology.n_tags,
ConjunctionExtracter(N_CONTEXT_FIELDS, templates)) ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
if package.has_file('pos', 'model'): # TODO: really optional?
model.load(package.file_path('pos', 'model')) if pkg.has_file('pos', 'model'): # TODO: really optional?
model.load(pkg.file_path('pos', 'model'))
return cls(vocab, model) return cls(vocab, model)

View File

@ -1,11 +1,17 @@
from spacy.en import English from spacy.en import English
import pytest import pytest
import os
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def EN(): def EN():
return English() if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
return English(data_dir=data_path)
def pytest_addoption(parser): def pytest_addoption(parser):

View File

@ -11,6 +11,7 @@ from spacy.vocab import Vocab
from spacy.tokens.doc import Doc from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer from spacy.tokenizer import Tokenizer
from os import path from os import path
import os
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer from spacy.serialize.packer import Packer
@ -20,7 +21,11 @@ from spacy.serialize.bits import BitArray
@pytest.fixture @pytest.fixture
def vocab(): def vocab():
vocab = English.default_vocab() if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
vocab = English.default_vocab(package=data_path)
lex = vocab['dog'] lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog' assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the'] lex = vocab['the']

View File

@ -1,22 +1,27 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import unicode_literals from __future__ import unicode_literals
import os
import io import io
import pickle import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package from spacy.util import get_package, MockPackage
import pytest import pytest
@pytest.fixture @pytest.fixture
def package(): def package():
return get_package() if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
return get_package(data_path=data_path)
@pytest.fixture @pytest.fixture
def lemmatizer(package): def lemmatizer(package):
return Lemmatizer.from_package(package) return Lemmatizer.load(package)
def test_read_index(package): def test_read_index(package):

View File

@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
assert len(tokens) == 2 assert len(tokens) == 2
tokens = en_tokenizer("``We've") tokens = en_tokenizer("``We've")
assert len(tokens) == 3 assert len(tokens) == 3
def test_therell(en_tokenizer):
tokens = en_tokenizer("there'll")
assert len(tokens) == 2
assert tokens[0].text == "there"
assert tokens[1].text == "'ll"

View File

@ -6,7 +6,11 @@ import os
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def nlp(): def nlp():
from spacy.en import English from spacy.en import English
return English() if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
return English(data_dir=data_path)
@pytest.fixture() @pytest.fixture()

View File

@ -10,8 +10,14 @@ def token(doc):
def test_load_resources_and_process_text(): def test_load_resources_and_process_text():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
print("Load EN from %s" % data_path)
from spacy.en import English from spacy.en import English
nlp = English() nlp = English(data_dir=data_path)
doc = nlp('Hello, world. Here are two sentences.') doc = nlp('Hello, world. Here are two sentences.')

View File

@ -15,8 +15,9 @@ from .strings cimport hash_string
cimport cython cimport cython
from . import util from . import util
from .util import read_lang_data
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .util import read_lang_data
from .util import MockPackage as Package
cdef class Tokenizer: cdef class Tokenizer:
@ -41,8 +42,9 @@ cdef class Tokenizer:
return (self.__class__, args, None, None) return (self.__class__, args, None, None)
@classmethod @classmethod
def from_package(cls, package, Vocab vocab): def load(cls, pkg_or_str_or_file, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package) pkg = Package.create_or_return(pkg_or_str_or_file)
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
prefix_re = re.compile(prefix_re) prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re) suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re) infix_re = re.compile(infix_re)

View File

@ -4,6 +4,7 @@ import json
import re import re
import os.path import os.path
from contextlib import contextmanager from contextlib import contextmanager
import types
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
@ -12,10 +13,10 @@ def local_path(subdir):
return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
class MockPackage(object): class Package(object):
@classmethod @classmethod
def create_or_return(cls, me_or_arg): def create_or_return(cls, me_or_arg):
return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
def __init__(self, data_path=None): def __init__(self, data_path=None):
if data_path is None: if data_path is None:
@ -46,15 +47,20 @@ class MockPackage(object):
@contextmanager @contextmanager
def open(self, path_parts, default=IOError): def open(self, path_parts, default=IOError):
if isinstance(default, Exception): if not self.has_file(*path_parts):
raise default if isinstance(default, types.TypeType) and issubclass(default, Exception):
raise default(self.file_path(*path_parts))
# Enter elif isinstance(default, Exception):
file_ = io.open(self.file_path(os.path.join(*path_parts)), raise default
mode='r', encoding='utf8') else:
yield file_ yield default
# Exit else:
file_.close() # Enter
file_ = io.open(self.file_path(os.path.join(*path_parts)),
mode='r', encoding='utf8')
yield file_
# Exit
file_.close()

View File

@ -19,7 +19,7 @@ from .orth cimport word_shape
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile from .cfile cimport CFile
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .util import MockPackage from .util import Package
from . import attrs from . import attrs
from . import symbols from . import symbols
@ -49,24 +49,23 @@ cdef class Vocab:
''' '''
@classmethod @classmethod
def load(cls, pkg_or_str_or_file, get_lex_attr=None): def load(cls, pkg_or_str_or_file, get_lex_attr=None):
package = MockPackage.create_or_return(pkg_or_str_or_file) package = Package.create_or_return(pkg_or_str_or_file)
tag_map = package.load_utf8(json.load, with package.open(('vocab', 'tag_map.json'), default=None) as file_:
'vocab', 'tag_map.json') tag_map = json.load(file_) if file_ is not None else {}
lemmatizer = Lemmatizer.load(package) lemmatizer = Lemmatizer.load(package)
serializer_freqs = package.load_utf8(json.load, with package.open(('vocab', 'serializer.json'), default=None) as file_:
'vocab', 'serializer.json', serializer_freqs = json.load(file_) if file_ is not None else {}
require=False) # TODO: really optional?
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
if package.has_file('vocab', 'strings.json'): # TODO: really optional? with package.open(('vocab', 'strings.json')) as file_:
package.load_utf8(self.strings.load, 'vocab', 'strings.json') self.strings.load(file_)
self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
if package.has_file('vocab', 'vec.bin'): # TODO: really optional? if package.has_file('vocab', 'vec.bin'):
self.vectors_length = self.load_vectors_from_bin_loc( self.vectors_length = self.load_vectors_from_bin_loc(
package.file_path('vocab', 'vec.bin')) package.file_path('vocab', 'vec.bin'))
return self return self