From aec130af560b1da67a50beaa43366e09a6a3b9d6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 29 Dec 2015 18:00:48 +0100
Subject: [PATCH] Use util.Package class for io

The previous Sputnik integration caused an API change: Vocab, Tagger, etc.
were loaded via a from_package classmethod that required a sputnik.Package
instance. This forced users to first create a sputnik.Sputnik() instance in
order to acquire a Package via sp.pool().

Instead I've created a small file-system shim, util.Package, which allows
classes to have a .load() classmethod that accepts either util.Package
objects or strings. We can later gut its internals and make it a proxy for
Sputnik if we need more functionality that should live in the Sputnik
library.

Sputnik is now only used to download and install the data, in
spacy.en.download.
---
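Intended usage, roughly (illustrative note, not part of the patch itself;
'/path/to/spacy/data' below is a placeholder for wherever the data lives):

    from spacy.util import Package
    from spacy.vocab import Vocab
    from spacy.tagger import Tagger

    data_dir = '/path/to/spacy/data'    # hypothetical install location

    # Either hand the loaders a Package explicitly...
    vocab = Vocab.load(Package(data_dir))
    # ...or just a path string; Package.create_or_return() wraps strings.
    vocab = Vocab.load(data_dir)
    tagger = Tagger.load(data_dir, vocab)
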
 spacy/language.py                          | 35 ++++------------------
 spacy/lemmatizer.py                        |  4 +--
 spacy/matcher.pyx                          |  4 ++-
 spacy/syntax/parser.pyx                    |  7 +++++
 spacy/tagger.pyx                           | 10 +++++--
 spacy/tests/conftest.py                    |  8 ++++-
 spacy/tests/serialize/test_packer.py       |  7 ++++-
 spacy/tests/tagger/test_lemmatizer.py      | 11 +++++--
 spacy/tests/tokenizer/test_contractions.py |  7 +++++
 spacy/tests/website/conftest.py            |  6 +++-
 spacy/tests/website/test_home.py           |  8 ++++-
 spacy/tokenizer.pyx                        |  8 +++--
 spacy/util.py                              | 28 ++++++++++-------
 spacy/vocab.pyx                            | 21 +++++++------
 14 files changed, 97 insertions(+), 67 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index c992335b3..d16032c47 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
+from .util import get_package, MockPackage
 
 
 class Language(object):
@@ -142,7 +142,7 @@ class Language(object):
         package = get_package()
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
-        return Vocab.from_package(package, get_lex_attr=get_lex_attr)
+        return Vocab.load(package, get_lex_attr=get_lex_attr)
 
     @classmethod
     def default_parser(cls, package, vocab):
@@ -182,40 +182,17 @@ class Language(object):
         - Language(model='en_default ==1.0.0')
         - Language(model='en_default <1.1.0, data_dir='spacy/data')
         """
-        # support non-package data dirs
-        if data_dir and path.exists(path.join(data_dir, 'vocab')):
-            class Package(object):
-                def __init__(self, root):
-                    self.root = root
-
-                def has_file(self, *path_parts):
-                    return path.exists(path.join(self.root, *path_parts))
-
-                def file_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def dir_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def load_utf8(self, func, *path_parts, **kwargs):
-                    with io.open(self.file_path(path.join(*path_parts)),
-                                 mode='r', encoding='utf8') as f:
-                        return func(f)
-
-            warn("using non-package data_dir", DeprecationWarning)
-            package = Package(data_dir)
-        else:
-            package = get_package(name=model, data_path=data_dir)
+        package = MockPackage(data_dir)
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
             vocab = self.default_vocab(package)
         self.vocab = vocab
         if tokenizer in (None, True):
-            tokenizer = Tokenizer.from_package(package, self.vocab)
+            tokenizer = Tokenizer.load(package, self.vocab)
         self.tokenizer = tokenizer
         if tagger in (None, True):
-            tagger = Tagger.from_package(package, self.vocab)
+            tagger = Tagger.load(package, self.vocab)
         self.tagger = tagger
         if entity in (None, True):
             entity = self.default_entity(package, self.vocab)
@@ -224,7 +201,7 @@ class Language(object):
             parser = self.default_parser(package, self.vocab)
         self.parser = parser
         if matcher in (None, True):
-            matcher = Matcher.from_package(package, self.vocab)
+            matcher = Matcher.load(package, self.vocab)
         self.matcher = matcher
 
     def __reduce__(self):
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 7cd37a331..2362a7842 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -8,13 +8,13 @@ except ImportError:
     import json
 
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import MockPackage
+from .util import MockPackage as Package
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, pkg_or_str_or_file):
-        pkg = MockPackage.create_or_return(pkg_or_str_or_file)
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 4d36b7742..6c70a6f68 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 
 from .attrs import FLAG61 as U_ENT
+from .util import MockPackage
 
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@@ -169,7 +170,8 @@ cdef class Matcher:
     cdef object _patterns
 
     @classmethod
-    def from_package(cls, package, Vocab vocab):
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        package = MockPackage.create_or_return(pkg_or_str_or_file)
         patterns = package.load_utf8(json.load,
                                      'vocab', 'gazetteer.json',
                                      default={})  # TODO: really optional?
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index dd4fb3bea..c29d59758 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -88,6 +88,13 @@ cdef class Parser:
         model.load(path.join(model_dir, 'model'))
         return cls(strings, moves, model)
 
+    @classmethod
+    def load(cls, pkg_or_str_or_file, vocab):
+        # TODO
+        raise NotImplementedError(
+            "This should be here, but isn't yet =/. Use Parser.from_dir")
+
+
     def __reduce__(self):
         return (Parser, (self.moves.strings, self.moves, self.model),
                 None, None)
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 2c05b4a84..decf918d8 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 
+from .util import Package
+
 
 cpdef enum:
     P2_orth
@@ -146,7 +148,8 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def from_package(cls, package, vocab):
+    def load(cls, pkg_or_str_or_file, vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,
@@ -156,8 +159,9 @@ cdef class Tagger:
 
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if package.has_file('pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('pos', 'model'))
+
+        if pkg.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(pkg.file_path('pos', 'model'))
         return cls(vocab, model)
 
 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 03e728a12..b8a620d88 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,11 +1,17 @@
 from spacy.en import English
 
 import pytest
+import os
 
 
 @pytest.fixture(scope="session")
 def EN():
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
+    return English(data_dir=data_path)
 
 
 def pytest_addoption(parser):
diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py
index 0e13b2de5..1d3b12117 100644
--- a/spacy/tests/serialize/test_packer.py
+++ b/spacy/tests/serialize/test_packer.py
@@ -11,6 +11,7 @@ from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
 from os import path
+import os
 
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer
@@ -20,7 +21,11 @@ from spacy.serialize.bits import BitArray
 
 @pytest.fixture
 def vocab():
-    vocab = English.default_vocab()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    vocab = English.default_vocab(package=data_path)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index e25fbe199..ebcc4e881 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -1,22 +1,27 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import os
 import io
 import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy.util import get_package, MockPackage
 
 import pytest
 
 
 @pytest.fixture
 def package():
-    return get_package()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return get_package(data_path=data_path)
 
 
 @pytest.fixture
 def lemmatizer(package):
-    return Lemmatizer.from_package(package)
+    return Lemmatizer.load(package)
 
 
 def test_read_index(package):
diff --git a/spacy/tests/tokenizer/test_contractions.py b/spacy/tests/tokenizer/test_contractions.py
index ea93ff8b4..76597ec5a 100644
--- a/spacy/tests/tokenizer/test_contractions.py
+++ b/spacy/tests/tokenizer/test_contractions.py
@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"
diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py
index b4934d20b..d7b4b3252 100644
--- a/spacy/tests/website/conftest.py
+++ b/spacy/tests/website/conftest.py
@@ -6,7 +6,11 @@ import os
 @pytest.fixture(scope='session')
 def nlp():
     from spacy.en import English
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return English(data_dir=data_path)
 
 
 @pytest.fixture()
diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py
index 5317e7e56..ef13b4677 100644
--- a/spacy/tests/website/test_home.py
+++ b/spacy/tests/website/test_home.py
@@ -10,8 +10,14 @@ def token(doc):
 
 
 def test_load_resources_and_process_text():
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
+
     from spacy.en import English
-    nlp = English()
+    nlp = English(data_dir=data_path)
     doc = nlp('Hello, world. Here are two sentences.')
 
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 345734682..b90945678 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -15,8 +15,9 @@ from .strings cimport hash_string
 cimport cython
 
 from . import util
-from .util import read_lang_data
 from .tokens.doc cimport Doc
+from .util import read_lang_data
+from .util import MockPackage as Package
 
 
 cdef class Tokenizer:
@@ -41,8 +42,9 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
     @classmethod
-    def from_package(cls, package, Vocab vocab):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
diff --git a/spacy/util.py b/spacy/util.py
index 2b6f50a6b..74558c59c 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -4,6 +4,7 @@ import json
 import re
 import os.path
 from contextlib import contextmanager
+import types
 
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
@@ -12,10 +13,10 @@ def local_path(subdir):
     return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
 
 
-class MockPackage(object):
+class Package(object):
     @classmethod
     def create_or_return(cls, me_or_arg):
-        return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg
+        return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)
 
     def __init__(self, data_path=None):
         if data_path is None:
@@ -46,15 +47,20 @@ class MockPackage(object):
 
     @contextmanager
     def open(self, path_parts, default=IOError):
-        if isinstance(default, Exception):
-            raise default
-
-        # Enter
-        file_ = io.open(self.file_path(os.path.join(*path_parts)),
-                        mode='r', encoding='utf8')
-        yield file_
-        # Exit
-        file_.close()
+        if not self.has_file(*path_parts):
+            if isinstance(default, types.TypeType) and issubclass(default, Exception):
+                raise default(self.file_path(*path_parts))
+            elif isinstance(default, Exception):
+                raise default
+            else:
+                yield default
+        else:
+            # Enter
+            file_ = io.open(self.file_path(os.path.join(*path_parts)),
+                            mode='r', encoding='utf8')
+            yield file_
+            # Exit
+            file_.close()
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ab0a522b1..1444f767e 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -19,7 +19,7 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
-from .util import MockPackage
+from .util import Package
 
 from . import attrs
 from . import symbols
@@ -49,24 +49,23 @@ cdef class Vocab:
     '''
     @classmethod
    def load(cls, pkg_or_str_or_file, get_lex_attr=None):
-        package = MockPackage.create_or_return(pkg_or_str_or_file)
-        tag_map = package.load_utf8(json.load,
-                                    'vocab', 'tag_map.json')
+        package = Package.create_or_return(pkg_or_str_or_file)
+        with package.open(('vocab', 'tag_map.json'), default=None) as file_:
+            tag_map = json.load(file_) if file_ is not None else {}
         lemmatizer = Lemmatizer.load(package)
-        serializer_freqs = package.load_utf8(json.load,
-                                             'vocab', 'serializer.json',
-                                             require=False)  # TODO: really optional?
+        with package.open(('vocab', 'serializer.json'), default=None) as file_:
+            serializer_freqs = json.load(file_) if file_ is not None else {}
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
-            self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
+        with package.open(('vocab', 'strings.json')) as file_:
+            self.strings.load(file_)
+        self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
 
-        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):
             self.vectors_length = self.load_vectors_from_bin_loc(
                 package.file_path('vocab', 'vec.bin'))
         return self