Use util.Package class for io
The previous Sputnik integration caused an API change: Vocab, Tagger, etc. were loaded via a from_package classmethod that required a sputnik.Package instance. This forced users to first create a sputnik.Sputnik() instance in order to acquire a Package via sp.pool(). Instead, I've created a small file-system shim, util.Package, which allows classes to have a .load() classmethod that accepts either util.Package objects or strings. We can later gut its internals and make it a proxy for Sputnik if we need more functionality that should live in the Sputnik library. Sputnik is now only used to download and install the data, in spacy.en.download.
parent 0e2498da00
commit aec130af56
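As a quick illustration of the API described above, loading a component now looks roughly like this. A sketch only, based on the diff below, with a hypothetical data path:

    from spacy.vocab import Vocab
    from spacy.util import Package

    # .load() accepts a plain path string; strings are wrapped internally
    # via Package.create_or_return().
    vocab = Vocab.load('/path/to/en_default/data')

    # Or pass an explicit util.Package wrapping a data directory.
    pkg = Package('/path/to/en_default/data')
    vocab = Vocab.load(pkg, get_lex_attr=None)
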
@@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
+from .util import get_package, MockPackage


 class Language(object):

@@ -142,7 +142,7 @@ class Language(object):
             package = get_package()
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
-        return Vocab.from_package(package, get_lex_attr=get_lex_attr)
+        return Vocab.load(package, get_lex_attr=get_lex_attr)

     @classmethod
     def default_parser(cls, package, vocab):

@@ -182,40 +182,17 @@ class Language(object):
         - Language(model='en_default ==1.0.0')
         - Language(model='en_default <1.1.0, data_dir='spacy/data')
         """
-        # support non-package data dirs
-        if data_dir and path.exists(path.join(data_dir, 'vocab')):
-            class Package(object):
-                def __init__(self, root):
-                    self.root = root
-
-                def has_file(self, *path_parts):
-                    return path.exists(path.join(self.root, *path_parts))
-
-                def file_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def dir_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def load_utf8(self, func, *path_parts, **kwargs):
-                    with io.open(self.file_path(path.join(*path_parts)),
-                                 mode='r', encoding='utf8') as f:
-                        return func(f)
-
-            warn("using non-package data_dir", DeprecationWarning)
-            package = Package(data_dir)
-        else:
-            package = get_package(name=model, data_path=data_dir)
+        package = MockPackage(data_dir)
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
             vocab = self.default_vocab(package)
         self.vocab = vocab
         if tokenizer in (None, True):
-            tokenizer = Tokenizer.from_package(package, self.vocab)
+            tokenizer = Tokenizer.load(package, self.vocab)
         self.tokenizer = tokenizer
         if tagger in (None, True):
-            tagger = Tagger.from_package(package, self.vocab)
+            tagger = Tagger.load(package, self.vocab)
         self.tagger = tagger
         if entity in (None, True):
             entity = self.default_entity(package, self.vocab)

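Note the simplification above: the inline Package shim that special-cased bare data directories is gone, and both installed packages and plain directories now go through util.MockPackage. From the caller's side nothing changes; a sketch with a hypothetical path:

    from spacy.en import English

    # A bare data dir and an installed package now take the same code path,
    # via util.MockPackage(data_dir).
    nlp = English(data_dir='/path/to/spacy/data')
    doc = nlp(u'Hello, world. Here are two sentences.')
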
@@ -224,7 +201,7 @@ class Language(object):
             parser = self.default_parser(package, self.vocab)
         self.parser = parser
         if matcher in (None, True):
-            matcher = Matcher.from_package(package, self.vocab)
+            matcher = Matcher.load(package, self.vocab)
         self.matcher = matcher

     def __reduce__(self):

@@ -8,13 +8,13 @@ except ImportError:
     import json

 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import MockPackage
+from .util import MockPackage as Package


 class Lemmatizer(object):
     @classmethod
     def load(cls, pkg_or_str_or_file):
-        pkg = MockPackage.create_or_return(pkg_or_str_or_file)
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:

@@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
 from .vocab cimport Vocab

 from .attrs import FLAG61 as U_ENT
+from .util import MockPackage

 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT

@@ -169,7 +170,8 @@ cdef class Matcher:
     cdef object _patterns

     @classmethod
-    def from_package(cls, package, Vocab vocab):
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        package = MockPackage.create_or_return(pkg_or_str_or_file)
         patterns = package.load_utf8(json.load,
                                      'vocab', 'gazetteer.json',
                                      default={})  # TODO: really optional?

@@ -88,6 +88,13 @@ cdef class Parser:
         model.load(path.join(model_dir, 'model'))
         return cls(strings, moves, model)

+    @classmethod
+    def load(cls, pkg_or_str_or_file, vocab):
+        # TODO
+        raise NotImplementedError(
+            "This should be here, but isn't yet =/. Use Parser.from_dir")
+
+
     def __reduce__(self):
         return (Parser, (self.moves.strings, self.moves, self.model), None, None)

@@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE

 from .attrs cimport *

+from .util import Package
+

 cpdef enum:
     P2_orth

@@ -146,7 +148,8 @@ cdef class Tagger:
         return cls(vocab, model)

     @classmethod
-    def from_package(cls, package, vocab):
+    def load(cls, pkg_or_str_or_file, vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,

@@ -156,8 +159,9 @@ cdef class Tagger:
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))

-        if package.has_file('pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('pos', 'model'))
+
+        if pkg.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(pkg.file_path('pos', 'model'))

         return cls(vocab, model)

@@ -1,11 +1,17 @@
 from spacy.en import English

 import pytest
+import os


 @pytest.fixture(scope="session")
 def EN():
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
+    return English(data_dir=data_path)


 def pytest_addoption(parser):

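The SPACY_DATA lookup added here recurs in several fixtures and tests below. Since os.environ.get() already returns None for an unset variable, each four-line if/else block could be collapsed to a one-liner; a possible follow-up, not part of this commit:

    import os

    # Equivalent to the if/else blocks added in the fixtures.
    data_path = os.environ.get('SPACY_DATA')
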
@@ -11,6 +11,7 @@ from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
 from os import path
+import os

 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer

@@ -20,7 +21,11 @@ from spacy.serialize.bits import BitArray

 @pytest.fixture
 def vocab():
-    vocab = English.default_vocab()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    vocab = English.default_vocab(package=data_path)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']

@@ -1,22 +1,27 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import os
 import io
 import pickle

 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy.util import get_package, MockPackage

 import pytest


 @pytest.fixture
 def package():
-    return get_package()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return get_package(data_path=data_path)


 @pytest.fixture
 def lemmatizer(package):
-    return Lemmatizer.from_package(package)
+    return Lemmatizer.load(package)


 def test_read_index(package):

@@ -48,3 +48,10 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 2
     tokens = en_tokenizer("``We've")
     assert len(tokens) == 3
+
+
+def test_therell(en_tokenizer):
+    tokens = en_tokenizer("there'll")
+    assert len(tokens) == 2
+    assert tokens[0].text == "there"
+    assert tokens[1].text == "'ll"

@@ -6,7 +6,11 @@ import os
 @pytest.fixture(scope='session')
 def nlp():
     from spacy.en import English
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    return English(data_dir=data_path)


 @pytest.fixture()

@@ -10,8 +10,14 @@ def token(doc):


 def test_load_resources_and_process_text():
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = None
+    print("Load EN from %s" % data_path)
+
     from spacy.en import English
-    nlp = English()
+    nlp = English(data_dir=data_path)
     doc = nlp('Hello, world. Here are two sentences.')

@@ -15,8 +15,9 @@ from .strings cimport hash_string
 cimport cython

 from . import util
-from .util import read_lang_data
 from .tokens.doc cimport Doc
+from .util import read_lang_data
+from .util import MockPackage as Package


 cdef class Tokenizer:

@@ -41,8 +42,9 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)

     @classmethod
-    def from_package(cls, package, Vocab vocab):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
+    def load(cls, pkg_or_str_or_file, Vocab vocab):
+        pkg = Package.create_or_return(pkg_or_str_or_file)
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)

@@ -4,6 +4,7 @@ import json
 import re
 import os.path
 from contextlib import contextmanager
+import types

 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

@@ -12,10 +13,10 @@ def local_path(subdir):
     return os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))


-class MockPackage(object):
+class Package(object):
     @classmethod
     def create_or_return(cls, me_or_arg):
-        return me_or_arg if isinstance(me_or_arg, cls) else me_or_arg
+        return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

     def __init__(self, data_path=None):
         if data_path is None:

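Beyond the rename, this hunk fixes a real bug in create_or_return: the old else branch returned me_or_arg unchanged, so a string argument was never wrapped in a Package. A standalone sketch of the corrected idiom:

    class Package(object):
        @classmethod
        def create_or_return(cls, me_or_arg):
            # Wrap the argument unless it is already an instance of cls.
            return me_or_arg if isinstance(me_or_arg, cls) else cls(me_or_arg)

        def __init__(self, data_path=None):
            self.data_path = data_path

    pkg = Package.create_or_return('/tmp/data')  # string -> wrapped in Package
    assert Package.create_or_return(pkg) is pkg  # Package -> returned as-is
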
@@ -46,15 +47,20 @@ class MockPackage(object):

     @contextmanager
     def open(self, path_parts, default=IOError):
-        if isinstance(default, Exception):
-            raise default
-
-        # Enter
-        file_ = io.open(self.file_path(os.path.join(*path_parts)),
-                        mode='r', encoding='utf8')
-        yield file_
-        # Exit
-        file_.close()
+        if not self.has_file(*path_parts):
+            if isinstance(default, types.TypeType) and issubclass(default, Exception):
+                raise default(self.file_path(*path_parts))
+            elif isinstance(default, Exception):
+                raise default
+            else:
+                yield default
+        else:
+            # Enter
+            file_ = io.open(self.file_path(os.path.join(*path_parts)),
+                            mode='r', encoding='utf8')
+            yield file_
+            # Exit
+            file_.close()

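The rewritten open() makes missing files configurable: an Exception subclass passed as default is raised with the missing path, an exception instance is raised as-is, and anything else is yielded in place of a file handle. (Caveat: types.TypeType exists only on Python 2; on Python 3 the same check would be isinstance(default, type).) Usage as in the Vocab.load hunk below; a sketch assuming pkg is a util.Package:

    import json

    # Missing file: file_ is None, fall back to an empty dict.
    with pkg.open(('vocab', 'tag_map.json'), default=None) as file_:
        tag_map = json.load(file_) if file_ is not None else {}

    # Missing file: IOError(path) is raised, since default=IOError.
    with pkg.open(('vocab', 'strings.json')) as file_:
        strings = file_.read()
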
@@ -19,7 +19,7 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
-from .util import MockPackage
+from .util import Package

 from . import attrs
 from . import symbols

@@ -49,24 +49,23 @@ cdef class Vocab:
     '''
     @classmethod
     def load(cls, pkg_or_str_or_file, get_lex_attr=None):
-        package = MockPackage.create_or_return(pkg_or_str_or_file)
-        tag_map = package.load_utf8(json.load,
-                                    'vocab', 'tag_map.json')
+        package = Package.create_or_return(pkg_or_str_or_file)
+        with package.open(('vocab', 'tag_map.json'), default=None) as file_:
+            tag_map = json.load(file_) if file_ is not None else {}

         lemmatizer = Lemmatizer.load(package)

-        serializer_freqs = package.load_utf8(json.load,
-                                             'vocab', 'serializer.json',
-                                             require=False)  # TODO: really optional?
+        with package.open(('vocab', 'serializer.json'), default=None) as file_:
+            serializer_freqs = json.load(file_) if file_ is not None else {}

         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)

-        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
-            self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
+        with package.open(('vocab', 'strings.json')) as file_:
+            self.strings.load(file_)
+        self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))

-        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):
             self.vectors_length = self.load_vectors_from_bin_loc(
                 package.file_path('vocab', 'vec.bin'))
         return self