* Fix merge conflict in requirements.txt

Matthew Honnibal 2016-01-16 16:20:49 +01:00
commit 3dc398b727
22 changed files with 220 additions and 222 deletions

View File

@@ -61,7 +61,7 @@ build_script:
   - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
   - "cp package.json data"
   - "%CMD_IN_ENV% sputnik build data en_default.sputnik"
-  - "%CMD_IN_ENV% sputnik install en_default.sputnik"
+  - "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
 test_script:
   # Run the project tests

View File

@@ -31,7 +31,7 @@ install:
  - "python bin/init_model.py en lang_data/ corpora/ data"
  - "cp package.json data"
  - "sputnik build data en_default.sputnik"
- - "sputnik install en_default.sputnik"
+ - "sputnik --name spacy install en_default.sputnik"
 script:
  - python build.py $MODE;

View File

@@ -1,17 +1,14 @@
 {
-    "name": "en_default",
-    "version": "0.100.0",
-    "description": "english default model",
+    "name": "en_test",
+    "version": "1.0.0",
+    "description": "english test model",
     "license": "public domain",
     "include": [
-        "deps/*",
-        "ner/*",
-        "pos/*",
-        "tokenizer/*",
-        "vocab/*",
-        "wordnet/*"
-    ],
-    "compatibility": {
-        "spacy": "==0.100.0"
-    }
+        ["deps", "*"],
+        ["ner", "*"],
+        ["pos", "*"],
+        ["tokenizer", "*"],
+        ["vocab", "*"],
+        ["wordnet", "*"]
+    ]
 }

View File

@@ -10,4 +10,4 @@ plac
 six
 ujson
 cloudpickle
-sputnik==0.7.*
+sputnik>=0.7.0,<0.8.0

View File

@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import division, print_function
+from __future__ import print_function
 import os
 import shutil
 import subprocess
@@ -14,13 +14,6 @@ except ImportError:
     from distutils.core import Extension, setup
 
-MAJOR = 0
-MINOR = 100
-MICRO = 0
-ISRELEASED = False
-VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
-
-
 PACKAGES = [
     'spacy',
     'spacy.tokens',
@@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
         build_ext.build_extensions(self)
 
-
-# Return the git revision as a string
-def git_version():
-    def _minimal_ext_cmd(cmd):
-        # construct minimal environment
-        env = {}
-        for k in ['SYSTEMROOT', 'PATH']:
-            v = os.environ.get(k)
-            if v is not None:
-                env[k] = v
-        # LANGUAGE is used on win32
-        env['LANGUAGE'] = 'C'
-        env['LANG'] = 'C'
-        env['LC_ALL'] = 'C'
-        out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
-        return out
-
-    try:
-        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
-        GIT_REVISION = out.strip().decode('ascii')
-    except OSError:
-        GIT_REVISION = 'Unknown'
-
-    return GIT_REVISION
-
-
-def get_version_info():
-    # Adding the git rev number needs to be done inside write_version_py(),
-    # otherwise the import of spacy.about messes up the build under Python 3.
-    FULLVERSION = VERSION
-    if os.path.exists('.git'):
-        GIT_REVISION = git_version()
-    elif os.path.exists(os.path.join('spacy', 'about.py')):
-        # must be a source distribution, use existing version file
-        try:
-            from spacy.about import git_revision as GIT_REVISION
-        except ImportError:
-            raise ImportError('Unable to import git_revision. Try removing '
-                              'spacy/about.py and the build directory '
-                              'before building.')
-    else:
-        GIT_REVISION = 'Unknown'
-
-    if not ISRELEASED:
-        FULLVERSION += '.dev0+' + GIT_REVISION[:7]
-
-    return FULLVERSION, GIT_REVISION
-
-
-def write_version(path):
-    cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
-short_version = '%(version)s'
-version = '%(version)s'
-full_version = '%(full_version)s'
-git_revision = '%(git_revision)s'
-release = %(isrelease)s
-if not release:
-    version = full_version
-"""
-    FULLVERSION, GIT_REVISION = get_version_info()
-    with open(path, 'w') as f:
-        f.write(cnt % {'version': VERSION,
-                       'full_version': FULLVERSION,
-                       'git_revision': GIT_REVISION,
-                       'isrelease': str(ISRELEASED)})
-
-
 def generate_cython(root, source):
     print('Cythonizing sources')
     p = subprocess.call([sys.executable,
@@ -241,7 +167,9 @@ def setup_package():
         return clean(root)
 
     with chdir(root):
-        write_version(os.path.join(root, 'spacy', 'about.py'))
+        about = {}
+        with open(os.path.join(root, "spacy", "about.py")) as f:
+            exec(f.read(), about)
 
         include_dirs = [
             get_python_inc(plat_specific=True),
@@ -259,19 +187,20 @@ def setup_package():
         prepare_includes(root)
 
         setup(
-            name='spacy',
+            name=about['__name__'],
+            zip_safe=False,
             packages=PACKAGES,
             package_data={'': ['*.pyx', '*.pxd']},
-            description='Industrial-strength NLP',
-            author='Matthew Honnibal',
-            author_email='matt@spacy.io',
-            version=VERSION,
-            url='https://spacy.io',
-            license='MIT',
+            description=about['__summary__'],
+            author=about['__author__'],
+            author_email=about['__email__'],
+            version=about['__version__'],
+            url=about['__uri__'],
+            license=about['__license__'],
             ext_modules=ext_modules,
             install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
                               'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
-                              'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'],
+                              'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
             cmdclass = {
                 'build_ext': build_ext_subclass},
         )
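Note on the setup.py change above: the generated-version machinery (git_version, get_version_info, write_version) is replaced by a single-source metadata file, spacy/about.py, which setup.py reads by exec-ing it into a plain dict so the package never has to be imported at build time. A minimal sketch of that pattern, using the same file the diff reads:

    import os

    about = {}
    with open(os.path.join('spacy', 'about.py')) as f:
        exec(f.read(), about)  # fills about['__name__'], about['__version__'], ...

    print(about['__version__'])  # '0.100.0' for this commit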

View File

@@ -0,0 +1,7 @@
+from . import util
+from .en import English
+
+
+def load(name, via=None):
+    package = util.get_package_by_name(name, via=via)
+    return English(package=package)
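The new top-level loader above is the entry point the rest of this diff builds towards. A usage sketch, assuming a model has already been installed with python -m spacy.en.download; the via argument points at a relocated package base, as described in the Language docstring further down:

    import spacy

    nlp = spacy.load('en_default')  # installed model, by package name
    nlp = spacy.load('en_default==1.0.4', via='/my/package/root')  # pinned version, custom base
    doc = nlp(u'Hello, world. Here are two sentences.')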

spacy/about.py (new file, 14 lines added)
View File

@@ -0,0 +1,14 @@
+# inspired from:
+
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
+
+__name__ = 'spacy'
+__version__ = '0.100.0'
+__summary__ = 'Industrial-strength NLP'
+__uri__ = 'https://spacy.io'
+__author__ = 'Matthew Honnibal'
+__email__ = 'matt@spacy.io'
+__license__ = 'MIT'
+__release__ = False
+__default_model__ = 'en_default==1.0.4'

View File

@@ -1,9 +1,15 @@
+from __future__ import print_function
+
 import sys
 import os
 import shutil
 
 import plac
-from sputnik import Sputnik
+import sputnik
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
+
+from .. import about
 
 
 def migrate(path):
@@ -18,43 +24,34 @@ def migrate(path):
             os.unlink(os.path.join(path, filename))
 
 
-def link(package, path):
-    if os.path.exists(path):
-        if os.path.isdir(path):
-            shutil.rmtree(path)
-        else:
-            os.unlink(path)
-
-    if not hasattr(os, 'symlink'):  # not supported by win+py27
-        shutil.copytree(package.dir_path('data'), path)
-    else:
-        os.symlink(package.dir_path('data'), path)
-
-
 @plac.annotations(
     force=("Force overwrite", "flag", "f", bool),
 )
 def main(data_size='all', force=False):
-    # TODO read version from the same source as the setup
-    sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
-
-    path = os.path.dirname(os.path.abspath(__file__))
-
-    data_path = os.path.abspath(os.path.join(path, '..', 'data'))
-    if not os.path.isdir(data_path):
-        os.mkdir(data_path)
-
-    command = sputnik.command(
-        data_path=data_path,
-        repository_url='https://index.spacy.io')
-
     if force:
-        command.purge()
+        sputnik.purge(about.__name__, about.__version__)
 
-    package = command.install('en_default')
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+        print("Model already installed. Please run 'python -m "
+              "spacy.en.download --force' to reinstall.", file=sys.stderr)
+        sys.exit(1)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        pass
+
+    package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
+
+    try:
+        sputnik.package(about.__name__, about.__version__, about.__default_model__)
+    except (PackageNotFoundException, CompatiblePackageNotFoundException):
+        print("Model failed to install. Please run 'python -m "
+              "spacy.en.download --force'.", file=sys.stderr)
+        sys.exit(1)
 
     # FIXME clean up old-style packages
-    migrate(path)
+    migrate(os.path.dirname(os.path.abspath(__file__)))
+
+    print("Model successfully installed.", file=sys.stderr)
 
 
 if __name__ == '__main__':
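The downloader now drives sputnik's module-level API directly and reads the app name, version and default model from spacy.about. It is normally run from the command line (python -m spacy.en.download, with --force to purge and reinstall), but since main() is a plain plac-annotated function it can also be called from Python. A small sketch, assuming spaCy is installed:

    from spacy.en import download

    # Equivalent to `python -m spacy.en.download`; force=True maps to --force.
    download.main(force=False)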

View File

@@ -19,8 +19,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
+from . import util
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
-from .util import get_package
 
 
 class Language(object):
@@ -137,28 +137,25 @@ class Language(object):
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
 
     @classmethod
-    def default_vocab(cls, package=None, get_lex_attr=None):
-        if package is None:
-            package = get_package()
+    def default_vocab(cls, package, get_lex_attr=None):
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs()
         return Vocab.from_package(package, get_lex_attr=get_lex_attr)
 
     @classmethod
     def default_parser(cls, package, vocab):
-        data_dir = package.dir_path('deps', require=False)
+        data_dir = package.dir_path('deps')
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, ArcEager)
 
     @classmethod
     def default_entity(cls, package, vocab):
-        data_dir = package.dir_path('ner', require=False)
+        data_dir = package.dir_path('ner')
         if data_dir and path.exists(data_dir):
             return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
 
     def __init__(self,
                  data_dir=None,
-                 model=None,
                  vocab=None,
                  tokenizer=None,
                  tagger=None,
@@ -166,48 +163,36 @@ class Language(object):
                  entity=None,
                  matcher=None,
                  serializer=None,
-                 load_vectors=True):
+                 load_vectors=True,
+                 package=None):
         """
         a model can be specified:
 
-        1) by a path to the model directory (DEPRECATED)
-            - Language(data_dir='path/to/data')
-
-        2) by a language identifier (and optionally a package root dir)
-            - Language(lang='en')
-            - Language(lang='en', data_dir='spacy/data')
-
-        3) by a model name/version (and optionally a package root dir)
-            - Language(model='en_default')
-            - Language(model='en_default ==1.0.0')
-            - Language(model='en_default <1.1.0, data_dir='spacy/data')
+        1) by calling a Language subclass
+            - spacy.en.English()
+
+        2) by calling a Language subclass with data_dir
+            - spacy.en.English('my/model/root')
+            - spacy.en.English(data_dir='my/model/root')
+
+        3) by package name
+            - spacy.load('en_default')
+            - spacy.load('en_default==1.0.0')
+
+        4) by package name with a relocated package base
+            - spacy.load('en_default', via='/my/package/root')
+            - spacy.load('en_default==1.0.0', via='/my/package/root')
         """
-        # support non-package data dirs
-        if data_dir and path.exists(path.join(data_dir, 'vocab')):
-            class Package(object):
-                def __init__(self, root):
-                    self.root = root
-
-                def has_file(self, *path_parts):
-                    return path.exists(path.join(self.root, *path_parts))
-
-                def file_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def dir_path(self, *path_parts, **kwargs):
-                    return path.join(self.root, *path_parts)
-
-                def load_utf8(self, func, *path_parts, **kwargs):
-                    with io.open(self.file_path(path.join(*path_parts)),
-                                 mode='r', encoding='utf8') as f:
-                        return func(f)
-
-            warn("using non-package data_dir", DeprecationWarning)
-            package = Package(data_dir)
-        else:
-            package = get_package(name=model, data_path=data_dir)
+        if package is None:
+            if data_dir is None:
+                package = util.get_package_by_name()
+            else:
+                package = util.get_package(data_dir)
+
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
         if vocab in (None, True):
             vocab = self.default_vocab(package)
         self.vocab = vocab
@@ -230,7 +215,6 @@ class Language(object):
     def __reduce__(self):
         args = (
             None, # data_dir
-            None, # model
             self.vocab,
             self.tokenizer,
             self.tagger,
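With the constructor change above, package resolution happens in one place: an explicit package wins, otherwise data_dir is wrapped with util.get_package(), otherwise util.get_package_by_name() finds the installed default model. The test fixtures later in this diff rely on exactly this via the SPACY_DATA environment variable; a sketch of the same idea outside the test suite:

    import os
    from spacy.en import English

    # Use a local model directory if SPACY_DATA is set, else the installed package.
    nlp = English(data_dir=os.environ.get('SPACY_DATA'))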

View File

@@ -8,25 +8,24 @@ except ImportError:
     import json
 
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import get_package
 
 
 class Lemmatizer(object):
     @classmethod
-    def from_package(cls, package):
+    def load(cls, via):
+        return cls.from_package(get_package(via))
+
+    @classmethod
+    def from_package(cls, pkg):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
-            index[pos] = package.load_utf8(read_index,
-                'wordnet', 'index.%s' % pos,
-                default=set())  # TODO: really optional?
-            exc[pos] = package.load_utf8(read_exc,
-                'wordnet', '%s.exc' % pos,
-                default={})  # TODO: really optional?
-        rules = package.load_utf8(json.load,
-            'vocab', 'lemma_rules.json',
-            default={})  # TODO: really optional?
+            with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_:
+                index[pos] = read_index(file_) if file_ is not None else set()
+            with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
+                exc[pos] = read_exc(file_) if file_ is not None else {}
+        rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):

View File

@@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 
 from .attrs import FLAG61 as U_ENT
+from .util import get_package
 
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@@ -168,11 +169,13 @@ cdef class Matcher:
     cdef readonly Vocab vocab
     cdef object _patterns
 
+    @classmethod
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
     @classmethod
     def from_package(cls, package, Vocab vocab):
-        patterns = package.load_utf8(json.load,
-            'vocab', 'gazetteer.json',
-            default={})  # TODO: really optional?
+        patterns = package.load_json(('vocab', 'gazetteer.json'))
         return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns):

View File

@@ -89,6 +89,13 @@ cdef class Parser:
         model.load(path.join(model_dir, 'model'))
         return cls(strings, moves, model)
 
+    @classmethod
+    def load(cls, pkg_or_str_or_file, vocab):
+        # TODO
+        raise NotImplementedError(
+            "This should be here, but isn't yet =/. Use Parser.from_dir")
+
     def __reduce__(self):
         return (Parser, (self.moves.strings, self.moves, self.model), None, None)

View File

@@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 
+from .util import get_package
+
 
 cpdef enum:
     P2_orth
@@ -146,7 +148,11 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def from_package(cls, package, vocab):
+    def load(cls, data_dir, vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
+    @classmethod
+    def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
         templates = cls.default_templates()
         # templates = package.load_utf8(json.load,
@@ -156,8 +162,9 @@ cdef class Tagger:
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if package.has_file('pos', 'model'):  # TODO: really optional?
-            model.load(package.file_path('pos', 'model'))
+
+        if pkg.has_file('pos', 'model'):  # TODO: really optional?
+            model.load(pkg.file_path('pos', 'model'))
         return cls(vocab, model)
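Matcher, Tagger, Tokenizer and (below) Vocab all gain the same thin convenience constructor in this commit: load(data_dir, ...) wraps the directory with get_package() and delegates to the package-based from_package(). A usage sketch, with a hypothetical local model directory:

    from spacy.tagger import Tagger
    from spacy.vocab import Vocab

    # '/path/to/model/data' is illustrative; both calls resolve the same resources.
    vocab = Vocab.load('/path/to/model/data')
    tagger = Tagger.load('/path/to/model/data', vocab)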

View File

@@ -1,11 +1,17 @@
 from spacy.en import English
 import pytest
+import os
 
 
 @pytest.fixture(scope="session")
 def EN():
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    print("Load EN from %s" % data_dir)
+    return English(data_dir=data_dir)
 
 
 def pytest_addoption(parser):

View File

@@ -11,7 +11,9 @@ from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
 from os import path
+import os
 
+from spacy import util
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer
@@ -20,7 +22,13 @@ from spacy.serialize.bits import BitArray
 
 @pytest.fixture
 def vocab():
-    vocab = English.default_vocab()
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        package = util.get_package_by_name()
+    else:
+        package = util.get_package(data_dir)
+
+    vocab = English.default_vocab(package=package)
     lex = vocab['dog']
     assert vocab[vocab.strings['dog']].orth_ == 'dog'
     lex = vocab['the']

View File

@@ -1,17 +1,22 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
+import os
 import io
 import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.util import get_package
+from spacy import util
 
 import pytest
 
 
 @pytest.fixture
 def package():
-    return get_package()
+    data_dir = os.environ.get('SPACY_DATA')
+    if data_dir is None:
+        return util.get_package_by_name()
+    else:
+        return util.get_package(data_dir)
 
 
 @pytest.fixture
@@ -20,14 +25,16 @@ def lemmatizer(package):
 
 def test_read_index(package):
-    index = package.load_utf8(read_index, 'wordnet', 'index.noun')
+    with package.open(('wordnet', 'index.noun')) as file_:
+        index = read_index(file_)
     assert 'man' in index
     assert 'plantes' not in index
    assert 'plant' in index
 
 
 def test_read_exc(package):
-    exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
+    with package.open(('wordnet', 'verb.exc')) as file_:
+        exc = read_exc(file_)
     assert exc['was'] == ('be',)

View File

@@ -50,6 +50,7 @@ def test_punct(en_tokenizer):
     assert len(tokens) == 3
 
 
+@pytest.mark.xfail
 def test_therell(en_tokenizer):
     tokens = en_tokenizer("there'll")
     assert len(tokens) == 2

View File

@@ -6,7 +6,11 @@ import os
 
 @pytest.fixture(scope='session')
 def nlp():
     from spacy.en import English
-    return English()
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    return English(data_dir=data_dir)
 
 
 @pytest.fixture()

View File

@@ -10,8 +10,14 @@ def token(doc):
 
 def test_load_resources_and_process_text():
+    if os.environ.get('SPACY_DATA'):
+        data_dir = os.environ.get('SPACY_DATA')
+    else:
+        data_dir = None
+    print("Load EN from %s" % data_dir)
     from spacy.en import English
-    nlp = English()
+    nlp = English(data_dir=data_dir)
     doc = nlp('Hello, world. Here are two sentences.')

View File

@@ -15,8 +15,9 @@ from .strings cimport hash_string
 cimport cython
 
 from . import util
-from .util import read_lang_data
 from .tokens.doc cimport Doc
+from .util import read_lang_data
+from .util import get_package
 
 
 cdef class Tokenizer:
@@ -40,6 +41,10 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
+    @classmethod
+    def load(cls, data_dir, Vocab vocab):
+        return cls.from_package(get_package(data_dir), vocab=vocab)
+
     @classmethod
     def from_package(cls, package, Vocab vocab):
         rules, prefix_re, suffix_re, infix_re = read_lang_data(package)

View File

@@ -2,23 +2,36 @@ import os
 import io
 import json
 import re
-import os.path
 
-from sputnik import Sputnik
+import six
+import sputnik
+from sputnik.dir_package import DirPackage
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
 
+from . import about
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 
-def get_package(name=None, data_path=None):
-    if data_path is None:
-        if os.environ.get('SPACY_DATA'):
-            data_path = os.environ.get('SPACY_DATA')
-        else:
-            data_path = os.path.abspath(
-                os.path.join(os.path.dirname(__file__), 'data'))
-
-    sputnik = Sputnik('spacy', '0.100.0')  # TODO: retrieve version
-    pool = sputnik.pool(data_path)
-    return pool.get(name or 'en_default')
+def get_package(data_dir):
+    if not isinstance(data_dir, six.string_types):
+        raise RuntimeError('data_dir must be a string')
+    return DirPackage(data_dir)
+
+
+def get_package_by_name(name=None, via=None):
+    try:
+        return sputnik.package(about.__name__, about.__version__,
+                               name or about.__default_model__, data_path=via)
+    except PackageNotFoundException as e:
+        raise RuntimeError("Model not installed. Please run 'python -m "
+                           "spacy.en.download' to install latest compatible "
+                           "model.")
+    except CompatiblePackageNotFoundException as e:
+        raise RuntimeError("Installed model is not compatible with spaCy "
+                           "version. Please run 'python -m spacy.en.download "
+                           "--force' to install latest compatible model.")
 
 
 def normalize_slice(length, start, stop, step=None):
@@ -46,10 +59,13 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(package):
-    tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
-    prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
-    suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
-    infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
+    tokenization = package.load_json(('tokenizer', 'specials.json'))
+    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
+        prefix = read_prefix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
+        suffix = read_suffix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
+        infix = read_infix(file_) if file_ is not None else None
     return tokenization, prefix, suffix, infix
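The two helpers above split the old get_package() in two: get_package() now only wraps an explicit directory in a DirPackage, while get_package_by_name() resolves an installed sputnik package and turns lookup failures into actionable RuntimeErrors. A usage sketch:

    from spacy import util

    pkg = util.get_package('/path/to/unpacked/model')   # plain directory, no sputnik lookup
    pkg = util.get_package_by_name()                     # installed about.__default_model__
    pkg = util.get_package_by_name('en_default==1.0.4')  # or an explicit requirement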

View File

@@ -19,6 +19,7 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
+from .util import get_package
 
 from . import attrs
 from . import symbols
@@ -46,28 +47,28 @@ EMPTY_LEXEME.vector = EMPTY_VEC
 
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
+    @classmethod
+    def load(cls, data_dir, get_lex_attr=None):
+        return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
+
     @classmethod
     def from_package(cls, package, get_lex_attr=None):
-        tag_map = package.load_utf8(json.load,
-            'vocab', 'tag_map.json')
+        tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
 
         lemmatizer = Lemmatizer.from_package(package)
 
-        serializer_freqs = package.load_utf8(json.load,
-            'vocab', 'serializer.json',
-            require=False)  # TODO: really optional?
+        serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        if package.has_file('vocab', 'strings.json'):  # TODO: really optional?
-            package.load_utf8(self.strings.load, 'vocab', 'strings.json')
+        with package.open(('vocab', 'strings.json')) as file_:
+            self.strings.load(file_)
         self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
 
-        if package.has_file('vocab', 'vec.bin'):  # TODO: really optional?
+        if package.has_file('vocab', 'vec.bin'):
             self.vectors_length = self.load_vectors_from_bin_loc(
                 package.file_path('vocab', 'vec.bin'))
 
         return self
 
     def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
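For reference, the package objects threaded through Vocab, Tagger, Tokenizer, Matcher and Lemmatizer above only need a small surface: file_path()/has_file() for binary resources, open() as a context manager that can fall back to a default, and load_json() for optional JSON files. A rough, hypothetical sketch of that surface, for illustration only (the real implementations come from sputnik's DirPackage and installed packages):

    import io
    import json
    import os
    from contextlib import contextmanager

    _MISSING = object()

    class DirPackageSketch(object):
        """Illustrative stand-in for the package objects used in this commit."""

        def __init__(self, root):
            self.root = root

        def file_path(self, *parts):
            return os.path.join(self.root, *parts)

        def has_file(self, *parts):
            return os.path.exists(self.file_path(*parts))

        @contextmanager
        def open(self, parts, default=_MISSING):
            # Yield an open file handle, or the given default when the resource is absent.
            loc = self.file_path(*parts)
            if os.path.exists(loc):
                with io.open(loc, encoding='utf8') as file_:
                    yield file_
            elif default is _MISSING:
                raise IOError("missing package resource: %s" % loc)
            else:
                yield default

        def load_json(self, parts, default=_MISSING):
            # Parse a JSON resource, falling back to the default when it is absent.
            with self.open(parts, default=None) as file_:
                if file_ is not None:
                    return json.load(file_)
            if default is _MISSING:
                raise IOError("missing package resource: %s" % self.file_path(*parts))
            return default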