* Fix merge conflict in requirements.txt

This commit is contained in:
Matthew Honnibal 2016-01-16 16:20:49 +01:00
commit 3dc398b727
22 changed files with 220 additions and 222 deletions

View File

@ -61,7 +61,7 @@ build_script:
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "%CMD_IN_ENV% sputnik build data en_default.sputnik"
- "%CMD_IN_ENV% sputnik install en_default.sputnik"
- "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
test_script:
# Run the project tests

View File

@ -31,7 +31,7 @@ install:
- "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "sputnik build data en_default.sputnik"
- "sputnik install en_default.sputnik"
- "sputnik --name spacy install en_default.sputnik"
script:
- python build.py $MODE;

View File

@ -1,17 +1,14 @@
{
"name": "en_default",
"version": "0.100.0",
"description": "english default model",
"name": "en_test",
"version": "1.0.0",
"description": "english test model",
"license": "public domain",
"include": [
"deps/*",
"ner/*",
"pos/*",
"tokenizer/*",
"vocab/*",
"wordnet/*"
],
"compatibility": {
"spacy": "==0.100.0"
}
["deps", "*"],
["ner", "*"],
["pos", "*"],
["tokenizer", "*"],
["vocab", "*"],
["wordnet", "*"]
]
}

View File

@ -10,4 +10,4 @@ plac
six
ujson
cloudpickle
sputnik==0.7.*
sputnik>=0.7.0,<0.8.0

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python
from __future__ import division, print_function
from __future__ import print_function
import os
import shutil
import subprocess
@ -14,13 +14,6 @@ except ImportError:
from distutils.core import Extension, setup
MAJOR = 0
MINOR = 100
MICRO = 0
ISRELEASED = False
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
PACKAGES = [
'spacy',
'spacy.tokens',
@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
build_ext.build_extensions(self)
# Return the git revision as a string
def git_version():
    """Return the git revision of the current checkout as a string.

    Runs ``git rev-parse HEAD`` in a minimal, locale-neutral environment.

    Returns:
        The revision hash printed by git, or ``'Unknown'`` when git cannot be
        started, exits with a non-zero status (e.g. not inside a repository,
        or a repository without commits), or prints nothing.
    """
    def _minimal_ext_cmd(cmd):
        # construct minimal environment
        env = {}
        for k in ['SYSTEMROOT', 'PATH']:
            v = os.environ.get(k)
            if v is not None:
                env[k] = v
        # LANGUAGE is used on win32
        env['LANGUAGE'] = 'C'
        env['LANG'] = 'C'
        env['LC_ALL'] = 'C'
        # Capture stderr as well so git's error text does not leak into the
        # build output when the command fails.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, env=env)
        out = proc.communicate()[0]
        return proc.returncode, out

    try:
        returncode, out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
    except OSError:
        # git executable not available
        return 'Unknown'

    # A failing git can still write to stdout (an empty repository echoes
    # "HEAD"), so trust the output only when the command succeeded.
    if returncode != 0:
        return 'Unknown'

    revision = out.strip().decode('ascii')
    return revision or 'Unknown'
def get_version_info():
    """Work out the full version string and git revision for this build.

    The revision comes from git when a ``.git`` entry exists in the current
    directory, from an existing ``spacy/about.py`` otherwise, and falls back
    to ``'Unknown'``.

    Returns:
        A ``(full_version, git_revision)`` tuple. For non-release builds the
        full version is ``VERSION`` plus ``'.dev0+'`` and the first seven
        characters of the revision.

    Raises:
        ImportError: if ``spacy/about.py`` exists but does not provide
            ``git_revision``.
    """
    # Adding the git rev number needs to be done inside write_version_py(),
    # otherwise the import of spacy.about messes up the build under Python 3.
    if os.path.exists('.git'):
        revision = git_version()
    elif os.path.exists(os.path.join('spacy', 'about.py')):
        # must be a source distribution, use existing version file
        try:
            from spacy.about import git_revision as revision
        except ImportError:
            raise ImportError('Unable to import git_revision. Try removing '
                              'spacy/about.py and the build directory '
                              'before building.')
    else:
        revision = 'Unknown'

    if ISRELEASED:
        return VERSION, revision
    return VERSION + '.dev0+' + revision[:7], revision
def write_version(path):
    """Write the generated version module to *path*.

    The file records the short version, full version, git revision and release
    flag obtained from ``get_version_info()`` and the module-level ``VERSION``
    / ``ISRELEASED`` constants.

    Args:
        path: location of the Python file to (over)write.
    """
    # The body of the trailing ``if not release:`` must be indented, otherwise
    # the generated module is not valid Python.
    template = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
short_version = '%(version)s'
version = '%(version)s'
full_version = '%(full_version)s'
git_revision = '%(git_revision)s'
release = %(isrelease)s
if not release:
    version = full_version
"""
    full_version, git_revision = get_version_info()
    values = {
        'version': VERSION,
        'full_version': full_version,
        'git_revision': git_revision,
        'isrelease': str(ISRELEASED),
    }
    with open(path, 'w') as f:
        f.write(template % values)
def generate_cython(root, source):
print('Cythonizing sources')
p = subprocess.call([sys.executable,
@ -241,7 +167,9 @@ def setup_package():
return clean(root)
with chdir(root):
write_version(os.path.join(root, 'spacy', 'about.py'))
about = {}
with open(os.path.join(root, "spacy", "about.py")) as f:
exec(f.read(), about)
include_dirs = [
get_python_inc(plat_specific=True),
@ -259,19 +187,20 @@ def setup_package():
prepare_includes(root)
setup(
name='spacy',
name=about['__name__'],
zip_safe=False,
packages=PACKAGES,
package_data={'': ['*.pyx', '*.pxd']},
description='Industrial-strength NLP',
author='Matthew Honnibal',
author_email='matt@spacy.io',
version=VERSION,
url='https://spacy.io',
license='MIT',
description=about['__summary__'],
author=about['__author__'],
author_email=about['__email__'],
version=about['__version__'],
url=about['__uri__'],
license=about['__license__'],
ext_modules=ext_modules,
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'],
'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
cmdclass = {
'build_ext': build_ext_subclass},
)

View File

@ -0,0 +1,7 @@
from . import util
from .en import English
def load(name, via=None):
    """Load an English pipeline from an installed model package.

    Args:
        name: package name passed to ``util.get_package_by_name``.
        via: optional value forwarded unchanged as ``via`` to
            ``util.get_package_by_name``.

    Returns:
        An ``English`` instance constructed with the resolved package.
    """
    return English(package=util.get_package_by_name(name, via=via))

14
spacy/about.py Normal file
View File

@ -0,0 +1,14 @@
# Single-source package metadata, inspired by:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# NOTE(review): assigning `__name__` here rebinds this module's own `__name__`
# attribute; confirm nothing relies on the module's import name.
__name__ = 'spacy'
__version__ = '0.100.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
__email__ = 'matt@spacy.io'
__license__ = 'MIT'
__release__ = False
# Model package requirement (name plus version pin).
__default_model__ = 'en_default==1.0.4'

View File

@ -1,9 +1,15 @@
from __future__ import print_function
import sys
import os
import shutil
import plac
from sputnik import Sputnik
import sputnik
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from .. import about
def migrate(path):
@ -18,43 +24,34 @@ def migrate(path):
os.unlink(os.path.join(path, filename))
def link(package, path):
    """Make *path* refer to the package's ``data`` directory.

    Whatever currently exists at *path* (symlink, directory tree or file) is
    removed first. The data directory is then symlinked to *path*, or copied
    when the platform has no ``os.symlink``.

    Args:
        package: object exposing ``dir_path('data')``.
        path: filesystem location to create.
    """
    # Test for a symlink before anything else: shutil.rmtree() refuses to work
    # on a symlink to a directory, and os.path.exists() is False for a
    # dangling symlink, which would make the os.symlink() call below fail.
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.exists(path):
        os.unlink(path)

    source = package.dir_path('data')
    if hasattr(os, 'symlink'):
        os.symlink(source, path)
    else:  # not supported by win+py27
        shutil.copytree(source, path)
@plac.annotations(
force=("Force overwrite", "flag", "f", bool),
)
def main(data_size='all', force=False):
# TODO read version from the same source as the setup
sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
if not os.path.isdir(data_path):
os.mkdir(data_path)
command = sputnik.command(
data_path=data_path,
repository_url='https://index.spacy.io')
if force:
command.purge()
sputnik.purge(about.__name__, about.__version__)
package = command.install('en_default')
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
print("Model already installed. Please run 'python -m "
"spacy.en.download --force' to reinstall.", file=sys.stderr)
sys.exit(1)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
pass
package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
try:
sputnik.package(about.__name__, about.__version__, about.__default_model__)
except (PackageNotFoundException, CompatiblePackageNotFoundException):
print("Model failed to install. Please run 'python -m "
"spacy.en.download --force'.", file=sys.stderr)
sys.exit(1)
# FIXME clean up old-style packages
migrate(path)
migrate(os.path.dirname(os.path.abspath(__file__)))
print("Model successfully installed.", file=sys.stderr)
if __name__ == '__main__':

View File

@ -19,8 +19,8 @@ from . import orth
from .syntax.ner import BiluoPushDown
from .syntax.arc_eager import ArcEager
from . import util
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
from .util import get_package
class Language(object):
@ -137,28 +137,25 @@ class Language(object):
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
@classmethod
def default_vocab(cls, package=None, get_lex_attr=None):
if package is None:
package = get_package()
def default_vocab(cls, package, get_lex_attr=None):
if get_lex_attr is None:
get_lex_attr = cls.default_lex_attrs()
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
@classmethod
def default_parser(cls, package, vocab):
data_dir = package.dir_path('deps', require=False)
data_dir = package.dir_path('deps')
if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
@classmethod
def default_entity(cls, package, vocab):
data_dir = package.dir_path('ner', require=False)
data_dir = package.dir_path('ner')
if data_dir and path.exists(data_dir):
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
def __init__(self,
data_dir=None,
model=None,
vocab=None,
tokenizer=None,
tagger=None,
@ -166,48 +163,36 @@ class Language(object):
entity=None,
matcher=None,
serializer=None,
load_vectors=True):
load_vectors=True,
package=None):
"""
a model can be specified:
1) by a path to the model directory (DEPRECATED)
- Language(data_dir='path/to/data')
1) by calling a Language subclass
- spacy.en.English()
2) by a language identifier (and optionally a package root dir)
- Language(lang='en')
- Language(lang='en', data_dir='spacy/data')
2) by calling a Language subclass with data_dir
- spacy.en.English('my/model/root')
- spacy.en.English(data_dir='my/model/root')
3) by a model name/version (and optionally a package root dir)
- Language(model='en_default')
- Language(model='en_default ==1.0.0')
- Language(model='en_default <1.1.0', data_dir='spacy/data')
3) by package name
- spacy.load('en_default')
- spacy.load('en_default==1.0.0')
4) by package name with a relocated package base
- spacy.load('en_default', via='/my/package/root')
- spacy.load('en_default==1.0.0', via='/my/package/root')
"""
# support non-package data dirs
if data_dir and path.exists(path.join(data_dir, 'vocab')):
class Package(object):
def __init__(self, root):
self.root = root
def has_file(self, *path_parts):
return path.exists(path.join(self.root, *path_parts))
def file_path(self, *path_parts, **kwargs):
return path.join(self.root, *path_parts)
def dir_path(self, *path_parts, **kwargs):
return path.join(self.root, *path_parts)
def load_utf8(self, func, *path_parts, **kwargs):
with io.open(self.file_path(path.join(*path_parts)),
mode='r', encoding='utf8') as f:
return func(f)
warn("using non-package data_dir", DeprecationWarning)
package = Package(data_dir)
if package is None:
if data_dir is None:
package = util.get_package_by_name()
else:
package = get_package(name=model, data_path=data_dir)
package = util.get_package(data_dir)
if load_vectors is not True:
warn("load_vectors is deprecated", DeprecationWarning)
if vocab in (None, True):
vocab = self.default_vocab(package)
self.vocab = vocab
@ -230,7 +215,6 @@ class Language(object):
def __reduce__(self):
args = (
None, # data_dir
None, # model
self.vocab,
self.tokenizer,
self.tagger,

View File

@ -8,25 +8,24 @@ except ImportError:
import json
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
from .util import get_package
class Lemmatizer(object):
@classmethod
def from_package(cls, package):
def load(cls, via):
return cls.from_package(get_package(via))
@classmethod
def from_package(cls, pkg):
index = {}
exc = {}
for pos in ['adj', 'noun', 'verb']:
index[pos] = package.load_utf8(read_index,
'wordnet', 'index.%s' % pos,
default=set()) # TODO: really optional?
exc[pos] = package.load_utf8(read_exc,
'wordnet', '%s.exc' % pos,
default={}) # TODO: really optional?
rules = package.load_utf8(json.load,
'vocab', 'lemma_rules.json',
default={}) # TODO: really optional?
with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_:
index[pos] = read_index(file_) if file_ is not None else set()
with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
exc[pos] = read_exc(file_) if file_ is not None else {}
rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
return cls(index, exc, rules)
def __init__(self, index, exceptions, rules):

View File

@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
from .vocab cimport Vocab
from .attrs import FLAG61 as U_ENT
from .util import get_package
from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
@ -168,11 +169,13 @@ cdef class Matcher:
cdef readonly Vocab vocab
cdef object _patterns
@classmethod
def load(cls, data_dir, Vocab vocab):
    """Build a Matcher from a data directory.

    The directory is wrapped with ``get_package`` and handed to
    ``from_package`` together with *vocab*.
    """
    package = get_package(data_dir)
    return cls.from_package(package, vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
patterns = package.load_utf8(json.load,
'vocab', 'gazetteer.json',
default={}) # TODO: really optional?
patterns = package.load_json(('vocab', 'gazetteer.json'))
return cls(vocab, patterns)
def __init__(self, vocab, patterns):

View File

@ -89,6 +89,13 @@ cdef class Parser:
model.load(path.join(model_dir, 'model'))
return cls(strings, moves, model)
@classmethod
def load(cls, pkg_or_str_or_file, vocab):
    """Placeholder loader: always raises ``NotImplementedError``.

    The error message directs callers to ``Parser.from_dir``.
    """
    # TODO
    message = "This should be here, but isn't yet =/. Use Parser.from_dir"
    raise NotImplementedError(message)
def __reduce__(self):
return (Parser, (self.moves.strings, self.moves, self.model), None, None)

View File

@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .attrs cimport *
from .util import get_package
cpdef enum:
P2_orth
@ -146,7 +148,11 @@ cdef class Tagger:
return cls(vocab, model)
@classmethod
def from_package(cls, package, vocab):
def load(cls, data_dir, vocab):
return cls.from_package(get_package(data_dir), vocab=vocab)
@classmethod
def from_package(cls, pkg, vocab):
# TODO: templates.json deprecated? not present in latest package
templates = cls.default_templates()
# templates = package.load_utf8(json.load,
@ -156,8 +162,9 @@ cdef class Tagger:
model = TaggerModel(vocab.morphology.n_tags,
ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
if package.has_file('pos', 'model'): # TODO: really optional?
model.load(package.file_path('pos', 'model'))
if pkg.has_file('pos', 'model'): # TODO: really optional?
model.load(pkg.file_path('pos', 'model'))
return cls(vocab, model)

View File

@ -1,11 +1,17 @@
from spacy.en import English
import pytest
import os
@pytest.fixture(scope="session")
def EN():
return English()
if os.environ.get('SPACY_DATA'):
data_dir = os.environ.get('SPACY_DATA')
else:
data_dir = None
print("Load EN from %s" % data_dir)
return English(data_dir=data_dir)
def pytest_addoption(parser):

View File

@ -11,7 +11,9 @@ from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
from os import path
import os
from spacy import util
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
@ -20,7 +22,13 @@ from spacy.serialize.bits import BitArray
@pytest.fixture
def vocab():
vocab = English.default_vocab()
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
package = util.get_package_by_name()
else:
package = util.get_package(data_dir)
vocab = English.default_vocab(package=package)
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the']

View File

@ -1,17 +1,22 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import io
import pickle
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import get_package
from spacy import util
import pytest
@pytest.fixture
def package():
return get_package()
data_dir = os.environ.get('SPACY_DATA')
if data_dir is None:
return util.get_package_by_name()
else:
return util.get_package(data_dir)
@pytest.fixture
@ -20,14 +25,16 @@ def lemmatizer(package):
def test_read_index(package):
index = package.load_utf8(read_index, 'wordnet', 'index.noun')
with package.open(('wordnet', 'index.noun')) as file_:
index = read_index(file_)
assert 'man' in index
assert 'plantes' not in index
assert 'plant' in index
def test_read_exc(package):
exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
with package.open(('wordnet', 'verb.exc')) as file_:
exc = read_exc(file_)
assert exc['was'] == ('be',)

View File

@ -50,6 +50,7 @@ def test_punct(en_tokenizer):
assert len(tokens) == 3
@pytest.mark.xfail
def test_therell(en_tokenizer):
tokens = en_tokenizer("there'll")
assert len(tokens) == 2

View File

@ -6,7 +6,11 @@ import os
@pytest.fixture(scope='session')
def nlp():
from spacy.en import English
return English()
if os.environ.get('SPACY_DATA'):
data_dir = os.environ.get('SPACY_DATA')
else:
data_dir = None
return English(data_dir=data_dir)
@pytest.fixture()

View File

@ -10,8 +10,14 @@ def token(doc):
def test_load_resources_and_process_text():
if os.environ.get('SPACY_DATA'):
data_dir = os.environ.get('SPACY_DATA')
else:
data_dir = None
print("Load EN from %s" % data_dir)
from spacy.en import English
nlp = English()
nlp = English(data_dir=data_dir)
doc = nlp('Hello, world. Here are two sentences.')

View File

@ -15,8 +15,9 @@ from .strings cimport hash_string
cimport cython
from . import util
from .util import read_lang_data
from .tokens.doc cimport Doc
from .util import read_lang_data
from .util import get_package
cdef class Tokenizer:
@ -40,6 +41,10 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
@classmethod
def load(cls, data_dir, Vocab vocab):
    """Build a Tokenizer from a data directory.

    The directory is wrapped with ``get_package`` and handed to
    ``from_package`` together with *vocab*.
    """
    package = get_package(data_dir)
    return cls.from_package(package, vocab=vocab)
@classmethod
def from_package(cls, package, Vocab vocab):
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)

View File

@ -2,23 +2,36 @@ import os
import io
import json
import re
import os.path
from sputnik import Sputnik
import six
import sputnik
from sputnik.dir_package import DirPackage
from sputnik.package_list import (PackageNotFoundException,
CompatiblePackageNotFoundException)
from . import about
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
def get_package(name=None, data_path=None):
if data_path is None:
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = os.path.abspath(
os.path.join(os.path.dirname(__file__), 'data'))
def get_package(data_dir):
    """Wrap a data directory in a ``DirPackage``.

    Args:
        data_dir: path of the directory, as a string.

    Returns:
        ``DirPackage(data_dir)``.

    Raises:
        RuntimeError: if *data_dir* is not a string.
    """
    if isinstance(data_dir, six.string_types):
        return DirPackage(data_dir)
    raise RuntimeError('data_dir must be a string')
sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version
pool = sputnik.pool(data_path)
return pool.get(name or 'en_default')
def get_package_by_name(name=None, via=None):
    """Look up an installed model package through sputnik.

    Args:
        name: package name; when falsy, ``about.__default_model__`` is used.
        via: forwarded to ``sputnik.package`` as ``data_path``.

    Returns:
        Whatever ``sputnik.package`` returns for the requested model.

    Raises:
        RuntimeError: if sputnik reports the package as missing or as
            incompatible; the message says how to (re)install the model.
    """
    model = name or about.__default_model__
    try:
        return sputnik.package(about.__name__, about.__version__, model,
                               data_path=via)
    except PackageNotFoundException:
        raise RuntimeError("Model not installed. Please run 'python -m "
                           "spacy.en.download' to install latest compatible "
                           "model.")
    except CompatiblePackageNotFoundException:
        raise RuntimeError("Installed model is not compatible with spaCy "
                           "version. Please run 'python -m spacy.en.download "
                           "--force' to install latest compatible model.")
def normalize_slice(length, start, stop, step=None):
@ -46,10 +59,13 @@ def utf8open(loc, mode='r'):
def read_lang_data(package):
tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
tokenization = package.load_json(('tokenizer', 'specials.json'))
with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
prefix = read_prefix(file_) if file_ is not None else None
with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
suffix = read_suffix(file_) if file_ is not None else None
with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
infix = read_infix(file_) if file_ is not None else None
return tokenization, prefix, suffix, infix

View File

@ -19,6 +19,7 @@ from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from .lemmatizer import Lemmatizer
from .util import get_package
from . import attrs
from . import symbols
@ -46,28 +47,28 @@ EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab:
'''A map container for a language's LexemeC structs.
'''
@classmethod
def load(cls, data_dir, get_lex_attr=None):
    """Build a Vocab from a data directory.

    The directory is wrapped with ``get_package`` and handed to
    ``from_package``; *get_lex_attr* is forwarded unchanged.
    """
    package = get_package(data_dir)
    return cls.from_package(package, get_lex_attr=get_lex_attr)
@classmethod
def from_package(cls, package, get_lex_attr=None):
tag_map = package.load_utf8(json.load,
'vocab', 'tag_map.json')
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
lemmatizer = Lemmatizer.from_package(package)
serializer_freqs = package.load_utf8(json.load,
'vocab', 'serializer.json',
require=False) # TODO: really optional?
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
if package.has_file('vocab', 'strings.json'): # TODO: really optional?
package.load_utf8(self.strings.load, 'vocab', 'strings.json')
with package.open(('vocab', 'strings.json')) as file_:
self.strings.load(file_)
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
if package.has_file('vocab', 'vec.bin'): # TODO: really optional?
if package.has_file('vocab', 'vec.bin'):
self.vectors_length = self.load_vectors_from_bin_loc(
package.file_path('vocab', 'vec.bin'))
return self
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):