mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
* Fix merge conflict in requirements.txt
This commit is contained in:
commit
3dc398b727
|
@ -61,7 +61,7 @@ build_script:
|
||||||
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
|
- "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data"
|
||||||
- "cp package.json data"
|
- "cp package.json data"
|
||||||
- "%CMD_IN_ENV% sputnik build data en_default.sputnik"
|
- "%CMD_IN_ENV% sputnik build data en_default.sputnik"
|
||||||
- "%CMD_IN_ENV% sputnik install en_default.sputnik"
|
- "%CMD_IN_ENV% sputnik --name spacy install en_default.sputnik"
|
||||||
|
|
||||||
test_script:
|
test_script:
|
||||||
# Run the project tests
|
# Run the project tests
|
||||||
|
|
|
@ -31,7 +31,7 @@ install:
|
||||||
- "python bin/init_model.py en lang_data/ corpora/ data"
|
- "python bin/init_model.py en lang_data/ corpora/ data"
|
||||||
- "cp package.json data"
|
- "cp package.json data"
|
||||||
- "sputnik build data en_default.sputnik"
|
- "sputnik build data en_default.sputnik"
|
||||||
- "sputnik install en_default.sputnik"
|
- "sputnik --name spacy install en_default.sputnik"
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- python build.py $MODE;
|
- python build.py $MODE;
|
||||||
|
|
23
package.json
23
package.json
|
@ -1,17 +1,14 @@
|
||||||
{
|
{
|
||||||
"name": "en_default",
|
"name": "en_test",
|
||||||
"version": "0.100.0",
|
"version": "1.0.0",
|
||||||
"description": "english default model",
|
"description": "english test model",
|
||||||
"license": "public domain",
|
"license": "public domain",
|
||||||
"include": [
|
"include": [
|
||||||
"deps/*",
|
["deps", "*"],
|
||||||
"ner/*",
|
["ner", "*"],
|
||||||
"pos/*",
|
["pos", "*"],
|
||||||
"tokenizer/*",
|
["tokenizer", "*"],
|
||||||
"vocab/*",
|
["vocab", "*"],
|
||||||
"wordnet/*"
|
["wordnet", "*"]
|
||||||
],
|
]
|
||||||
"compatibility": {
|
|
||||||
"spacy": "==0.100.0"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,4 +10,4 @@ plac
|
||||||
six
|
six
|
||||||
ujson
|
ujson
|
||||||
cloudpickle
|
cloudpickle
|
||||||
sputnik==0.7.*
|
sputnik>=0.7.0,<0.8.0
|
||||||
|
|
97
setup.py
97
setup.py
|
@ -1,5 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from __future__ import division, print_function
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -14,13 +14,6 @@ except ImportError:
|
||||||
from distutils.core import Extension, setup
|
from distutils.core import Extension, setup
|
||||||
|
|
||||||
|
|
||||||
MAJOR = 0
|
|
||||||
MINOR = 100
|
|
||||||
MICRO = 0
|
|
||||||
ISRELEASED = False
|
|
||||||
VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)
|
|
||||||
|
|
||||||
|
|
||||||
PACKAGES = [
|
PACKAGES = [
|
||||||
'spacy',
|
'spacy',
|
||||||
'spacy.tokens',
|
'spacy.tokens',
|
||||||
|
@ -103,73 +96,6 @@ class build_ext_subclass(build_ext, build_ext_options):
|
||||||
build_ext.build_extensions(self)
|
build_ext.build_extensions(self)
|
||||||
|
|
||||||
|
|
||||||
# Return the git revision as a string
|
|
||||||
def git_version():
|
|
||||||
def _minimal_ext_cmd(cmd):
|
|
||||||
# construct minimal environment
|
|
||||||
env = {}
|
|
||||||
for k in ['SYSTEMROOT', 'PATH']:
|
|
||||||
v = os.environ.get(k)
|
|
||||||
if v is not None:
|
|
||||||
env[k] = v
|
|
||||||
# LANGUAGE is used on win32
|
|
||||||
env['LANGUAGE'] = 'C'
|
|
||||||
env['LANG'] = 'C'
|
|
||||||
env['LC_ALL'] = 'C'
|
|
||||||
out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0]
|
|
||||||
return out
|
|
||||||
|
|
||||||
try:
|
|
||||||
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
|
|
||||||
GIT_REVISION = out.strip().decode('ascii')
|
|
||||||
except OSError:
|
|
||||||
GIT_REVISION = 'Unknown'
|
|
||||||
|
|
||||||
return GIT_REVISION
|
|
||||||
|
|
||||||
|
|
||||||
def get_version_info():
|
|
||||||
# Adding the git rev number needs to be done inside write_version_py(),
|
|
||||||
# otherwise the import of spacy.about messes up the build under Python 3.
|
|
||||||
FULLVERSION = VERSION
|
|
||||||
if os.path.exists('.git'):
|
|
||||||
GIT_REVISION = git_version()
|
|
||||||
elif os.path.exists(os.path.join('spacy', 'about.py')):
|
|
||||||
# must be a source distribution, use existing version file
|
|
||||||
try:
|
|
||||||
from spacy.about import git_revision as GIT_REVISION
|
|
||||||
except ImportError:
|
|
||||||
raise ImportError('Unable to import git_revision. Try removing '
|
|
||||||
'spacy/about.py and the build directory '
|
|
||||||
'before building.')
|
|
||||||
else:
|
|
||||||
GIT_REVISION = 'Unknown'
|
|
||||||
|
|
||||||
if not ISRELEASED:
|
|
||||||
FULLVERSION += '.dev0+' + GIT_REVISION[:7]
|
|
||||||
|
|
||||||
return FULLVERSION, GIT_REVISION
|
|
||||||
|
|
||||||
|
|
||||||
def write_version(path):
|
|
||||||
cnt = """# THIS FILE IS GENERATED FROM SPACY SETUP.PY
|
|
||||||
short_version = '%(version)s'
|
|
||||||
version = '%(version)s'
|
|
||||||
full_version = '%(full_version)s'
|
|
||||||
git_revision = '%(git_revision)s'
|
|
||||||
release = %(isrelease)s
|
|
||||||
if not release:
|
|
||||||
version = full_version
|
|
||||||
"""
|
|
||||||
FULLVERSION, GIT_REVISION = get_version_info()
|
|
||||||
|
|
||||||
with open(path, 'w') as f:
|
|
||||||
f.write(cnt % {'version': VERSION,
|
|
||||||
'full_version' : FULLVERSION,
|
|
||||||
'git_revision' : GIT_REVISION,
|
|
||||||
'isrelease': str(ISRELEASED)})
|
|
||||||
|
|
||||||
|
|
||||||
def generate_cython(root, source):
|
def generate_cython(root, source):
|
||||||
print('Cythonizing sources')
|
print('Cythonizing sources')
|
||||||
p = subprocess.call([sys.executable,
|
p = subprocess.call([sys.executable,
|
||||||
|
@ -241,7 +167,9 @@ def setup_package():
|
||||||
return clean(root)
|
return clean(root)
|
||||||
|
|
||||||
with chdir(root):
|
with chdir(root):
|
||||||
write_version(os.path.join(root, 'spacy', 'about.py'))
|
about = {}
|
||||||
|
with open(os.path.join(root, "spacy", "about.py")) as f:
|
||||||
|
exec(f.read(), about)
|
||||||
|
|
||||||
include_dirs = [
|
include_dirs = [
|
||||||
get_python_inc(plat_specific=True),
|
get_python_inc(plat_specific=True),
|
||||||
|
@ -259,19 +187,20 @@ def setup_package():
|
||||||
prepare_includes(root)
|
prepare_includes(root)
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='spacy',
|
name=about['__name__'],
|
||||||
|
zip_safe=False,
|
||||||
packages=PACKAGES,
|
packages=PACKAGES,
|
||||||
package_data={'': ['*.pyx', '*.pxd']},
|
package_data={'': ['*.pyx', '*.pxd']},
|
||||||
description='Industrial-strength NLP',
|
description=about['__summary__'],
|
||||||
author='Matthew Honnibal',
|
author=about['__author__'],
|
||||||
author_email='matt@spacy.io',
|
author_email=about['__email__'],
|
||||||
version=VERSION,
|
version=about['__version__'],
|
||||||
url='https://spacy.io',
|
url=about['__uri__'],
|
||||||
license='MIT',
|
license=about['__license__'],
|
||||||
ext_modules=ext_modules,
|
ext_modules=ext_modules,
|
||||||
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
|
install_requires=['numpy', 'murmurhash>=0.26,<0.27', 'cymem>=1.30,<1.31', 'preshed>=0.46.1,<0.47',
|
||||||
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
|
'thinc>=4.2.0,<4.3.0', 'text_unidecode', 'plac', 'six',
|
||||||
'ujson', 'cloudpickle', 'sputnik>=0.6.4,<0.7.0'],
|
'ujson', 'cloudpickle', 'sputnik>=0.7.0,<0.8.0'],
|
||||||
cmdclass = {
|
cmdclass = {
|
||||||
'build_ext': build_ext_subclass},
|
'build_ext': build_ext_subclass},
|
||||||
)
|
)
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
from . import util
|
||||||
|
from .en import English
|
||||||
|
|
||||||
|
|
||||||
|
def load(name, via=None):
|
||||||
|
package = util.get_package_by_name(name, via=via)
|
||||||
|
return English(package=package)
|
14
spacy/about.py
Normal file
14
spacy/about.py
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# inspired from:
|
||||||
|
|
||||||
|
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
|
||||||
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
|
||||||
|
__name__ = 'spacy'
|
||||||
|
__version__ = '0.100.0'
|
||||||
|
__summary__ = 'Industrial-strength NLP'
|
||||||
|
__uri__ = 'https://spacy.io'
|
||||||
|
__author__ = 'Matthew Honnibal'
|
||||||
|
__email__ = 'matt@spacy.io'
|
||||||
|
__license__ = 'MIT'
|
||||||
|
__release__ = False
|
||||||
|
__default_model__ = 'en_default==1.0.4'
|
|
@ -1,9 +1,15 @@
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from sputnik import Sputnik
|
import sputnik
|
||||||
|
from sputnik.package_list import (PackageNotFoundException,
|
||||||
|
CompatiblePackageNotFoundException)
|
||||||
|
|
||||||
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def migrate(path):
|
def migrate(path):
|
||||||
|
@ -18,43 +24,34 @@ def migrate(path):
|
||||||
os.unlink(os.path.join(path, filename))
|
os.unlink(os.path.join(path, filename))
|
||||||
|
|
||||||
|
|
||||||
def link(package, path):
|
|
||||||
if os.path.exists(path):
|
|
||||||
if os.path.isdir(path):
|
|
||||||
shutil.rmtree(path)
|
|
||||||
else:
|
|
||||||
os.unlink(path)
|
|
||||||
|
|
||||||
if not hasattr(os, 'symlink'): # not supported by win+py27
|
|
||||||
shutil.copytree(package.dir_path('data'), path)
|
|
||||||
else:
|
|
||||||
os.symlink(package.dir_path('data'), path)
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
force=("Force overwrite", "flag", "f", bool),
|
force=("Force overwrite", "flag", "f", bool),
|
||||||
)
|
)
|
||||||
def main(data_size='all', force=False):
|
def main(data_size='all', force=False):
|
||||||
# TODO read version from the same source as the setup
|
|
||||||
sputnik = Sputnik('spacy', '0.100.0', console=sys.stdout)
|
|
||||||
|
|
||||||
path = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
|
|
||||||
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
|
|
||||||
if not os.path.isdir(data_path):
|
|
||||||
os.mkdir(data_path)
|
|
||||||
|
|
||||||
command = sputnik.command(
|
|
||||||
data_path=data_path,
|
|
||||||
repository_url='https://index.spacy.io')
|
|
||||||
|
|
||||||
if force:
|
if force:
|
||||||
command.purge()
|
sputnik.purge(about.__name__, about.__version__)
|
||||||
|
|
||||||
package = command.install('en_default')
|
try:
|
||||||
|
sputnik.package(about.__name__, about.__version__, about.__default_model__)
|
||||||
|
print("Model already installed. Please run 'python -m "
|
||||||
|
"spacy.en.download --force' to reinstall.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
except (PackageNotFoundException, CompatiblePackageNotFoundException):
|
||||||
|
pass
|
||||||
|
|
||||||
|
package = sputnik.install(about.__name__, about.__version__, about.__default_model__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
sputnik.package(about.__name__, about.__version__, about.__default_model__)
|
||||||
|
except (PackageNotFoundException, CompatiblePackageNotFoundException):
|
||||||
|
print("Model failed to install. Please run 'python -m "
|
||||||
|
"spacy.en.download --force'.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# FIXME clean up old-style packages
|
# FIXME clean up old-style packages
|
||||||
migrate(path)
|
migrate(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
print("Model successfully installed.", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -19,8 +19,8 @@ from . import orth
|
||||||
from .syntax.ner import BiluoPushDown
|
from .syntax.ner import BiluoPushDown
|
||||||
from .syntax.arc_eager import ArcEager
|
from .syntax.arc_eager import ArcEager
|
||||||
|
|
||||||
|
from . import util
|
||||||
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
||||||
from .util import get_package
|
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
|
@ -137,28 +137,25 @@ class Language(object):
|
||||||
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vocab(cls, package=None, get_lex_attr=None):
|
def default_vocab(cls, package, get_lex_attr=None):
|
||||||
if package is None:
|
|
||||||
package = get_package()
|
|
||||||
if get_lex_attr is None:
|
if get_lex_attr is None:
|
||||||
get_lex_attr = cls.default_lex_attrs()
|
get_lex_attr = cls.default_lex_attrs()
|
||||||
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
|
return Vocab.from_package(package, get_lex_attr=get_lex_attr)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_parser(cls, package, vocab):
|
def default_parser(cls, package, vocab):
|
||||||
data_dir = package.dir_path('deps', require=False)
|
data_dir = package.dir_path('deps')
|
||||||
if data_dir and path.exists(data_dir):
|
if data_dir and path.exists(data_dir):
|
||||||
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
|
return Parser.from_dir(data_dir, vocab.strings, ArcEager)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_entity(cls, package, vocab):
|
def default_entity(cls, package, vocab):
|
||||||
data_dir = package.dir_path('ner', require=False)
|
data_dir = package.dir_path('ner')
|
||||||
if data_dir and path.exists(data_dir):
|
if data_dir and path.exists(data_dir):
|
||||||
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
data_dir=None,
|
data_dir=None,
|
||||||
model=None,
|
|
||||||
vocab=None,
|
vocab=None,
|
||||||
tokenizer=None,
|
tokenizer=None,
|
||||||
tagger=None,
|
tagger=None,
|
||||||
|
@ -166,48 +163,36 @@ class Language(object):
|
||||||
entity=None,
|
entity=None,
|
||||||
matcher=None,
|
matcher=None,
|
||||||
serializer=None,
|
serializer=None,
|
||||||
load_vectors=True):
|
load_vectors=True,
|
||||||
|
package=None):
|
||||||
"""
|
"""
|
||||||
a model can be specified:
|
a model can be specified:
|
||||||
|
|
||||||
1) by a path to the model directory (DEPRECATED)
|
1) by calling a Language subclass
|
||||||
- Language(data_dir='path/to/data')
|
- spacy.en.English()
|
||||||
|
|
||||||
2) by a language identifier (and optionally a package root dir)
|
2) by calling a Language subclass with data_dir
|
||||||
- Language(lang='en')
|
- spacy.en.English('my/model/root')
|
||||||
- Language(lang='en', data_dir='spacy/data')
|
- spacy.en.English(data_dir='my/model/root')
|
||||||
|
|
||||||
3) by a model name/version (and optionally a package root dir)
|
3) by package name
|
||||||
- Language(model='en_default')
|
- spacy.load('en_default')
|
||||||
- Language(model='en_default ==1.0.0')
|
- spacy.load('en_default==1.0.0')
|
||||||
- Language(model='en_default <1.1.0, data_dir='spacy/data')
|
|
||||||
|
4) by package name with a relocated package base
|
||||||
|
- spacy.load('en_default', via='/my/package/root')
|
||||||
|
- spacy.load('en_default==1.0.0', via='/my/package/root')
|
||||||
"""
|
"""
|
||||||
# support non-package data dirs
|
|
||||||
if data_dir and path.exists(path.join(data_dir, 'vocab')):
|
|
||||||
class Package(object):
|
|
||||||
def __init__(self, root):
|
|
||||||
self.root = root
|
|
||||||
|
|
||||||
def has_file(self, *path_parts):
|
if package is None:
|
||||||
return path.exists(path.join(self.root, *path_parts))
|
if data_dir is None:
|
||||||
|
package = util.get_package_by_name()
|
||||||
def file_path(self, *path_parts, **kwargs):
|
|
||||||
return path.join(self.root, *path_parts)
|
|
||||||
|
|
||||||
def dir_path(self, *path_parts, **kwargs):
|
|
||||||
return path.join(self.root, *path_parts)
|
|
||||||
|
|
||||||
def load_utf8(self, func, *path_parts, **kwargs):
|
|
||||||
with io.open(self.file_path(path.join(*path_parts)),
|
|
||||||
mode='r', encoding='utf8') as f:
|
|
||||||
return func(f)
|
|
||||||
|
|
||||||
warn("using non-package data_dir", DeprecationWarning)
|
|
||||||
package = Package(data_dir)
|
|
||||||
else:
|
else:
|
||||||
package = get_package(name=model, data_path=data_dir)
|
package = util.get_package(data_dir)
|
||||||
|
|
||||||
if load_vectors is not True:
|
if load_vectors is not True:
|
||||||
warn("load_vectors is deprecated", DeprecationWarning)
|
warn("load_vectors is deprecated", DeprecationWarning)
|
||||||
|
|
||||||
if vocab in (None, True):
|
if vocab in (None, True):
|
||||||
vocab = self.default_vocab(package)
|
vocab = self.default_vocab(package)
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
|
@ -230,7 +215,6 @@ class Language(object):
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
args = (
|
args = (
|
||||||
None, # data_dir
|
None, # data_dir
|
||||||
None, # model
|
|
||||||
self.vocab,
|
self.vocab,
|
||||||
self.tokenizer,
|
self.tokenizer,
|
||||||
self.tagger,
|
self.tagger,
|
||||||
|
|
|
@ -8,25 +8,24 @@ except ImportError:
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
|
from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
|
||||||
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package):
|
def load(cls, via):
|
||||||
|
return cls.from_package(get_package(via))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_package(cls, pkg):
|
||||||
index = {}
|
index = {}
|
||||||
exc = {}
|
exc = {}
|
||||||
for pos in ['adj', 'noun', 'verb']:
|
for pos in ['adj', 'noun', 'verb']:
|
||||||
index[pos] = package.load_utf8(read_index,
|
with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_:
|
||||||
'wordnet', 'index.%s' % pos,
|
index[pos] = read_index(file_) if file_ is not None else set()
|
||||||
default=set()) # TODO: really optional?
|
with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_:
|
||||||
exc[pos] = package.load_utf8(read_exc,
|
exc[pos] = read_exc(file_) if file_ is not None else {}
|
||||||
'wordnet', '%s.exc' % pos,
|
rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={})
|
||||||
default={}) # TODO: really optional?
|
|
||||||
|
|
||||||
rules = package.load_utf8(json.load,
|
|
||||||
'vocab', 'lemma_rules.json',
|
|
||||||
default={}) # TODO: really optional?
|
|
||||||
|
|
||||||
return cls(index, exc, rules)
|
return cls(index, exc, rules)
|
||||||
|
|
||||||
def __init__(self, index, exceptions, rules):
|
def __init__(self, index, exceptions, rules):
|
||||||
|
|
|
@ -21,6 +21,7 @@ from .tokens.doc cimport Doc
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
|
|
||||||
from .attrs import FLAG61 as U_ENT
|
from .attrs import FLAG61 as U_ENT
|
||||||
|
from .util import get_package
|
||||||
|
|
||||||
from .attrs import FLAG60 as B2_ENT
|
from .attrs import FLAG60 as B2_ENT
|
||||||
from .attrs import FLAG59 as B3_ENT
|
from .attrs import FLAG59 as B3_ENT
|
||||||
|
@ -168,11 +169,13 @@ cdef class Matcher:
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
cdef object _patterns
|
cdef object _patterns
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, data_dir, Vocab vocab):
|
||||||
|
return cls.from_package(get_package(data_dir), vocab=vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package, Vocab vocab):
|
def from_package(cls, package, Vocab vocab):
|
||||||
patterns = package.load_utf8(json.load,
|
patterns = package.load_json(('vocab', 'gazetteer.json'))
|
||||||
'vocab', 'gazetteer.json',
|
|
||||||
default={}) # TODO: really optional?
|
|
||||||
return cls(vocab, patterns)
|
return cls(vocab, patterns)
|
||||||
|
|
||||||
def __init__(self, vocab, patterns):
|
def __init__(self, vocab, patterns):
|
||||||
|
|
|
@ -89,6 +89,13 @@ cdef class Parser:
|
||||||
model.load(path.join(model_dir, 'model'))
|
model.load(path.join(model_dir, 'model'))
|
||||||
return cls(strings, moves, model)
|
return cls(strings, moves, model)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, pkg_or_str_or_file, vocab):
|
||||||
|
# TODO
|
||||||
|
raise NotImplementedError(
|
||||||
|
"This should be here, but isn't yet =/. Use Parser.from_dir")
|
||||||
|
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
return (Parser, (self.moves.strings, self.moves, self.model), None, None)
|
return (Parser, (self.moves.strings, self.moves, self.model), None, None)
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,8 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
|
||||||
|
|
||||||
from .attrs cimport *
|
from .attrs cimport *
|
||||||
|
|
||||||
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
P2_orth
|
P2_orth
|
||||||
|
@ -146,7 +148,11 @@ cdef class Tagger:
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package, vocab):
|
def load(cls, data_dir, vocab):
|
||||||
|
return cls.from_package(get_package(data_dir), vocab=vocab)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_package(cls, pkg, vocab):
|
||||||
# TODO: templates.json deprecated? not present in latest package
|
# TODO: templates.json deprecated? not present in latest package
|
||||||
templates = cls.default_templates()
|
templates = cls.default_templates()
|
||||||
# templates = package.load_utf8(json.load,
|
# templates = package.load_utf8(json.load,
|
||||||
|
@ -156,8 +162,9 @@ cdef class Tagger:
|
||||||
model = TaggerModel(vocab.morphology.n_tags,
|
model = TaggerModel(vocab.morphology.n_tags,
|
||||||
ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
|
ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
|
||||||
|
|
||||||
if package.has_file('pos', 'model'): # TODO: really optional?
|
|
||||||
model.load(package.file_path('pos', 'model'))
|
if pkg.has_file('pos', 'model'): # TODO: really optional?
|
||||||
|
model.load(pkg.file_path('pos', 'model'))
|
||||||
|
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,17 @@
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def EN():
|
def EN():
|
||||||
return English()
|
if os.environ.get('SPACY_DATA'):
|
||||||
|
data_dir = os.environ.get('SPACY_DATA')
|
||||||
|
else:
|
||||||
|
data_dir = None
|
||||||
|
print("Load EN from %s" % data_dir)
|
||||||
|
return English(data_dir=data_dir)
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(parser):
|
def pytest_addoption(parser):
|
||||||
|
|
|
@ -11,7 +11,9 @@ from spacy.vocab import Vocab
|
||||||
from spacy.tokens.doc import Doc
|
from spacy.tokens.doc import Doc
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
from os import path
|
from os import path
|
||||||
|
import os
|
||||||
|
|
||||||
|
from spacy import util
|
||||||
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
|
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
|
||||||
from spacy.serialize.packer import Packer
|
from spacy.serialize.packer import Packer
|
||||||
|
|
||||||
|
@ -20,7 +22,13 @@ from spacy.serialize.bits import BitArray
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def vocab():
|
def vocab():
|
||||||
vocab = English.default_vocab()
|
data_dir = os.environ.get('SPACY_DATA')
|
||||||
|
if data_dir is None:
|
||||||
|
package = util.get_package_by_name()
|
||||||
|
else:
|
||||||
|
package = util.get_package(data_dir)
|
||||||
|
|
||||||
|
vocab = English.default_vocab(package=package)
|
||||||
lex = vocab['dog']
|
lex = vocab['dog']
|
||||||
assert vocab[vocab.strings['dog']].orth_ == 'dog'
|
assert vocab[vocab.strings['dog']].orth_ == 'dog'
|
||||||
lex = vocab['the']
|
lex = vocab['the']
|
||||||
|
|
|
@ -1,17 +1,22 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
import os
|
||||||
import io
|
import io
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
||||||
from spacy.util import get_package
|
from spacy import util
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def package():
|
def package():
|
||||||
return get_package()
|
data_dir = os.environ.get('SPACY_DATA')
|
||||||
|
if data_dir is None:
|
||||||
|
return util.get_package_by_name()
|
||||||
|
else:
|
||||||
|
return util.get_package(data_dir)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -20,14 +25,16 @@ def lemmatizer(package):
|
||||||
|
|
||||||
|
|
||||||
def test_read_index(package):
|
def test_read_index(package):
|
||||||
index = package.load_utf8(read_index, 'wordnet', 'index.noun')
|
with package.open(('wordnet', 'index.noun')) as file_:
|
||||||
|
index = read_index(file_)
|
||||||
assert 'man' in index
|
assert 'man' in index
|
||||||
assert 'plantes' not in index
|
assert 'plantes' not in index
|
||||||
assert 'plant' in index
|
assert 'plant' in index
|
||||||
|
|
||||||
|
|
||||||
def test_read_exc(package):
|
def test_read_exc(package):
|
||||||
exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc')
|
with package.open(('wordnet', 'verb.exc')) as file_:
|
||||||
|
exc = read_exc(file_)
|
||||||
assert exc['was'] == ('be',)
|
assert exc['was'] == ('be',)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -50,6 +50,7 @@ def test_punct(en_tokenizer):
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_therell(en_tokenizer):
|
def test_therell(en_tokenizer):
|
||||||
tokens = en_tokenizer("there'll")
|
tokens = en_tokenizer("there'll")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
|
@ -6,7 +6,11 @@ import os
|
||||||
@pytest.fixture(scope='session')
|
@pytest.fixture(scope='session')
|
||||||
def nlp():
|
def nlp():
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
return English()
|
if os.environ.get('SPACY_DATA'):
|
||||||
|
data_dir = os.environ.get('SPACY_DATA')
|
||||||
|
else:
|
||||||
|
data_dir = None
|
||||||
|
return English(data_dir=data_dir)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture()
|
@pytest.fixture()
|
||||||
|
|
|
@ -10,8 +10,14 @@ def token(doc):
|
||||||
|
|
||||||
|
|
||||||
def test_load_resources_and_process_text():
|
def test_load_resources_and_process_text():
|
||||||
|
if os.environ.get('SPACY_DATA'):
|
||||||
|
data_dir = os.environ.get('SPACY_DATA')
|
||||||
|
else:
|
||||||
|
data_dir = None
|
||||||
|
print("Load EN from %s" % data_dir)
|
||||||
|
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
nlp = English()
|
nlp = English(data_dir=data_dir)
|
||||||
doc = nlp('Hello, world. Here are two sentences.')
|
doc = nlp('Hello, world. Here are two sentences.')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,9 @@ from .strings cimport hash_string
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from . import util
|
from . import util
|
||||||
from .util import read_lang_data
|
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
from .util import read_lang_data
|
||||||
|
from .util import get_package
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
|
@ -40,6 +41,10 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
return (self.__class__, args, None, None)
|
return (self.__class__, args, None, None)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, data_dir, Vocab vocab):
|
||||||
|
return cls.from_package(get_package(data_dir), vocab=vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package, Vocab vocab):
|
def from_package(cls, package, Vocab vocab):
|
||||||
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
|
rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
|
||||||
|
|
|
@ -2,23 +2,36 @@ import os
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
import os.path
|
||||||
|
|
||||||
from sputnik import Sputnik
|
import six
|
||||||
|
import sputnik
|
||||||
|
from sputnik.dir_package import DirPackage
|
||||||
|
from sputnik.package_list import (PackageNotFoundException,
|
||||||
|
CompatiblePackageNotFoundException)
|
||||||
|
|
||||||
|
from . import about
|
||||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
|
|
||||||
def get_package(name=None, data_path=None):
|
def get_package(data_dir):
|
||||||
if data_path is None:
|
if not isinstance(data_dir, six.string_types):
|
||||||
if os.environ.get('SPACY_DATA'):
|
raise RuntimeError('data_dir must be a string')
|
||||||
data_path = os.environ.get('SPACY_DATA')
|
return DirPackage(data_dir)
|
||||||
else:
|
|
||||||
data_path = os.path.abspath(
|
|
||||||
os.path.join(os.path.dirname(__file__), 'data'))
|
|
||||||
|
|
||||||
sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version
|
|
||||||
pool = sputnik.pool(data_path)
|
def get_package_by_name(name=None, via=None):
|
||||||
return pool.get(name or 'en_default')
|
try:
|
||||||
|
return sputnik.package(about.__name__, about.__version__,
|
||||||
|
name or about.__default_model__, data_path=via)
|
||||||
|
except PackageNotFoundException as e:
|
||||||
|
raise RuntimeError("Model not installed. Please run 'python -m "
|
||||||
|
"spacy.en.download' to install latest compatible "
|
||||||
|
"model.")
|
||||||
|
except CompatiblePackageNotFoundException as e:
|
||||||
|
raise RuntimeError("Installed model is not compatible with spaCy "
|
||||||
|
"version. Please run 'python -m spacy.en.download "
|
||||||
|
"--force' to install latest compatible model.")
|
||||||
|
|
||||||
|
|
||||||
def normalize_slice(length, start, stop, step=None):
|
def normalize_slice(length, start, stop, step=None):
|
||||||
|
@ -46,10 +59,13 @@ def utf8open(loc, mode='r'):
|
||||||
|
|
||||||
|
|
||||||
def read_lang_data(package):
|
def read_lang_data(package):
|
||||||
tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json')
|
tokenization = package.load_json(('tokenizer', 'specials.json'))
|
||||||
prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt')
|
with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
|
||||||
suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt')
|
prefix = read_prefix(file_) if file_ is not None else None
|
||||||
infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt')
|
with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
|
||||||
|
suffix = read_suffix(file_) if file_ is not None else None
|
||||||
|
with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
|
||||||
|
infix = read_infix(file_) if file_ is not None else None
|
||||||
return tokenization, prefix, suffix, infix
|
return tokenization, prefix, suffix, infix
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ from .orth cimport word_shape
|
||||||
from .typedefs cimport attr_t
|
from .typedefs cimport attr_t
|
||||||
from .cfile cimport CFile
|
from .cfile cimport CFile
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
from .util import get_package
|
||||||
|
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from . import symbols
|
from . import symbols
|
||||||
|
@ -46,28 +47,28 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
'''A map container for a language's LexemeC structs.
|
'''A map container for a language's LexemeC structs.
|
||||||
'''
|
'''
|
||||||
|
@classmethod
|
||||||
|
def load(cls, data_dir, get_lex_attr=None):
|
||||||
|
return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_package(cls, package, get_lex_attr=None):
|
def from_package(cls, package, get_lex_attr=None):
|
||||||
tag_map = package.load_utf8(json.load,
|
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
||||||
'vocab', 'tag_map.json')
|
|
||||||
|
|
||||||
lemmatizer = Lemmatizer.from_package(package)
|
lemmatizer = Lemmatizer.from_package(package)
|
||||||
|
|
||||||
serializer_freqs = package.load_utf8(json.load,
|
serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
|
||||||
'vocab', 'serializer.json',
|
|
||||||
require=False) # TODO: really optional?
|
|
||||||
|
|
||||||
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
|
cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
|
||||||
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
|
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
|
||||||
|
|
||||||
if package.has_file('vocab', 'strings.json'): # TODO: really optional?
|
with package.open(('vocab', 'strings.json')) as file_:
|
||||||
package.load_utf8(self.strings.load, 'vocab', 'strings.json')
|
self.strings.load(file_)
|
||||||
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
|
self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
|
||||||
|
|
||||||
if package.has_file('vocab', 'vec.bin'): # TODO: really optional?
|
if package.has_file('vocab', 'vec.bin'):
|
||||||
self.vectors_length = self.load_vectors_from_bin_loc(
|
self.vectors_length = self.load_vectors_from_bin_loc(
|
||||||
package.file_path('vocab', 'vec.bin'))
|
package.file_path('vocab', 'vec.bin'))
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
|
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user