mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
refactored data_dir->via, add zip_safe, add spacy.load()
This commit is contained in:
parent
04e67e8715
commit
788f734513
1
setup.py
1
setup.py
|
@ -260,6 +260,7 @@ def setup_package():
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='spacy',
|
name='spacy',
|
||||||
|
zip_safe=False,
|
||||||
packages=PACKAGES,
|
packages=PACKAGES,
|
||||||
package_data={'': ['*.pyx', '*.pxd']},
|
package_data={'': ['*.pyx', '*.pxd']},
|
||||||
description='Industrial-strength NLP',
|
description='Industrial-strength NLP',
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
from . import util
|
||||||
|
from .en import English
|
||||||
|
|
||||||
|
|
||||||
|
def load(name, via=None):
    """Load a model package by name and return an English pipeline for it.

    name -- package name passed to util.get_package_by_name, e.g.
            'en_default' or 'en_default==1.0.0'.
    via  -- optional value forwarded unchanged as the `via` keyword of
            util.get_package_by_name (None by default).
    """
    return English(util.get_package_by_name(name, via=via))
|
|
@ -1,3 +1,5 @@
|
||||||
|
from __future__ import print_function
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -37,21 +39,26 @@ def link(package, path):
|
||||||
force=("Force overwrite", "flag", "f", bool),
|
force=("Force overwrite", "flag", "f", bool),
|
||||||
)
|
)
|
||||||
def main(data_size='all', force=False):
|
def main(data_size='all', force=False):
|
||||||
|
package_name = 'en_default==1.0.4'
|
||||||
path = os.path.dirname(os.path.abspath(__file__))
|
path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
data_path = os.path.abspath(os.path.join(path, '..', 'data'))
|
|
||||||
if not os.path.isdir(data_path):
|
|
||||||
os.mkdir(data_path)
|
|
||||||
|
|
||||||
if force:
|
if force:
|
||||||
sputnik.purge('spacy', about.short_version, data_path=data_path)
|
sputnik.purge('spacy', about.short_version)
|
||||||
|
|
||||||
package = sputnik.install('spacy', about.short_version, 'en_default==1.0.4',
|
package = sputnik.install('spacy', about.short_version, package_name)
|
||||||
data_path=data_path)
|
|
||||||
|
# Verify that the package that was just installed can actually be resolved.
try:
    sputnik.package('spacy', about.short_version, package_name)
# Catch BOTH failure modes. The previous form `except A, B:` is Python 2
# syntax for `except A as B:` -- it handled only the first exception class
# (rebinding the second name) and is a SyntaxError on Python 3.
# NOTE(review): both exception classes must be importable in this module
# (e.g. from sputnik.package_list); the import is not visible in this hunk.
except (PackageNotFoundException, CompatiblePackageNotFoundException):
    print("Model failed to install. Please run 'python -m "
          "spacy.en.download --force'.", file=sys.stderr)
    # Non-zero exit status so callers/scripts can detect the failure.
    sys.exit(1)
|
||||||
|
|
||||||
# FIXME clean up old-style packages
|
# FIXME clean up old-style packages
|
||||||
migrate(path)
|
migrate(path)
|
||||||
|
|
||||||
|
print("Model successfully installed.", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
plac.call(main)
|
plac.call(main)
|
||||||
|
|
|
@ -8,6 +8,9 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
import sputnik
|
||||||
|
from sputnik.dir_package import DirPackage
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
from .syntax.parser import Parser
|
from .syntax.parser import Parser
|
||||||
|
@ -19,8 +22,9 @@ from . import orth
|
||||||
from .syntax.ner import BiluoPushDown
|
from .syntax.ner import BiluoPushDown
|
||||||
from .syntax.arc_eager import ArcEager
|
from .syntax.arc_eager import ArcEager
|
||||||
|
|
||||||
|
from . import about
|
||||||
|
from . import util
|
||||||
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
|
||||||
from .util import get_package
|
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
|
@ -137,9 +141,7 @@ class Language(object):
|
||||||
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vocab(cls, package=None, get_lex_attr=None):
|
def default_vocab(cls, package, get_lex_attr=None):
|
||||||
if package is None:
|
|
||||||
package = get_package()
|
|
||||||
if get_lex_attr is None:
|
if get_lex_attr is None:
|
||||||
get_lex_attr = cls.default_lex_attrs()
|
get_lex_attr = cls.default_lex_attrs()
|
||||||
return Vocab.load(package, get_lex_attr=get_lex_attr)
|
return Vocab.load(package, get_lex_attr=get_lex_attr)
|
||||||
|
@ -157,8 +159,8 @@ class Language(object):
|
||||||
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
via=None,
|
||||||
data_dir=None,
|
data_dir=None,
|
||||||
model=None,
|
|
||||||
vocab=None,
|
vocab=None,
|
||||||
tokenizer=None,
|
tokenizer=None,
|
||||||
tagger=None,
|
tagger=None,
|
||||||
|
@ -170,19 +172,34 @@ class Language(object):
|
||||||
"""
|
"""
|
||||||
a model can be specified:
|
a model can be specified:
|
||||||
|
|
||||||
1) by a path to the model directory (DEPRECATED)
|
1) by calling a Language subclass
|
||||||
- Language(data_dir='path/to/data')
|
- spacy.en.English()
|
||||||
|
|
||||||
2) by a language identifier (and optionally a package root dir)
|
2) by calling a Language subclass with via (previously: data_dir)
|
||||||
- Language(lang='en')
|
- spacy.en.English('my/model/root')
|
||||||
- Language(lang='en', data_dir='spacy/data')
|
- spacy.en.English(via='my/model/root')
|
||||||
|
|
||||||
3) by a model name/version (and optionally a package root dir)
|
3) by package name
|
||||||
- Language(model='en_default')
|
- spacy.load('en_default')
|
||||||
- Language(model='en_default ==1.0.0')
|
- spacy.load('en_default==1.0.0')
|
||||||
- Language(model='en_default <1.1.0, data_dir='spacy/data')
|
|
||||||
|
4) by package name with a relocated package base
|
||||||
|
- spacy.load('en_default', via='/my/package/root')
|
||||||
|
- spacy.load('en_default==1.0.0', via='/my/package/root')
|
||||||
|
|
||||||
|
5) by package object
|
||||||
|
- spacy.en.English(package)
|
||||||
"""
|
"""
|
||||||
package = get_package(model, data_path=data_dir)
|
|
||||||
|
if data_dir is not None and via is None:
|
||||||
|
warn("Use of data_dir is deprecated, use via instead.", DeprecationWarning)
|
||||||
|
via = data_dir
|
||||||
|
|
||||||
|
if via is None:
|
||||||
|
package = util.get_package_by_name('en_default==1.0.4')
|
||||||
|
else:
|
||||||
|
package = util.get_package(via)
|
||||||
|
|
||||||
if load_vectors is not True:
|
if load_vectors is not True:
|
||||||
warn("load_vectors is deprecated", DeprecationWarning)
|
warn("load_vectors is deprecated", DeprecationWarning)
|
||||||
if vocab in (None, True):
|
if vocab in (None, True):
|
||||||
|
|
|
@ -13,8 +13,8 @@ from .util import get_package
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file):
|
def load(cls, via):
|
||||||
pkg = get_package(pkg_or_str_or_file)
|
pkg = get_package(via)
|
||||||
index = {}
|
index = {}
|
||||||
exc = {}
|
exc = {}
|
||||||
for pos in ['adj', 'noun', 'verb']:
|
for pos in ['adj', 'noun', 'verb']:
|
||||||
|
|
|
@ -170,8 +170,8 @@ cdef class Matcher:
|
||||||
cdef object _patterns
|
cdef object _patterns
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
def load(cls, via, Vocab vocab):
|
||||||
package = get_package(pkg_or_str_or_file)
|
package = get_package(via)
|
||||||
patterns = package.load_json(('vocab', 'gazetteer.json'))
|
patterns = package.load_json(('vocab', 'gazetteer.json'))
|
||||||
return cls(vocab, patterns)
|
return cls(vocab, patterns)
|
||||||
|
|
||||||
|
|
|
@ -148,8 +148,8 @@ cdef class Tagger:
|
||||||
return cls(vocab, model)
|
return cls(vocab, model)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, vocab):
|
def load(cls, via, vocab):
|
||||||
pkg = get_package(pkg_or_str_or_file)
|
pkg = get_package(via)
|
||||||
# TODO: templates.json deprecated? not present in latest package
|
# TODO: templates.json deprecated? not present in latest package
|
||||||
templates = cls.default_templates()
|
templates = cls.default_templates()
|
||||||
# templates = package.load_utf8(json.load,
|
# templates = package.load_utf8(json.load,
|
||||||
|
|
|
@ -42,8 +42,8 @@ cdef class Tokenizer:
|
||||||
return (self.__class__, args, None, None)
|
return (self.__class__, args, None, None)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, Vocab vocab):
|
def load(cls, via, Vocab vocab):
|
||||||
pkg = get_package(pkg_or_str_or_file)
|
pkg = get_package(via)
|
||||||
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
|
rules, prefix_re, suffix_re, infix_re = read_lang_data(pkg)
|
||||||
prefix_re = re.compile(prefix_re)
|
prefix_re = re.compile(prefix_re)
|
||||||
suffix_re = re.compile(suffix_re)
|
suffix_re = re.compile(suffix_re)
|
||||||
|
|
|
@ -7,34 +7,29 @@ import os.path
|
||||||
import sputnik
|
import sputnik
|
||||||
from sputnik.dir_package import DirPackage
|
from sputnik.dir_package import DirPackage
|
||||||
from sputnik.package_stub import PackageStub
|
from sputnik.package_stub import PackageStub
|
||||||
from sputnik.package_list import PackageNotFoundException, CompatiblePackageNotFoundException
|
from sputnik.package_list import (PackageNotFoundException,
|
||||||
|
CompatiblePackageNotFoundException)
|
||||||
|
|
||||||
from . import about
|
from . import about
|
||||||
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
|
|
||||||
|
|
||||||
def get_package(value=None, data_path=None):
|
def get_package(via=None):
|
||||||
if data_path is None:
|
if isinstance(via, PackageStub):
|
||||||
if isinstance(value, PackageStub):
|
return via
|
||||||
return value
|
return DirPackage(via)
|
||||||
elif value and os.path.isdir(value):
|
|
||||||
return DirPackage(value)
|
|
||||||
|
|
||||||
elif value is None and data_path is not None:
|
|
||||||
return DirPackage(data_path)
|
|
||||||
|
|
||||||
|
def get_package_by_name(name, via=None):
|
||||||
try:
|
try:
|
||||||
return sputnik.package('spacy', about.short_version,
|
return sputnik.package('spacy', about.short_version, name, data_path=via)
|
||||||
value or 'en_default==1.0.4',
|
|
||||||
data_path=data_path)
|
|
||||||
|
|
||||||
except PackageNotFoundException as e:
|
except PackageNotFoundException as e:
|
||||||
raise RuntimeError("Model not installed. Please run 'python -m "
|
raise RuntimeError("Model not installed. Please run 'python -m "
|
||||||
"spacy.en.download' to install latest compatible "
|
"spacy.en.download' to install latest compatible "
|
||||||
"model.")
|
"model.")
|
||||||
except CompatiblePackageNotFoundException as e:
|
except CompatiblePackageNotFoundException as e:
|
||||||
raise RuntimeError("Installed model is not compatible with spaCy "
|
raise RuntimeError("Installed model is not compatible with spaCy "
|
||||||
"version. Please run 'python -m spacy.en.download' "
|
"version. Please run 'python -m spacy.en.download "
|
||||||
"--force' to install latest compatible model.")
|
"--force' to install latest compatible model.")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -48,8 +48,8 @@ cdef class Vocab:
|
||||||
'''A map container for a language's LexemeC structs.
|
'''A map container for a language's LexemeC structs.
|
||||||
'''
|
'''
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, pkg_or_str_or_file, get_lex_attr=None):
|
def load(cls, via, get_lex_attr=None):
|
||||||
package = get_package(pkg_or_str_or_file)
|
package = get_package(via)
|
||||||
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
|
||||||
|
|
||||||
lemmatizer = Lemmatizer.load(package)
|
lemmatizer = Lemmatizer.load(package)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user