mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c044e9c21c
|
@ -9,7 +9,7 @@ from . import util
|
||||||
|
|
||||||
def load(name, **overrides):
|
def load(name, **overrides):
|
||||||
name = resolve_load_name(name, **overrides)
|
name = resolve_load_name(name, **overrides)
|
||||||
return util.load_model(name)
|
return util.load_model(name, **overrides)
|
||||||
|
|
||||||
|
|
||||||
def info(model=None, markdown=False):
|
def info(model=None, markdown=False):
|
||||||
|
|
|
@ -14,4 +14,4 @@ __docs_models__ = 'https://spacy.io/docs/usage/models'
|
||||||
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
|
||||||
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
|
||||||
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
|
||||||
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
|
__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/v2/templates/model/'
|
||||||
|
|
|
@ -36,7 +36,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
|
||||||
|
|
||||||
template_setup = get_template('setup.py')
|
template_setup = get_template('setup.py')
|
||||||
template_manifest = get_template('MANIFEST.in')
|
template_manifest = get_template('MANIFEST.in')
|
||||||
template_init = get_template('en_model_name/__init__.py')
|
template_init = get_template('xx_model_name/__init__.py')
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / 'meta.json'
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
prints(meta_path, title="Reading meta.json from file")
|
prints(meta_path, title="Reading meta.json from file")
|
||||||
|
|
|
@ -85,10 +85,11 @@ def ensure_path(path):
|
||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
def load_model(name):
|
def load_model(name, **overrides):
|
||||||
"""Load a model from a shortcut link, package or data path.
|
"""Load a model from a shortcut link, package or data path.
|
||||||
|
|
||||||
name (unicode): Package name, shortcut link or model path.
|
name (unicode): Package name, shortcut link or model path.
|
||||||
|
**overrides: Specific overrides, like pipeline components to disable.
|
||||||
RETURNS (Language): `Language` class with the loaded model.
|
RETURNS (Language): `Language` class with the loaded model.
|
||||||
"""
|
"""
|
||||||
data_path = get_data_path()
|
data_path = get_data_path()
|
||||||
|
@ -96,73 +97,63 @@ def load_model(name):
|
||||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||||
if isinstance(name, basestring_):
|
if isinstance(name, basestring_):
|
||||||
if (data_path / name).exists(): # in data dir or shortcut
|
if (data_path / name).exists(): # in data dir or shortcut
|
||||||
return load_model_from_path(data_path / name)
|
spec = importlib.util.spec_from_file_location('model', data_path / name)
|
||||||
|
cls = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(cls)
|
||||||
|
return cls.load(**overrides)
|
||||||
if is_package(name): # installed as package
|
if is_package(name): # installed as package
|
||||||
return load_model_from_pkg(name)
|
cls = importlib.import_module(name)
|
||||||
|
return cls.load(**overrides)
|
||||||
if Path(name).exists(): # path to model data directory
|
if Path(name).exists(): # path to model data directory
|
||||||
return load_data_from_path(Path(name))
|
model_path = Path(name)
|
||||||
|
meta = get_package_meta(model_path)
|
||||||
|
cls = get_lang_class(meta['lang'])
|
||||||
|
nlp = cls(pipeline=meta.get('pipeline', True))
|
||||||
|
return nlp.from_disk(model_path, **overrides)
|
||||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||||
return load_data_from_path(name)
|
meta = get_package_meta(name)
|
||||||
|
cls = get_lang_class(meta['lang'])
|
||||||
|
nlp = cls(pipeline=meta.get('pipeline', True))
|
||||||
|
return nlp.from_disk(name, **overrides)
|
||||||
raise IOError("Can't find model '%s'" % name)
|
raise IOError("Can't find model '%s'" % name)
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_init_py(init_file):
|
def load_model_from_init_py(init_file, **overrides):
|
||||||
"""Helper function to use in the `load()` method of a model package's
|
"""Helper function to use in the `load()` method of a model package's
|
||||||
__init__.py.
|
__init__.py.
|
||||||
|
|
||||||
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
||||||
|
**overrides: Specific overrides, like pipeline components to disable.
|
||||||
RETURNS (Language): `Language` class with loaded model.
|
RETURNS (Language): `Language` class with loaded model.
|
||||||
"""
|
"""
|
||||||
model_path = Path(init_file).parent
|
model_path = Path(init_file).parent
|
||||||
return load_data_from_path(model_path, package=True)
|
meta = get_model_meta(model_path)
|
||||||
|
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||||
|
data_path = model_path / data_dir
|
||||||
|
if not model_path.exists():
|
||||||
|
raise ValueError("Can't find model directory: %s" % path2str(data_path))
|
||||||
|
cls = get_lang_class(meta['lang'])
|
||||||
|
nlp = cls(pipeline=meta.get('pipeline', True))
|
||||||
|
return nlp.from_disk(data_path, **overrides)
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_path(model_path):
|
def get_model_meta(path):
|
||||||
"""Import and load a model package from its file path.
|
"""Get model meta.json from a directory path and validate its contents.
|
||||||
|
|
||||||
path (unicode or Path): Path to package directory.
|
path (unicode or Path): Path to model directory.
|
||||||
RETURNS (Language): `Language` class with loaded model.
|
RETURNS (dict): The model's meta data.
|
||||||
"""
|
"""
|
||||||
model_path = ensure_path(model_path)
|
model_path = ensure_path(path)
|
||||||
spec = importlib.util.spec_from_file_location('model', model_path)
|
if not model_path.exists():
|
||||||
module = importlib.util.module_from_spec(spec)
|
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
||||||
spec.loader.exec_module(module)
|
|
||||||
return module.load()
|
|
||||||
|
|
||||||
|
|
||||||
def load_model_from_pkg(name):
|
|
||||||
"""Import and load a model package.
|
|
||||||
|
|
||||||
name (unicode): Name of model package installed via pip.
|
|
||||||
RETURNS (Language): `Language` class with loaded model.
|
|
||||||
"""
|
|
||||||
module = importlib.import_module(name)
|
|
||||||
return module.load()
|
|
||||||
|
|
||||||
|
|
||||||
def load_data_from_path(model_path, package=False):
|
|
||||||
"""Initialie a `Language` class with a loaded model from a model data path.
|
|
||||||
|
|
||||||
model_path (unicode or Path): Path to model data directory.
|
|
||||||
package (bool): Does the path point to the parent package directory?
|
|
||||||
RETURNS (Language): `Language` class with loaded model.
|
|
||||||
"""
|
|
||||||
model_path = ensure_path(model_path)
|
|
||||||
meta_path = model_path / 'meta.json'
|
meta_path = model_path / 'meta.json'
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
raise IOError("Could not read meta.json from %s" % location)
|
raise IOError("Could not read meta.json from %s" % meta_path)
|
||||||
meta = read_json(location)
|
meta = read_json(meta_path)
|
||||||
for setting in ['lang', 'name', 'version']:
|
for setting in ['lang', 'name', 'version']:
|
||||||
if setting not in meta:
|
if setting not in meta:
|
||||||
raise IOError('No %s setting found in model meta.json' % setting)
|
raise IOError('No %s setting found in model meta.json' % setting)
|
||||||
if package:
|
return meta
|
||||||
model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
|
||||||
model_path = model_path / model_data_path
|
|
||||||
if not model_path.exists():
|
|
||||||
raise ValueError("Can't find model directory: %s" % path2str(model_path))
|
|
||||||
cls = get_lang_class(meta['lang'])
|
|
||||||
nlp = cls(pipeline=meta.get('pipeline', True))
|
|
||||||
return nlp.from_disk(model_path)
|
|
||||||
|
|
||||||
|
|
||||||
def is_package(name):
|
def is_package(name):
|
||||||
|
|
|
@ -2,4 +2,6 @@
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
|
p A container class for serializing collections of #[code Doc] objects.
|
||||||
|
|
||||||
+under-construction
|
+under-construction
|
||||||
|
|
|
@ -87,7 +87,7 @@ p
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
nlp = util.load_model('en')
|
nlp = util.load_model('en')
|
||||||
nlp = util.load_model('en_core_web_sm')
|
nlp = util.load_model('en_core_web_sm', disable=['ner'])
|
||||||
nlp = util.load_model('/path/to/data')
|
nlp = util.load_model('/path/to/data')
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
|
@ -96,6 +96,11 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Package name, shortcut link or model path.
|
+cell Package name, shortcut link or model path.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **overrides]
|
||||||
|
+cell -
|
||||||
|
+cell Specific overrides, like pipeline components to disable.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
|
@ -112,8 +117,8 @@ p
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
from spacy.util import load_model_from_init_py
|
from spacy.util import load_model_from_init_py
|
||||||
|
|
||||||
def load():
|
def load(**overrides):
|
||||||
return load_model_from_init_py(__file__)
|
return load_model_from_init_py(__file__, **overrides)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -121,11 +126,37 @@ p
|
||||||
+cell unicode
|
+cell unicode
|
||||||
+cell Path to model's __init__.py, i.e. #[code __file__].
|
+cell Path to model's __init__.py, i.e. #[code __file__].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **overrides]
|
||||||
|
+cell -
|
||||||
|
+cell Specific overrides, like pipeline components to disable.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Language]
|
+cell #[code Language]
|
||||||
+cell #[code Language] class with the loaded model.
|
+cell #[code Language] class with the loaded model.
|
||||||
|
|
||||||
|
+h(2, "get_model_meta") util.get_model_meta
|
||||||
|
+tag function
|
||||||
|
+tag-new(2)
|
||||||
|
|
||||||
|
p
|
||||||
|
| Get a model's meta.json from a directory path and validate its contents.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
meta = util.get_model_meta('/path/to/model')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell Path to model directory.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell dict
|
||||||
|
+cell The model's meta data.
|
||||||
|
|
||||||
+h(2, "is_package") util.is_package
|
+h(2, "is_package") util.is_package
|
||||||
+tag function
|
+tag function
|
||||||
|
|
||||||
|
|
|
@ -107,8 +107,9 @@ p
|
||||||
assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
|
assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
|
||||||
|
|
||||||
p
|
p
|
||||||
| If the vocabulary doesn't contain a hash for "coffee", spaCy will
|
| If the vocabulary doesn't contain a string for #[code 3197928453018144401],
|
||||||
| throw an error. So you either need to add it manually, or initialise the
|
| spaCy will raise an error. You can re-add "coffee" manually, but this
|
||||||
| new #[code Doc] with the shared vocabulary. To prevent this problem,
|
| only works if you actually #[em know] that the document contains that
|
||||||
| spaCy will also export the #[code Vocab] when you save a
|
| word. To prevent this problem, spaCy will also export the #[code Vocab]
|
||||||
| #[code Doc] or #[code nlp] object.
|
| when you save a #[code Doc] or #[code nlp] object. This will give you
|
||||||
|
| the object and its encoded annotations, plus the "key" to decode it.
|
||||||
|
|
|
@ -187,13 +187,13 @@ p
|
||||||
| #[+a("/docs/usage/saving-loading#models-generating") model package] with
|
| #[+a("/docs/usage/saving-loading#models-generating") model package] with
|
||||||
| a custom pipeline.
|
| a custom pipeline.
|
||||||
|
|
||||||
+h(2, "example1") Example: Custom sentence segmentation logic
|
|
||||||
|
|
||||||
+aside("Real-world examples")
|
+aside("Real-world examples")
|
||||||
| To see real-world examples of pipeline factories and components in action,
|
| To see real-world examples of pipeline factories and components in action,
|
||||||
| you can have a look at the source of spaCy's built-in components, e.g.
|
| you can have a look at the source of spaCy's built-in components, e.g.
|
||||||
| the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or
|
| the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
|
||||||
| #[+src(gh("spacy")) entity recognizer].
|
| #[+api("entityrecognizer") #[code EntityRecognizer]].
|
||||||
|
|
||||||
|
+h(2, "example1") Example: Custom sentence segmentation logic
|
||||||
|
|
||||||
p
|
p
|
||||||
| Let's say you want to implement custom logic to improve spaCy's sentence
|
| Let's say you want to implement custom logic to improve spaCy's sentence
|
||||||
|
@ -318,8 +318,8 @@ p
|
||||||
| If you don't need a particular component of the pipeline – for
|
| If you don't need a particular component of the pipeline – for
|
||||||
| example, the tagger or the parser, you can disable loading it. This can
|
| example, the tagger or the parser, you can disable loading it. This can
|
||||||
| sometimes make a big difference and improve loading speed. Disabled
|
| sometimes make a big difference and improve loading speed. Disabled
|
||||||
| component names can be provided to #[+api("spacy#load") #[code spacy.load]],
|
| component names can be provided to #[+api("spacy#load") #[code spacy.load()]],
|
||||||
| #[+api("language#from_disk") #[code Language.from_disk]] or the
|
| #[+api("language#from_disk") #[code Language.from_disk()]] or the
|
||||||
| #[code nlp] object itself as a list:
|
| #[code nlp] object itself as a list:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user