Merge branch 'develop' of https://github.com/explosion/spaCy into develop
Commit: c044e9c21c
@@ -9,7 +9,7 @@ from . import util
 
 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
-    return util.load_model(name)
+    return util.load_model(name, **overrides)
 
 
 def info(model=None, markdown=False):
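Note on the hunk above: keyword arguments passed to spacy.load() are now forwarded to util.load_model() unchanged. A minimal usage sketch, assuming the 'en' shortcut link or model is installed:

    import spacy

    # Overrides such as `disable` now reach util.load_model() via spacy.load().
    nlp = spacy.load('en', disable=['parser', 'ner'])
    doc = nlp(u'This is a sentence.')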
@@ -14,4 +14,4 @@ __docs_models__ = 'https://spacy.io/docs/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json'
-__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
+__model_files__ = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/v2/templates/model/'
@@ -36,7 +36,7 @@ def package(cmd, input_dir, output_dir, meta=None, force=False):
 
     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
-    template_init = get_template('en_model_name/__init__.py')
+    template_init = get_template('xx_model_name/__init__.py')
     meta_path = meta_path or input_path / 'meta.json'
     if meta_path.is_file():
         prints(meta_path, title="Reading meta.json from file")
@@ -85,10 +85,11 @@ def ensure_path(path):
     return path
 
 
-def load_model(name):
+def load_model(name, **overrides):
     """Load a model from a shortcut link, package or data path.
 
     name (unicode): Package name, shortcut link or model path.
+    **overrides: Specific overrides, like pipeline components to disable.
     RETURNS (Language): `Language` class with the loaded model.
     """
     data_path = get_data_path()
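Per the updated docstring, the same overrides can be passed however the model is named. A rough sketch of the intended call shapes, using a hypothetical installed package and model directory:

    from pathlib import Path
    from spacy import util

    nlp = util.load_model('en_core_web_sm', disable=['ner'])  # installed package or shortcut link
    nlp = util.load_model(Path('/path/to/model'))             # Path-like pointing to a model data directory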
@@ -96,73 +97,63 @@ def load_model(name):
         raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
     if isinstance(name, basestring_):
         if (data_path / name).exists(): # in data dir or shortcut
-            return load_model_from_path(data_path / name)
+            spec = importlib.util.spec_from_file_location('model', data_path / name)
+            cls = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(cls)
+            return cls.load(**overrides)
         if is_package(name): # installed as package
-            return load_model_from_pkg(name)
+            cls = importlib.import_module(name)
+            return cls.load(**overrides)
         if Path(name).exists(): # path to model data directory
-            return load_data_from_path(Path(name))
+            model_path = Path(name)
+            meta = get_package_meta(model_path)
+            cls = get_lang_class(meta['lang'])
+            nlp = cls(pipeline=meta.get('pipeline', True))
+            return nlp.from_disk(model_path, **overrides)
     elif hasattr(name, 'exists'): # Path or Path-like to model data
-        return load_data_from_path(name)
+        meta = get_package_meta(name)
+        cls = get_lang_class(meta['lang'])
+        nlp = cls(pipeline=meta.get('pipeline', True))
+        return nlp.from_disk(name, **overrides)
     raise IOError("Can't find model '%s'" % name)
 
 
-def load_model_from_init_py(init_file):
+def load_model_from_init_py(init_file, **overrides):
     """Helper function to use in the `load()` method of a model package's
     __init__.py.
 
     init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+    **overrides: Specific overrides, like pipeline components to disable.
     RETURNS (Language): `Language` class with loaded model.
     """
     model_path = Path(init_file).parent
-    return load_data_from_path(model_path, package=True)
+    meta = get_model_meta(model_path)
+    data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
+    data_path = model_path / data_dir
+    if not model_path.exists():
+        raise ValueError("Can't find model directory: %s" % path2str(data_path))
+    cls = get_lang_class(meta['lang'])
+    nlp = cls(pipeline=meta.get('pipeline', True))
+    return nlp.from_disk(data_path, **overrides)
 
 
-def load_model_from_path(model_path):
-    """Import and load a model package from its file path.
+def get_model_meta(path):
+    """Get model meta.json from a directory path and validate its contents.
 
-    path (unicode or Path): Path to package directory.
-    RETURNS (Language): `Language` class with loaded model.
+    path (unicode or Path): Path to model directory.
+    RETURNS (dict): The model's meta data.
     """
-    model_path = ensure_path(model_path)
-    spec = importlib.util.spec_from_file_location('model', model_path)
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module.load()
-
-
-def load_model_from_pkg(name):
-    """Import and load a model package.
-
-    name (unicode): Name of model package installed via pip.
-    RETURNS (Language): `Language` class with loaded model.
-    """
-    module = importlib.import_module(name)
-    return module.load()
-
-
-def load_data_from_path(model_path, package=False):
-    """Initialie a `Language` class with a loaded model from a model data path.
-
-    model_path (unicode or Path): Path to model data directory.
-    package (bool): Does the path point to the parent package directory?
-    RETURNS (Language): `Language` class with loaded model.
-    """
-    model_path = ensure_path(model_path)
+    model_path = ensure_path(path)
     if not model_path.exists():
         raise ValueError("Can't find model directory: %s" % path2str(model_path))
     meta_path = model_path / 'meta.json'
     if not meta_path.is_file():
-        raise IOError("Could not read meta.json from %s" % location)
-    meta = read_json(location)
+        raise IOError("Could not read meta.json from %s" % meta_path)
+    meta = read_json(meta_path)
     for setting in ['lang', 'name', 'version']:
         if setting not in meta:
             raise IOError('No %s setting found in model meta.json' % setting)
-    if package:
-        model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
-        model_path = model_path / model_data_path
-        if not model_path.exists():
-            raise ValueError("Can't find model directory: %s" % path2str(model_path))
-    cls = get_lang_class(meta['lang'])
-    nlp = cls(pipeline=meta.get('pipeline', True))
-    return nlp.from_disk(model_path)
+    return meta
 
 
 def is_package(name):
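load_model_from_init_py(__file__, **overrides) is intended to be called from a model package's own __init__.py, matching the docs change further down. A sketch of such a file, assuming the layout produced by the xx_model_name template referenced above:

    # coding: utf8
    from __future__ import unicode_literals

    from spacy.util import load_model_from_init_py


    def load(**overrides):
        # Reads meta.json next to this file, resolves the lang_name-version data
        # directory and forwards overrides (e.g. disable=['ner']) to the model.
        return load_model_from_init_py(__file__, **overrides)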
@@ -2,4 +2,6 @@
 
 include ../../_includes/_mixins
 
 p A container class for serializing collections of #[code Doc] objects.
+
++under-construction
@@ -87,7 +87,7 @@ p
 
 +aside-code("Example").
     nlp = util.load_model('en')
-    nlp = util.load_model('en_core_web_sm')
+    nlp = util.load_model('en_core_web_sm', disable=['ner'])
     nlp = util.load_model('/path/to/data')
 
 +table(["Name", "Type", "Description"])
@@ -96,6 +96,11 @@ p
         +cell unicode
         +cell Package name, shortcut link or model path.
 
+    +row
+        +cell #[code **overrides]
+        +cell -
+        +cell Specific overrides, like pipeline components to disable.
+
     +footrow
         +cell returns
         +cell #[code Language]
@@ -112,8 +117,8 @@ p
 +aside-code("Example").
     from spacy.util import load_model_from_init_py
 
-    def load():
-        return load_model_from_init_py(__file__)
+    def load(**overrides):
+        return load_model_from_init_py(__file__, **overrides)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -121,11 +126,37 @@ p
         +cell unicode
         +cell Path to model's __init__.py, i.e. #[code __file__].
 
+    +row
+        +cell #[code **overrides]
+        +cell -
+        +cell Specific overrides, like pipeline components to disable.
+
     +footrow
         +cell returns
         +cell #[code Language]
         +cell #[code Language] class with the loaded model.
 
++h(2, "get_model_meta") util.get_model_meta
+    +tag function
+    +tag-new(2)
+
+p
+    | Get a model's meta.json from a directory path and validate its contents.
+
++aside-code("Example").
+    meta = util.get_model_meta('/path/to/model')
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell unicode or #[code Path]
+        +cell Path to model directory.
+
+    +footrow
+        +cell returns
+        +cell dict
+        +cell The model's meta data.
+
 +h(2, "is_package") util.is_package
     +tag function
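The documented behaviour matches the implementation above: get_model_meta() reads meta.json from the given directory and raises an IOError if 'lang', 'name' or 'version' is missing. A small sketch with a hypothetical path:

    from spacy import util

    meta = util.get_model_meta('/path/to/model')
    print(meta['lang'], meta['name'], meta['version'])  # keys validated by get_model_meta()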
@@ -107,8 +107,9 @@ p
     assert doc.vocab.strings[3197928453018144401] == u'coffee' # 👍
 
 p
-    | If the vocabulary doesn't contain a hash for "coffee", spaCy will
-    | throw an error. So you either need to add it manually, or initialise the
-    | new #[code Doc] with the shared vocabulary. To prevent this problem,
-    | spaCy will also export the #[code Vocab] when you save a
-    | #[code Doc] or #[code nlp] object.
+    | If the vocabulary doesn't contain a string for #[code 3197928453018144401],
+    | spaCy will raise an error. You can re-add "coffee" manually, but this
+    | only works if you actually #[em know] that the document contains that
+    | word. To prevent this problem, spaCy will also export the #[code Vocab]
+    | when you save a #[code Doc] or #[code nlp] object. This will give you
+    | the object and its encoded annotations, plus the "key" to decode it.
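A short sketch of the behaviour described in the rewritten paragraph, assuming an installed English model; the hash is the one quoted in the example line above:

    import spacy
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    nlp = spacy.load('en')
    doc = nlp(u'I love coffee')
    coffee_hash = nlp.vocab.strings[u'coffee']  # e.g. 3197928453018144401

    empty_doc = Doc(Vocab())                    # new Doc with an empty Vocab
    # empty_doc.vocab.strings[coffee_hash]      # fails: no string stored for that key
    empty_doc.vocab.strings.add(u'coffee')      # either re-add the string manually...
    new_doc = Doc(nlp.vocab)                    # ...or create the Doc with the shared vocab
    assert new_doc.vocab.strings[coffee_hash] == u'coffee'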
@@ -187,13 +187,13 @@ p
     | #[+a("/docs/usage/saving-loading#models-generating") model package] with
     | a custom pipeline.
 
-+h(2, "example1") Example: Custom sentence segmentation logic
-
 +aside("Real-world examples")
     | To see real-world examples of pipeline factories and components in action,
     | you can have a look at the source of spaCy's built-in components, e.g.
-    | the #[+src(gh("spacy")) tagger], #[+src(gh("spacy")) parser] or
-    | #[+src(gh("spacy")) entity recognizer].
+    | the #[+api("tagger") #[code Tagger]], #[+api("parser") #[code Parser]] or
+    | #[+api("entityrecognizer") #[code EntityRecognizer]].
+
++h(2, "example1") Example: Custom sentence segmentation logic
 
 p
     | Let's say you want to implement custom logic to improve spaCy's sentence
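The sentence segmentation example that the relocated heading introduces comes down to a callable that takes a Doc, sets token attributes and returns it; how the component is registered is covered by the guide itself. A rough sketch of that shape:

    def sbd_component(doc):
        # Mark the token after every period as the start of a new sentence.
        for i, token in enumerate(doc[:-1]):
            if token.text == '.':
                doc[i + 1].sent_start = True
        return doc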
@@ -318,8 +318,8 @@ p
     | If you don't need a particular component of the pipeline – for
     | example, the tagger or the parser, you can disable loading it. This can
     | sometimes make a big difference and improve loading speed. Disabled
-    | component names can be provided to #[+api("spacy#load") #[code spacy.load]],
-    | #[+api("language#from_disk") #[code Language.from_disk]] or the
+    | component names can be provided to #[+api("spacy#load") #[code spacy.load()]],
+    | #[+api("language#from_disk") #[code Language.from_disk()]] or the
     | #[code nlp] object itself as a list:
 
 +code.