From a7d7ea3afa776132d5f46f2f1b59a4deeda1748c Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Thu, 24 Mar 2016 11:19:43 +0100 Subject: [PATCH 1/4] first idea for supporting multiple langs in download script --- spacy/__init__.py | 9 ++++++-- spacy/about.py | 14 ++++++++++++- spacy/de/download.py | 13 ++++++++++++ spacy/download.py | 33 +++++++++++++++++++++++++++++ spacy/en/download.py | 49 ++------------------------------------------ spacy/util.py | 14 +++++++------ 6 files changed, 76 insertions(+), 56 deletions(-) create mode 100644 spacy/de/download.py create mode 100644 spacy/download.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 70e72b7a1..b09ee3491 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,8 +1,13 @@ from . import util -from .en import English +from .about import __models__ +import importlib def load(name, vectors=None, via=None): - return English( + if name not in __models__: + raise Exception('Model %s not found.' % name) + + mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') + return getattr(mod, __models__[name]['class'])( package=util.get_package_by_name(name, via=via), vectors_package=util.get_package_by_name(vectors, via=via)) diff --git a/spacy/about.py b/spacy/about.py index 3814b8d61..eed7c3f81 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -10,4 +10,16 @@ __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' -__default_model__ = 'en>=1.0.0,<1.1.0' +__models__ = { + 'en': { + 'module': 'en', + 'class': 'English', + 'package': 'en>=1.0.0,<1.1.0', + }, + 'de': { + 'module': 'de', + 'class': 'German', + 'package': 'de>=1.0.0,<1.1.0', + }, +} +__default_model__ = 'en' diff --git a/spacy/de/download.py b/spacy/de/download.py new file mode 100644 index 000000000..ba57c1d31 --- /dev/null +++ b/spacy/de/download.py @@ -0,0 +1,13 @@ +import plac +from ..download import download + + +@plac.annotations( + force=("Force overwrite", "flag", "f", bool), +) +def main(data_size='all', force=False): + download('de', force) + + +if __name__ == '__main__': + plac.call(main) diff --git a/spacy/download.py b/spacy/download.py new file mode 100644 index 000000000..537c06872 --- /dev/null +++ b/spacy/download.py @@ -0,0 +1,33 @@ +from __future__ import print_function + +import sys + +import sputnik +from sputnik.package_list import (PackageNotFoundException, + CompatiblePackageNotFoundException) + +from . import about + + +def download(lang, force=False): + if force: + sputnik.purge(about.__title__, about.__version__) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + print("Model already installed. Please run 'python -m " + "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) + sys.exit(1) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + pass + + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + + try: + sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + except (PackageNotFoundException, CompatiblePackageNotFoundException): + print("Model failed to install. Please run 'python -m " + "spacy.%s.download --force'." % lang, file=sys.stderr) + sys.exit(1) + + print("Model successfully installed.", file=sys.stderr) diff --git a/spacy/en/download.py b/spacy/en/download.py index 993b8b16d..f0c23b088 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -1,57 +1,12 @@ -from __future__ import print_function - -import sys -import os -import shutil - import plac -import sputnik -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -from .. import about - - -def migrate(path): - data_path = os.path.join(path, 'data') - if os.path.isdir(data_path): - if os.path.islink(data_path): - os.unlink(data_path) - else: - shutil.rmtree(data_path) - for filename in os.listdir(path): - if filename.endswith('.tgz'): - os.unlink(os.path.join(path, filename)) +from ..download import download @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) def main(data_size='all', force=False): - if force: - sputnik.purge(about.__title__, about.__version__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - print("Model already installed. Please run 'python -m " - "spacy.en.download --force' to reinstall.", file=sys.stderr) - sys.exit(1) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - pass - - package = sputnik.install(about.__title__, about.__version__, about.__default_model__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - print("Model failed to install. Please run 'python -m " - "spacy.en.download --force'.", file=sys.stderr) - sys.exit(1) - - # FIXME clean up old-style packages - migrate(os.path.dirname(os.path.abspath(__file__))) - - print("Model successfully installed.", file=sys.stderr) + download('en', force) if __name__ == '__main__': diff --git a/spacy/util.py b/spacy/util.py index bcc55c656..37d3b7bab 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -23,15 +23,17 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): try: return sputnik.package(about.__title__, about.__version__, - name or about.__default_model__, data_path=via) + name or about.__models__[about.__default_model__]['package'], + data_path=via) except PackageNotFoundException as e: raise RuntimeError("Model %s not installed. Please run 'python -m " - "spacy.en.download' to install latest compatible " - "model." % name) + "spacy.%s.download' to install latest compatible " + "model." % (name, about.__models__[name]['module'])) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model is not compatible with spaCy " - "version. Please run 'python -m spacy.en.download " - "--force' to install latest compatible model.") + raise RuntimeError("Installed model %s is not compatible with spaCy " + "version. Please run 'python -m spacy.%s.download " + "--force' to install latest compatible model." % + (name, about.__models__[name]['module'])) def normalize_slice(length, start, stop, step=None): From b8f63071eb1a8a1523ca91819485a350afd83c14 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:54:45 +0100 Subject: [PATCH 2/4] add lang registration facility --- spacy/__init__.py | 21 ++++++++++++--------- spacy/about.py | 14 +++----------- spacy/download.py | 6 +++--- spacy/tokenizer.pyx | 3 +-- spacy/util.py | 32 ++++++++++++++++++++++++-------- spacy/vocab.pyx | 1 - 6 files changed, 43 insertions(+), 34 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index b09ee3491..f47926a63 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,13 +1,16 @@ from . import util -from .about import __models__ -import importlib + +from .en import English +from .de import German +from . import util + + +util.register_lang(English.lang, English) +util.register_lang(German.lang, German) def load(name, vectors=None, via=None): - if name not in __models__: - raise Exception('Model %s not found.' % name) - - mod = importlib.import_module('.%s' % __models__[name]['module'], 'spacy') - return getattr(mod, __models__[name]['class'])( - package=util.get_package_by_name(name, via=via), - vectors_package=util.get_package_by_name(vectors, via=via)) + package = util.get_package_by_name(name, via=via) + vectors_package = util.get_package_by_name(vectors, via=via) + cls = util.get_lang(name) + return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/about.py b/spacy/about.py index eed7c3f81..7f889cad8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -11,15 +11,7 @@ __author__ = 'Matthew Honnibal' __email__ = 'matt@spacy.io' __license__ = 'MIT' __models__ = { - 'en': { - 'module': 'en', - 'class': 'English', - 'package': 'en>=1.0.0,<1.1.0', - }, - 'de': { - 'module': 'de', - 'class': 'German', - 'package': 'de>=1.0.0,<1.1.0', - }, + 'en': 'en>=1.0.0,<1.1.0', + 'de': 'de>=1.0.0,<1.1.0', } -__default_model__ = 'en' +__default_lang__ = 'en' diff --git a/spacy/download.py b/spacy/download.py index 537c06872..f7fc798ae 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -14,17 +14,17 @@ def download(lang, force=False): sputnik.purge(about.__title__, about.__version__) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) print("Model already installed. Please run 'python -m " "spacy.%s.download --force' to reinstall." % lang, file=sys.stderr) sys.exit(1) except (PackageNotFoundException, CompatiblePackageNotFoundException): pass - package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]['package']) + package = sputnik.install(about.__title__, about.__version__, about.__models__[lang]) try: - sputnik.package(about.__title__, about.__version__, about.__models__[lang]['package']) + sputnik.package(about.__title__, about.__version__, about.__models__[lang]) except (PackageNotFoundException, CompatiblePackageNotFoundException): print("Model failed to install. Please run 'python -m " "spacy.%s.download --force'." % lang, file=sys.stderr) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f8613fce8..44d627505 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -16,8 +16,7 @@ cimport cython from . import util from .tokens.doc cimport Doc -from .util import read_lang_data -from .util import get_package +from .util import read_lang_data, get_package cdef class Tokenizer: diff --git a/spacy/util.py b/spacy/util.py index 37d3b7bab..4eda2d0e4 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -14,6 +14,21 @@ from . import about from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE +LANGUAGES = {} + + +def register_lang(name, cls): + global LANGUAGES + LANGUAGES[name] = cls + + +def get_lang(name): + lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] + if lang not in LANGUAGES: + raise RuntimeError('Language not supported: %s' % lang) + return LANGUAGES[lang] + + def get_package(data_dir): if not isinstance(data_dir, six.string_types): raise RuntimeError('data_dir must be a string') @@ -21,19 +36,20 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): + package_name = name or about.__models__[about.__default_lang__] + lang = get_lang(package_name) try: return sputnik.package(about.__title__, about.__version__, - name or about.__models__[about.__default_model__]['package'], - data_path=via) + package_name, data_path=via) except PackageNotFoundException as e: - raise RuntimeError("Model %s not installed. Please run 'python -m " - "spacy.%s.download' to install latest compatible " - "model." % (name, about.__models__[name]['module'])) + raise RuntimeError("Model '%s' not installed. Please run 'python -m " + "%s.download' to install latest compatible " + "model." % (name, lang.__module__)) except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model %s is not compatible with spaCy " - "version. Please run 'python -m spacy.%s.download " + raise RuntimeError("Installed model is not compatible with spaCy " + "version. Please run 'python -m %s.download " "--force' to install latest compatible model." % - (name, about.__models__[name]['module'])) + (lang.__module__)) def normalize_slice(length, start, stop, step=None): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index f876bfefb..3712a7383 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -25,7 +25,6 @@ from . import attrs from . import symbols from cymem.cymem cimport Address -from . import util from .serialize.packer cimport Packer from .attrs cimport PROB From db095a162c12d4e68b11543e16ba5a9c47881d23 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 25 Mar 2016 18:59:47 +0100 Subject: [PATCH 3/4] fix --- spacy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index f47926a63..d01bb11f3 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -2,7 +2,6 @@ from . import util from .en import English from .de import German -from . import util util.register_lang(English.lang, English) From c90d4a6f17aa2940b744863c2491f23637fe0c24 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Sat, 26 Mar 2016 11:44:53 +0100 Subject: [PATCH 4/4] relative imports in __init__.py --- spacy/__init__.py | 12 ++++++------ spacy/util.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index d01bb11f3..676659fdd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,15 +1,15 @@ -from . import util +from .util import set_lang_class, get_lang_class, get_package, get_package_by_name from .en import English from .de import German -util.register_lang(English.lang, English) -util.register_lang(German.lang, German) +set_lang_class(English.lang, English) +set_lang_class(German.lang, German) def load(name, vectors=None, via=None): - package = util.get_package_by_name(name, via=via) - vectors_package = util.get_package_by_name(vectors, via=via) - cls = util.get_lang(name) + package = get_package_by_name(name, via=via) + vectors_package = get_package_by_name(vectors, via=via) + cls = get_lang_class(name) return cls(package=package, vectors_package=vectors_package) diff --git a/spacy/util.py b/spacy/util.py index 4eda2d0e4..b1e93d08b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -17,12 +17,12 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE LANGUAGES = {} -def register_lang(name, cls): +def set_lang_class(name, cls): global LANGUAGES LANGUAGES[name] = cls -def get_lang(name): +def get_lang_class(name): lang = re.split('[^a-zA-Z0-9_]', name, 1)[0] if lang not in LANGUAGES: raise RuntimeError('Language not supported: %s' % lang) @@ -37,7 +37,7 @@ def get_package(data_dir): def get_package_by_name(name=None, via=None): package_name = name or about.__models__[about.__default_lang__] - lang = get_lang(package_name) + lang = get_lang_class(package_name) try: return sputnik.package(about.__title__, about.__version__, package_name, data_path=via)