diff --git a/MANIFEST.in b/MANIFEST.in index e15d9de6d..697748835 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,3 @@ recursive-include include *.h -include buildbot.json include LICENSE include README.rst diff --git a/buildbot.json b/buildbot.json deleted file mode 100644 index 6dc8aa286..000000000 --- a/buildbot.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "build": { - "sdist": [ - "pip install -r requirements.txt", - "pip install \"numpy<1.8\"", - "python setup.py sdist" - ], - "install": [ - "pip install -v source.tar.gz" - ], - "wheel": [ - "python untar.py source.tar.gz .", - "python setup.py bdist_wheel", - "python cpdist.py dist" - ] - }, - "test": { - "after": ["install", "wheel"], - "run": [ - "python -m spacy.en.download --force" - ], - "package": "spacy", - "args": "--tb=native -x --models --vectors --slow" - } -} diff --git a/spacy/__init__.py b/spacy/__init__.py index 0b76f1f9e..5ccaf3056 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals, print_function import json from pathlib import Path from .util import set_lang_class, get_lang_class, parse_package_meta +from .deprecated import resolve_model_name from . import en from . 
import de @@ -35,13 +36,15 @@ set_lang_class(bn.Bengali.lang, bn.Bengali) def load(name, **overrides): data_path = overrides.get('path', util.get_data_path()) - meta = parse_package_meta(data_path, name, require=False) + model_name = resolve_model_name(name) + meta = parse_package_meta(data_path, model_name, require=False) lang = meta['lang'] if meta and 'lang' in meta else name cls = get_lang_class(lang) overrides['meta'] = meta - model_path = Path(data_path) / name + model_path = Path(data_path / model_name) if model_path.exists(): overrides['path'] = model_path + return cls(**overrides) diff --git a/spacy/about.py b/spacy/about.py index b2ceacf54..698e4e9ce 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -12,5 +12,5 @@ __license__ = 'MIT' __docs__ = 'https://spacy.io/docs/usage' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' -__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json?token=ANAt54fi5zcUtnwGhMLw2klWwcAyHkZGks5Y0nw1wA%3D%3D' +__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' __shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_web_md', 'vectors': 'en_vectors_glove_md'} diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 72327c584..30be24942 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -2,6 +2,7 @@ from pathlib import Path from . import about from . import util from .download import download +from .link import link try: @@ -86,6 +87,35 @@ def fix_glove_vectors_loading(overrides): return overrides +def resolve_model_name(name): + """If spaCy is loaded with 'de', check if symlink already exists. If + not, user have upgraded from older version and have old models installed. + Check if old model directory exists and if so, return that instead and create + shortcut link. If English model is found and no shortcut exists, raise error + and tell user to install new model. 
+ """ + + if name == 'en' or name == 'de': + versions = ['1.0.0', '1.1.0'] + data_path = Path(util.get_data_path()) + model_path = data_path / name + v_model_paths = [data_path / Path(name + '-' + v) for v in versions] + + if not model_path.exists(): # no shortcut found + for v_path in v_model_paths: + if v_path.exists(): # versioned model directory found + if name == 'de': + link(v_path, name) + return name + else: + raise ValueError( + "Found English model at {p}. This model is not " + "compatible with the current version. See " + "https://spacy.io/docs/usage/models to download the " + "new model.".format(p=v_path)) + return name + + class ModelDownload(): """Replace download modules within en and de with deprecation warning and download default language model (using shortcut). Use classmethods to allow diff --git a/spacy/download.py b/spacy/download.py index 1f8f701ff..f7ece25e9 100644 --- a/spacy/download.py +++ b/spacy/download.py @@ -28,6 +28,7 @@ def download(model=None, direct=False): download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) link(model_name, model, force=True) + def get_compatibility(): version = about.__version__ r = requests.get(about.__compatibility__) diff --git a/spacy/tests/test_download.py b/spacy/tests/test_download.py index 8d67364ea..728cacc41 100644 --- a/spacy/tests/test_download.py +++ b/spacy/tests/test_download.py @@ -5,6 +5,7 @@ from ..download import download, get_compatibility, get_version, check_error_dep import pytest +@pytest.mark.slow def test_download_fetch_compatibility(): compatibility = get_compatibility() assert type(compatibility) == dict diff --git a/website/_harp.json b/website/_harp.json index 9548cadcf..d26851804 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,10 +12,10 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.6", + "SPACY_VERSION": "1.7", "LATEST_NEWS": { - "url": 
"https://explosion.ai/blog/deep-learning-formula-nlp", - "title": "The new deep learning formula for state-of-the-art NLP models" + "url": "/docs/usage/models", + "title": "Downloading and installing models as packages" }, "SOCIAL": { @@ -54,8 +54,8 @@ } }, - "V_CSS": "1.15", - "V_JS": "1.1", + "V_CSS": "1.2", + "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", "MAILCHIMP": { diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 8a42024c1..ba5a9297e 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -67,6 +67,17 @@ mixin aside-code(label, language) block +//- Infobox + label - [string] infobox title (optional or false for no title) + +mixin infobox(label) + aside.o-box.o-block.u-text-small + if label + h3.u-text-label.u-color-theme=label + + block + + //- Link button url - [string] link href trusted - [boolean] if not set / false, rel="noopener nofollow" is added diff --git a/website/assets/css/_base/_layout.sass b/website/assets/css/_base/_layout.sass index bb5e82e62..3f680bdc2 100644 --- a/website/assets/css/_base/_layout.sass +++ b/website/assets/css/_base/_layout.sass @@ -22,7 +22,10 @@ p //- Links -main p a, main table a, main > *:not(footer) li a, .c-aside a +main p a, +main table a, +main > *:not(footer) li a, +main aside a @extend .u-link diff --git a/website/assets/css/_base/_objects.sass b/website/assets/css/_base/_objects.sass index 1be4b17d5..abd5453f4 100644 --- a/website/assets/css/_base/_objects.sass +++ b/website/assets/css/_base/_objects.sass @@ -62,6 +62,15 @@ border: 1px solid $color-subtle padding: 3rem 2.5% + +//- Box + +.o-box + background: $color-theme-light + padding: 2rem + border: 1px solid darken($color-theme-light, 5) + + //- Icons .o-icon diff --git a/website/assets/css/_components/_buttons.sass b/website/assets/css/_components/_buttons.sass index 647723380..f753e15bf 100644 --- a/website/assets/css/_components/_buttons.sass +++ 
b/website/assets/css/_components/_buttons.sass @@ -4,6 +4,7 @@ display: inline-block font-weight: bold padding: 0.75em 1em + margin-bottom: 1px border: 2px solid border-radius: 2px text-align: center diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 9029161e2..bfef915be 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -34,6 +34,7 @@ $color-dark: lighten($color-front, 20) !default $color-theme: map-get($colors, $theme) $color-theme-dark: darken(map-get($colors, $theme), 5) +$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 15) $color-subtle: #ddd !default $color-subtle-light: #f6f6f6 !default diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 4621ec8c2..436d14abe 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -2,6 +2,7 @@ "sidebar": { "Get started": { "Installation": "./", + "Models": "models", "Lightning tour": "lightning-tour", "Resources": "resources" }, @@ -28,6 +29,11 @@ "index": { "title": "Install spaCy", + "next": "models" + }, + + "models": { + "title": "Models", "next": "lightning-tour" }, diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 479635e4b..b894bedde 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -12,6 +12,16 @@ p | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X] | and #[a(href="#source-windows") Windows] for details. ++aside("Download models") + | After installation you need to download a language model. For more info + | and available models, see the #[+a("/docs/usage/models") docs on models]. + + +code.o-no-block. + python -m spacy.download en + + >>> import spacy + >>> nlp = spacy.load('en') + +h(2, "pip") pip p Using pip, spaCy releases are currently only available as source packages. @@ -43,64 +53,6 @@ p | #[+a("https://github.com/conda-forge/spacy-feedstock") this repository]. 
| Improvements and pull requests to the recipe and setup are always appreciated. -+h(2, "models") Download models - -p - | After installation you need to download a language model. Models for - | English (#[code en]) and German (#[code de]) are available. - -+code(false, "bash"). - python -m spacy.en.download all - python -m spacy.de.download all - -+aside-code("Examples", "bash"). - # Install English tagger, parser and NER - python -m spacy.en.download parser - - # Install English GloVe vectors - python -m spacy.en.download glove - - # Upgrade/overwrite existing data - python -m spacy.en.download --force - - # Check whether the model was successfully installed - python -c "import spacy; spacy.load('en'); print('OK')" - -p - | The download command fetches about 1 GB of data which it - | installs within the #[code spacy] package directory. - -+h(3, "custom-location") Download model to custom location - -p - | You can specify where #[code spacy.en.download] and - | #[code spacy.de.download] download the language model to using the - | #[code --data-path] or #[code -d] argument: - -+code(false, "bash"). - python -m spacy.en.download all --data-path /some/dir - -p - | If you choose to download to a custom location, you will need to tell - | spaCy where to load the model from in order to use it. You can do this - | either by calling #[code spacy.util.set_data_path()] before calling - | #[code spacy.load()], or by passing a #[code path] argument to the - | #[code spacy.en.English] or #[code spacy.de.German] constructors. - -+h(3, "models-manual") Download models manually - -p - | As of v1.6, the models and word vectors are also available as direct - | downloads from GitHub, attached to the #[+a(gh("spaCy") + "/releases") releases] as #[code .tar.gz] archives. - -p - | To install the models manually, first find the default data path. 
You can - | use #[code spacy.util.get_data_path()] to find the directory where spaCy - | will look for its models, or change the default data path with - | #[code spacy.util.set_data_path()]. Then simply unpack the archive and - | place the contained folder in that directory. You can now load the models - | via #[code spacy.load()]. - +h(2, "source") Compile from source p diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index cb08bc045..ba0adaa6e 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -6,6 +6,15 @@ p | The following examples and code snippets give you an overview of spaCy's | functionality and its usage. ++h(2, "models") Install and load models + ++code(false, "bash"). + python -m spacy.download en + ++code. + import spacy + nlp = spacy.load('en') + +h(2, "examples-resources") Load resources and process text +code. diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade new file mode 100644 index 000000000..3b1bb5f7d --- /dev/null +++ b/website/docs/usage/models.jade @@ -0,0 +1,285 @@ +//- 💫 DOCS > USAGE > MODELS + +include ../../_includes/_mixins + +p + | As of v1.7.0, models for spaCy can be installed as #[strong Python packages]. + | This means that they're a component of your application, just like any + | other module. They're versioned and can be defined as a dependency in your + | #[code requirements.txt]. Models can be installed from a download URL or + | a local directory, manually or via #[+a("https://pypi.python.org/pypi/pip") pip]. + | Their data can be located anywhere on your file system. To make a model + | available to spaCy, all you need to do is create a "shortcut link", an + | internal alias that tells spaCy where to find the data files for a specific + | model name. + ++infobox("Important note") + | Due to improvements in the English lemmatizer in v1.7.0, you need to download the + | new English model. 
The German model is still compatible and will be + | recognised and linked automatically. + ++aside-code("Quickstart"). + # Install spaCy and download English model + pip install spacy + python -m spacy.download en + + # Usage in Python + import spacy + nlp = spacy.load('en') + doc = nlp(u'This is a sentence.') + ++h(2, "available") Available models + ++table(["Name", "Size", "Description"]) + +row + +cell #[code en_core_web_sm] + +cell 50 MB + +cell Vocab, syntax, entities, word vectors #[+tag default] + + +row + +cell #[code en_core_web_md] + +cell 1 GB + +cell Vocab, syntax, entities, word vectors + + +row + +cell #[code en_depent_web_md] + +cell 328 MB + +cell Vocab, syntax, entities + + +row + +cell #[code en_vectors_glove_md] + +cell 727 MB + +cell + | #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] Common + | Crawl vectors + + +row + +cell #[code de_core_news_md] + +cell 645 MB + +cell Vocab, syntax, entities, word vectors #[+tag default] + +p + | Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub], + | attached to individual releases. They can be downloaded and loaded manually, + | or using spaCy's #[code download] and #[code link] commands. All models + | follow the naming convention of #[code [language]_[type]_[genre]_[size]]. + ++button(gh("spacy-models") + "/releases", true, "primary") View models + ++h(2, "download") Downloading models + ++aside("Downloading models in spaCy < v1.7") + | In older versions of spaCy, you can still use the old download commands. + | This will download and install the models into the #[code spacy/data] + | directory. + + +code.o-no-block. + python -m spacy.en.download all + python -m spacy.de.download all + python -m spacy.en.download glove + + | The old models are also #[+a(gh("spacy") + "/tree/v1.6.0") attached to the v1.6.0 release]. 
+ | To download and install them manually, unpack the archive, drop the + | contained directory into #[code spacy/data] and load the model via + | #[code spacy.load('en')] or #[code spacy.load('de')]. + +p + | The easiest way to download a model is via spaCy's #[code download] + | command. It takes care of finding the best-matching model compatible with + | your spaCy installation. + ++code(false, "bash"). + # out-of-the-box: download best-matching default model + python -m spacy.download en + python -m spacy.download de + + # download best-matching version of specific model for your spaCy installation + python -m spacy.download en_core_web_md + + # download exact model version (doesn't create shortcut link) + python -m spacy.download en_core_web_md-1.2.0 --direct + +p + | The download command will #[+a("#download-pip") install the model] via + | pip, place the package in your #[code site-packages] directory and create + | a #[+a("#usage") shortcut link] that lets you load the model by name. The + | shortcut link will be the same as the model name used in + | #[code spacy.download]. + ++code(false, "bash"). + pip install spacy + python -m spacy.download en + ++code. + import spacy + nlp = spacy.load('en') + doc = nlp(u'This is a sentence.') + ++h(3, "download-pip") Installation via pip + +p + | To download a model directly using #[+a("https://pypi.python.org/pypi/pip") pip], + | simply point #[code pip install] to the URL or local path of the archive + | file. To find the direct link to a model, head over to the + | #[+a(gh("spacy-models") + "/releases") model releases], right click on the archive + | link and copy it to your clipboard. + ++code(false, "bash"). + # with external URL + pip install #{gh("spacy-models")}/releases/download/en_core_web_md-1.2.0/en_core_web_md-1.2.0.tar.gz + + # with local file + pip install /Users/you/en_core_web_md-1.2.0.tar.gz + +p + | By default, this will install the model into your #[code site-packages] + | directory. 
You can then create a #[+a("#usage") shortcut link] for your + | model to load it via #[code spacy.load()], or #[+a("#usage-import") import it] + | as a Python module. + +h(3, "download-manual") Manual download and installation + +p + | In some cases, you might prefer downloading the data manually, for + | example to place it into a custom directory. You can download the model + | via your browser from the #[+a(gh("spacy-models")) latest releases], or configure + | your own download script using the URL of the archive file. The archive + | consists of a model directory that contains another directory with the + | model data. + +code("Directory structure", "yaml"). + └── en_core_web_md-1.2.0.tar.gz # downloaded archive + ├── meta.json # model meta data + ├── setup.py # setup file for pip installation + └── en_core_web_md # model directory + ├── __init__.py # init for pip installation + ├── meta.json # model meta data + └── en_core_web_md-1.2.0 # model data + +p + | You can place the model data directory anywhere on your local file system. + | To use it with spaCy, simply assign it a name by creating a + | #[+a("#usage") shortcut link] for the data directory. + +h(2, "usage") Using models with spaCy + +p + | While previous versions of spaCy required you to maintain a data directory + | containing the models for each installation, you can now choose how and + | where you want to keep your data files. To load the models conveniently + | from within spaCy, you can use the #[code spacy.link] command to create a + | symlink. This lets you set up custom shortcut links for models so you can + | load them by name. + +code(false, "bash"). + python -m spacy.link [package name or path] [shortcut] [--force] + +p + | The first argument is the package name (if the model was installed via + | pip), or a local path to the data directory. The second argument is + | the internal name you want to use for the model. 
Setting the #[code --force] + | flag will overwrite any existing links. + +code("Examples", "bash"). + # set up shortcut link to load installed package as "en_default" + python -m spacy.link en_core_web_md en_default + + # set up shortcut link to load local model as "my_amazing_model" + python -m spacy.link /Users/you/model my_amazing_model + +h(3, "usage-loading") Loading models + +p + | To load a model, use #[code spacy.load()] with the model's shortcut link. + +code. + import spacy + nlp = spacy.load('en_default') + doc = nlp(u'This is a sentence.') + +p + | You can also use the #[code info()] method to print a model's meta data + | before loading it. Each #[code Language] object returned by #[code spacy.load()] + | also exposes the model's meta data as the attribute #[code meta]: + +code. + import spacy + spacy.info('en_default') + # JSON-formatted model meta data + + nlp = spacy.load('en_default') + print(nlp.meta['version']) + # 1.2.0 + +h(3, "usage-import") Importing models as modules + +p + | If you've installed a model via pip, you can also #[code import] it + | directly and then call its #[code load()] method with no arguments: + +code. + import spacy + import en_core_web_md + + nlp = en_core_web_md.load() + doc = nlp(u'This is a sentence.') + +h(2, "own-models") Using your own models + +p + | If you've trained your own model, for example for + | #[+a("/docs/usage/adding-languages") additional languages], you can + | create a shortcut link for it by pointing #[code spacy.link] to the + | model's data directory. To allow your model to be downloaded and + | installed via pip, you'll also need to generate a package for it. + +infobox("Important note") + | The model packages are #[strong not suitable] for the public + | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not + | designed for binary data and files over 50 MB. 
However, if your company + | is running an internal installation of pypi, publishing your models on + | there can be a convenient solution to share them with your team. + +p The model directory should look like this: + +code("Directory structure", "yaml"). + └── / + ├── MANIFEST.in # to include meta.json + ├── meta.json # model meta data + ├── setup.py # setup file for pip installation + └── en_core_web_md # model directory + ├── __init__.py # init for pip installation + └── en_core_web_md-1.2.0 # model data + +p + | You can find templates for all files in our + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. + | Unless you want to customise installation and loading, the only file + | you'll need to modify is #[code meta.json], which includes the model's + | meta data. It will later be copied into the package and data directory. + +code("meta.json", "json"). + { + "name": "core_web_md", + "lang": "en", + "version": "1.2.0", + "spacy_version": "1.7.0", + "description": "English model for spaCy", + "author": "Explosion AI", + "email": "contact@explosion.ai", + "license": "MIT" + } + +p + | Keep in mind that the directories need to be named according to the + | naming conventions. The #[code lang] setting is also used to create the + | respective #[code Language] class in spaCy, which will later be returned + | by the model's #[code load()] method. + +p + | To generate the package, run the following command from within the + | directory. This will create a #[code .tar.gz] archive in a directory + | #[code /dist]. + +code(false, "bash"). 
+ python setup.py sdist diff --git a/website/docs/usage/resources.jade b/website/docs/usage/resources.jade index 754b951c7..56e92a1e7 100644 --- a/website/docs/usage/resources.jade +++ b/website/docs/usage/resources.jade @@ -7,6 +7,13 @@ p Many of the associated tools and resources that we're developing alongside spa +h(2, "developer") Developer tools +table(["Name", "Description"]) + +row + +cell + +src(gh("spacy-models")) spaCy Models + + +cell + | Model releases for spaCy. + +row +cell +src(gh("spacy-dev-resources")) spaCy Dev Resources @@ -55,7 +62,7 @@ p Many of the associated tools and resources that we're developing alongside spa +src(gh("thinc")) Thinc +cell - | Super sparse multi-class machine learning with Cython. + | spaCy's Machine Learning library for NLP in Python. +row +cell diff --git a/website/index.jade b/website/index.jade index 9d53432fc..1a5cd0826 100644 --- a/website/index.jade +++ b/website/index.jade @@ -66,7 +66,7 @@ include _includes/_mixins +grid +grid-col("two-thirds") +terminal("lightning_tour.py"). - # Install: pip install spacy && python -m spacy.en.download + # Install: pip install spacy && python -m spacy.download en import spacy # Load English tokenizer, tagger, parser, NER and word vectors