From 8b86b08bedf8143dad696bc6077f4c10a12782b9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 24 May 2017 11:59:08 +0200 Subject: [PATCH] Update usage workflows --- website/docs/api/util.jade | 2 +- website/docs/usage/_data.json | 2 +- website/docs/usage/adding-languages.jade | 11 ++- website/docs/usage/customizing-pipeline.jade | 38 ----------- website/docs/usage/index.jade | 2 +- website/docs/usage/processing-text.jade | 9 ++- website/docs/usage/saving-loading.jade | 70 +++++++++++--------- website/docs/usage/training-ner.jade | 2 +- 8 files changed, 55 insertions(+), 81 deletions(-) delete mode 100644 website/docs/usage/customizing-pipeline.jade diff --git a/website/docs/api/util.jade b/website/docs/api/util.jade index f14cdbb6d..bf81a4f61 100644 --- a/website/docs/api/util.jade +++ b/website/docs/api/util.jade @@ -225,7 +225,7 @@ p p | Print a formatted, text-wrapped message with optional title. If a text | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the #[+a("/docs/api/cli") CLI]. + | be used for interactive components like the #[+api("cli") cli]. +aside-code("Example"). data_path = Path('/some/path') diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index f903c7c1e..acd973aa1 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -125,7 +125,7 @@ }, "saving-loading": { - "title": "Saving and loading models" + "title": "Saving, loading and data serialization" }, "showcase": { diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index f3648b885..ae04aad57 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -538,8 +538,8 @@ p | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | script from the spaCy developer resources. Note that your corpus should | not be preprocessed (i.e. you need punctuation for example). 
The - | #[+a("/docs/api/cli#model") #[code model]] command expects a - | tab-separated word frequencies file with three columns: + | #[+api("cli#model") #[code model]] command expects a tab-separated word + | frequencies file with three columns: +list("numbers") +item The number of times the word occurred in your language sample. @@ -654,13 +654,12 @@ p | If your corpus uses the | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | i.e. files with the extension #[code .conllu], you can use the - | #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to - | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. + | #[+api("cli#convert") #[code convert]] command to convert it to spaCy's + | #[+a("/docs/api/annotation#json-input") JSON format] for training. p | Once you have your UD corpus transformed into JSON, you can train your - | model use the using spaCy's - | #[+a("/docs/api/cli#train") #[code train]] command: + | model using spaCy's #[+api("cli#train") #[code train]] command: +code(false, "bash"). python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] diff --git a/website/docs/usage/customizing-pipeline.jade b/website/docs/usage/customizing-pipeline.jade deleted file mode 100644 index a4846d02e..000000000 --- a/website/docs/usage/customizing-pipeline.jade +++ /dev/null @@ -1,38 +0,0 @@ -//- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE - -include ../../_includes/_mixins - -p - | spaCy provides several linguistic annotation functions by default. Each - | function takes a Doc object, and modifies it in-place. The default - | pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 - | introduced the ability to customise this pipeline with arbitrary - | functions. - -+code. 
- def arbitrary_fixup_rules(doc): - for token in doc: - if token.text == u'bill' and token.tag_ == u'NNP': - token.tag_ = u'NN' - - def custom_pipeline(nlp): - return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) - - nlp = spacy.load('en', create_pipeline=custom_pipeline) - -p - | The easiest way to customise the pipeline is to pass a - | #[code create_pipeline] callback to the #[code spacy.load()] function. - -p - | The callback you pass to #[code create_pipeline] should take a single - | argument, and return a sequence of callables. Each callable in the - | sequence should accept a #[code Doc] object and modify it in place. - -p - | Instead of passing a callback, you can also write to the - | #[code .pipeline] attribute directly. - -+code. - nlp = spacy.load('en') - nlp.pipeline = [nlp.tagger] diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index 61398b431..cb1ab5754 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -291,7 +291,7 @@ p | environment variable, as this can lead to unexpected results, especially | when using #[code virtualenv]. Run the command with #[code python -m], | for example #[code python -m spacy download en]. For more info on this, - | see the #[+a("/docs/api/cli#download") CLI documentation]. + | see #[+api("cli#download") download]. +h(3, "module-load") 'module' object has no attribute 'load' diff --git a/website/docs/usage/processing-text.jade b/website/docs/usage/processing-text.jade index 4bd6132d2..2562d9fc4 100644 --- a/website/docs/usage/processing-text.jade +++ b/website/docs/usage/processing-text.jade @@ -10,14 +10,19 @@ p doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') p - | The library should perform equally well with short or long documents. + | The library should perform equally well with #[strong short or long documents]. 
| All algorithms are linear-time in the length of the string, and once the | data is loaded, there's no significant start-up cost to consider. This | means that you don't have to strategically merge or split your text — | you should feel free to feed in either single tweets or whole novels. p - | If you run #[code nlp = spacy.load('en')], the #[code nlp] object will + | If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will + | load the #[+a("/docs/usage/models") model] associated with the name + | #[code 'en']. Each model is a Python package containing an + | #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]. + + | The #[code nlp] object will | be an instance of #[code spacy.en.English]. This means that when you run | #[code doc = nlp(text)], you're executing | #[code spacy.en.English.__call__], which is implemented on its parent diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 3513e9505..63c951d40 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -1,5 +1,8 @@ include ../../_includes/_mixins + ++h(2, "models") Saving models + p | After training your model, you'll usually want to save its state, and load | it back later. You can do this with the @@ -14,28 +17,28 @@ p | will be written out. To make the model more convenient to deploy, we | recommend wrapping it as a Python package. -+h(2, "generating") Generating a model package ++h(3, "models-generating") Generating a model package +infobox("Important note") | The model packages are #[strong not suitable] for the public | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | designed for binary data and files over 50 MB. However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. 
+ | is running an #[strong internal installation] of PyPI, publishing your + | models there can be a convenient way to share them with your team. p | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a - | path to it using the #[code --meta] flag. For more info on this, see the - | #[+a("/docs/api/cli#package") #[code package]] command documentation. + | path to it using the #[code --meta] flag. For more info on this, see + | the #[+api("cli#package") #[code package]] docs. +aside-code("meta.json", "json"). { "name": "example_model", "lang": "en", "version": "1.0.0", - "spacy_version": ">=1.7.0,<2.0.0", + "spacy_version": ">=2.0.0,<3.0.0", "description": "Example model for spaCy", "author": "You", "email": "you@example.com", @@ -58,7 +61,7 @@ p This command will create a model package directory that should look like this: p | You can also find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The @@ -66,44 +69,49 @@ p | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. -+h(2, "building") Building a model package - p - | To build the package, run the following command from within the + | To #[strong build the package], run the following command from within the | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. + | #[code /dist]. 
For more information on building Python packages, see the + | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + +code(false, "bash"). python setup.py sdist -p - | For more information on building Python packages, see the - | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. - - -+h(2, "loading") Loading a model package ++h(2, "loading") Loading a custom model package p - | Model packages can be installed by pointing pip to the model's - | #[code .tar.gz] archive: + | To load a model from a data directory, you can use + | #[+api("spacy#load") #[code spacy.load()]] with the local path: + ++code. + nlp = spacy.load('/path/to/model') + +p + | If you have generated a model package, you can also install it by + | pointing pip to the model's #[code .tar.gz] archive – this is pretty + | much exactly what spaCy's #[+api("cli#download") #[code download]] + | command does under the hood. +code(false, "bash"). pip install /path/to/en_example_model-1.0.0.tar.gz -p You'll then be able to load the model as follows: ++aside-code("Custom model names", "bash"). + # optional: assign custom name to model + python -m spacy link en_example_model my_cool_model + +p + | You'll then be able to load the model via spaCy's loader, or by importing + | it as a module. For larger code bases, we usually recommend native + | imports, as this will make it easier to integrate models with your + | existing build process, continuous integration workflow and testing + | framework. +code. + # option 1: import model as module import en_example_model nlp = en_example_model.load() -p - | To load the model via #[code spacy.load()], you can also - | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the - | package name to a custom model name of your choice: - -+code(false, "bash"). - python -m spacy link en_example_model example - -+code. 
- import spacy - nlp = spacy.load('example') + # option 2: use spacy.load() + nlp = spacy.load('en_example_model') diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade index 4d864ac9d..8b8789485 100644 --- a/website/docs/usage/training-ner.jade +++ b/website/docs/usage/training-ner.jade @@ -77,7 +77,7 @@ p p | To make the model more convenient to deploy, we recommend wrapping it as | a Python package, so that you can install it via pip and load it as a - | module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] + | module. spaCy comes with a handy #[+api("cli#package") #[code package]] | CLI command to create all required files and directories. +code(false, "bash").