mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Update usage workflows
This commit is contained in:
parent
66088851dc
commit
8b86b08bed
|
@ -225,7 +225,7 @@ p
|
||||||
p
|
p
|
||||||
| Print a formatted, text-wrapped message with optional title. If a text
|
| Print a formatted, text-wrapped message with optional title. If a text
|
||||||
| argument is a #[code Path], it's converted to a string. Should only
|
| argument is a #[code Path], it's converted to a string. Should only
|
||||||
| be used for interactive components like the #[+a("/docs/api/cli") CLI].
|
| be used for interactive components like the #[+api("cli") cli].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
data_path = Path('/some/path')
|
data_path = Path('/some/path')
|
||||||
|
|
|
@ -125,7 +125,7 @@
|
||||||
},
|
},
|
||||||
|
|
||||||
"saving-loading": {
|
"saving-loading": {
|
||||||
"title": "Saving and loading models"
|
"title": "Saving, loading and data serialization"
|
||||||
},
|
},
|
||||||
|
|
||||||
"showcase": {
|
"showcase": {
|
||||||
|
|
|
@ -538,8 +538,8 @@ p
|
||||||
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
|
| #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py]
|
||||||
| script from the spaCy developer resources. Note that your corpus should
|
| script from the spaCy developer resources. Note that your corpus should
|
||||||
| not be preprocessed (i.e. you need punctuation for example). The
|
| not be preprocessed (i.e. you need punctuation for example). The
|
||||||
| #[+a("/docs/api/cli#model") #[code model]] command expects a
|
| #[+api("cli#model") #[code model]] command expects a tab-separated word
|
||||||
| tab-separated word frequencies file with three columns:
|
| frequencies file with three columns:
|
||||||
|
|
||||||
+list("numbers")
|
+list("numbers")
|
||||||
+item The number of times the word occurred in your language sample.
|
+item The number of times the word occurred in your language sample.
|
||||||
|
@ -654,13 +654,12 @@ p
|
||||||
| If your corpus uses the
|
| If your corpus uses the
|
||||||
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
|
| #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format,
|
||||||
| i.e. files with the extension #[code .conllu], you can use the
|
| i.e. files with the extension #[code .conllu], you can use the
|
||||||
| #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to
|
| #[+api("cli#convert") #[code convert]] command to convert it to spaCy's
|
||||||
| spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training.
|
| #[+a("/docs/api/annotation#json-input") JSON format] for training.
|
||||||
|
|
||||||
p
|
p
|
||||||
| Once you have your UD corpus transformed into JSON, you can train your
|
| Once you have your UD corpus transformed into JSON, you can train your
|
||||||
| model using spaCy's
|
| model using spaCy's #[+api("cli#train") #[code train]] command:
|
||||||
| #[+a("/docs/api/cli#train") #[code train]] command:
|
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
//- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE
|
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
|
||||||
|
|
||||||
p
|
|
||||||
| spaCy provides several linguistic annotation functions by default. Each
|
|
||||||
| function takes a Doc object, and modifies it in-place. The default
|
|
||||||
| pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0
|
|
||||||
| introduced the ability to customise this pipeline with arbitrary
|
|
||||||
| functions.
|
|
||||||
|
|
||||||
+code.
|
|
||||||
def arbitrary_fixup_rules(doc):
|
|
||||||
for token in doc:
|
|
||||||
if token.text == u'bill' and token.tag_ == u'NNP':
|
|
||||||
token.tag_ = u'NN'
|
|
||||||
|
|
||||||
def custom_pipeline(nlp):
|
|
||||||
return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity)
|
|
||||||
|
|
||||||
nlp = spacy.load('en', create_pipeline=custom_pipeline)
|
|
||||||
|
|
||||||
p
|
|
||||||
| The easiest way to customise the pipeline is to pass a
|
|
||||||
| #[code create_pipeline] callback to the #[code spacy.load()] function.
|
|
||||||
|
|
||||||
p
|
|
||||||
| The callback you pass to #[code create_pipeline] should take a single
|
|
||||||
| argument, and return a sequence of callables. Each callable in the
|
|
||||||
| sequence should accept a #[code Doc] object and modify it in place.
|
|
||||||
|
|
||||||
p
|
|
||||||
| Instead of passing a callback, you can also write to the
|
|
||||||
| #[code .pipeline] attribute directly.
|
|
||||||
|
|
||||||
+code.
|
|
||||||
nlp = spacy.load('en')
|
|
||||||
nlp.pipeline = [nlp.tagger]
|
|
|
@ -291,7 +291,7 @@ p
|
||||||
| environment variable, as this can lead to unexpected results, especially
|
| environment variable, as this can lead to unexpected results, especially
|
||||||
| when using #[code virtualenv]. Run the command with #[code python -m],
|
| when using #[code virtualenv]. Run the command with #[code python -m],
|
||||||
| for example #[code python -m spacy download en]. For more info on this,
|
| for example #[code python -m spacy download en]. For more info on this,
|
||||||
| see the #[+a("/docs/api/cli#download") CLI documentation].
|
| see #[+api("cli#download") download].
|
||||||
|
|
||||||
+h(3, "module-load") 'module' object has no attribute 'load'
|
+h(3, "module-load") 'module' object has no attribute 'load'
|
||||||
|
|
||||||
|
|
|
@ -10,14 +10,19 @@ p
|
||||||
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...')
|
doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...')
|
||||||
|
|
||||||
p
|
p
|
||||||
| The library should perform equally well with short or long documents.
|
| The library should perform equally well with #[strong short or long documents].
|
||||||
| All algorithms are linear-time in the length of the string, and once the
|
| All algorithms are linear-time in the length of the string, and once the
|
||||||
| data is loaded, there's no significant start-up cost to consider. This
|
| data is loaded, there's no significant start-up cost to consider. This
|
||||||
| means that you don't have to strategically merge or split your text —
|
| means that you don't have to strategically merge or split your text —
|
||||||
| you should feel free to feed in either single tweets or whole novels.
|
| you should feel free to feed in either single tweets or whole novels.
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you run #[code nlp = spacy.load('en')], the #[code nlp] object will
|
| If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will
|
||||||
|
| load the #[+a("/docs/usage/models") model] associated with the name
|
||||||
|
| #[code 'en']. Each model is a Python package containing an
|
||||||
|
| #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py]
|
||||||
|
|
||||||
|
| the #[code nlp] object will
|
||||||
| be an instance of #[code spacy.en.English]. This means that when you run
|
| be an instance of #[code spacy.en.English]. This means that when you run
|
||||||
| #[code doc = nlp(text)], you're executing
|
| #[code doc = nlp(text)], you're executing
|
||||||
| #[code spacy.en.English.__call__], which is implemented on its parent
|
| #[code spacy.en.English.__call__], which is implemented on its parent
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
include ../../_includes/_mixins
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
|
|
||||||
|
+h(2, "models") Saving models
|
||||||
|
|
||||||
p
|
p
|
||||||
| After training your model, you'll usually want to save its state, and load
|
| After training your model, you'll usually want to save its state, and load
|
||||||
| it back later. You can do this with the
|
| it back later. You can do this with the
|
||||||
|
@ -14,28 +17,28 @@ p
|
||||||
| will be written out. To make the model more convenient to deploy, we
|
| will be written out. To make the model more convenient to deploy, we
|
||||||
| recommend wrapping it as a Python package.
|
| recommend wrapping it as a Python package.
|
||||||
|
|
||||||
+h(2, "generating") Generating a model package
|
+h(3, "models-generating") Generating a model package
|
||||||
|
|
||||||
+infobox("Important note")
|
+infobox("Important note")
|
||||||
| The model packages are #[strong not suitable] for the public
|
| The model packages are #[strong not suitable] for the public
|
||||||
| #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
|
| #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
|
||||||
| designed for binary data and files over 50 MB. However, if your company
|
| designed for binary data and files over 50 MB. However, if your company
|
||||||
| is running an internal installation of pypi, publishing your models on
|
| is running an #[strong internal installation] of PyPI, publishing your
|
||||||
| there can be a convenient solution to share them with your team.
|
| models on there can be a convenient way to share them with your team.
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy comes with a handy CLI command that will create all required files,
|
| spaCy comes with a handy CLI command that will create all required files,
|
||||||
| and walk you through generating the meta data. You can also create the
|
| and walk you through generating the meta data. You can also create the
|
||||||
| meta.json manually and place it in the model data directory, or supply a
|
| meta.json manually and place it in the model data directory, or supply a
|
||||||
| path to it using the #[code --meta] flag. For more info on this, see the
|
| path to it using the #[code --meta] flag. For more info on this, see
|
||||||
| #[+a("/docs/api/cli#package") #[code package]] command documentation.
|
| the #[+api("cli#package") #[code package]] docs.
|
||||||
|
|
||||||
+aside-code("meta.json", "json").
|
+aside-code("meta.json", "json").
|
||||||
{
|
{
|
||||||
"name": "example_model",
|
"name": "example_model",
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"spacy_version": ">=1.7.0,<2.0.0",
|
"spacy_version": ">=2.0.0,<3.0.0",
|
||||||
"description": "Example model for spaCy",
|
"description": "Example model for spaCy",
|
||||||
"author": "You",
|
"author": "You",
|
||||||
"email": "you@example.com",
|
"email": "you@example.com",
|
||||||
|
@ -58,7 +61,7 @@ p This command will create a model package directory that should look like this:
|
||||||
|
|
||||||
p
|
p
|
||||||
| You can also find templates for all files in our
|
| You can also find templates for all files in our
|
||||||
| #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
|
| #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
|
||||||
| If you're creating the package manually, keep in mind that the directories
|
| If you're creating the package manually, keep in mind that the directories
|
||||||
| need to be named according to the naming conventions of
|
| need to be named according to the naming conventions of
|
||||||
| #[code [language]_[name]] and #[code [language]_[name]-[version]]. The
|
| #[code [language]_[name]] and #[code [language]_[name]-[version]]. The
|
||||||
|
@ -66,44 +69,49 @@ p
|
||||||
| respective #[code Language] class in spaCy, which will later be returned
|
| respective #[code Language] class in spaCy, which will later be returned
|
||||||
| by the model's #[code load()] method.
|
| by the model's #[code load()] method.
|
||||||
|
|
||||||
+h(2, "building") Building a model package
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| To build the package, run the following command from within the
|
| To #[strong build the package], run the following command from within the
|
||||||
| directory. This will create a #[code .tar.gz] archive in a directory
|
| directory. This will create a #[code .tar.gz] archive in a directory
|
||||||
| #[code /dist].
|
| #[code /dist]. For more information on building Python packages, see the
|
||||||
|
| #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
|
||||||
|
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python setup.py sdist
|
python setup.py sdist
|
||||||
|
|
||||||
p
|
+h(2, "loading") Loading a custom model package
|
||||||
| For more information on building Python packages, see the
|
|
||||||
| #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation].
|
|
||||||
|
|
||||||
|
|
||||||
+h(2, "loading") Loading a model package
|
|
||||||
|
|
||||||
p
|
p
|
||||||
| Model packages can be installed by pointing pip to the model's
|
| To load a model from a data directory, you can use
|
||||||
| #[code .tar.gz] archive:
|
| #[+api("spacy#load") #[code spacy.load()]] with the local path:
|
||||||
|
|
||||||
|
+code.
|
||||||
|
nlp = spacy.load('/path/to/model')
|
||||||
|
|
||||||
|
p
|
||||||
|
| If you have generated a model package, you can also install it by
|
||||||
|
| pointing pip to the model's #[code .tar.gz] archive – this is pretty
|
||||||
|
| much exactly what spaCy's #[+api("cli#download") #[code download]]
|
||||||
|
| command does under the hood.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
pip install /path/to/en_example_model-1.0.0.tar.gz
|
pip install /path/to/en_example_model-1.0.0.tar.gz
|
||||||
|
|
||||||
p You'll then be able to load the model as follows:
|
+aside-code("Custom model names", "bash").
|
||||||
|
# optional: assign custom name to model
|
||||||
|
python -m spacy link en_example_model my_cool_model
|
||||||
|
|
||||||
|
p
|
||||||
|
| You'll then be able to load the model via spaCy's loader, or by importing
|
||||||
|
| it as a module. For larger code bases, we usually recommend native
|
||||||
|
| imports, as this will make it easier to integrate models with your
|
||||||
|
| existing build process, continuous integration workflow and testing
|
||||||
|
| framework.
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
|
# option 1: import model as module
|
||||||
import en_example_model
|
import en_example_model
|
||||||
nlp = en_example_model.load()
|
nlp = en_example_model.load()
|
||||||
|
|
||||||
p
|
# option 2: use spacy.load()
|
||||||
| To load the model via #[code spacy.load()], you can also
|
nlp = spacy.load('en_example_model')
|
||||||
| create a #[+a("/docs/usage/models#usage") shortcut link] that maps the
|
|
||||||
| package name to a custom model name of your choice:
|
|
||||||
|
|
||||||
+code(false, "bash").
|
|
||||||
python -m spacy link en_example_model example
|
|
||||||
|
|
||||||
+code.
|
|
||||||
import spacy
|
|
||||||
nlp = spacy.load('example')
|
|
||||||
|
|
|
@ -77,7 +77,7 @@ p
|
||||||
p
|
p
|
||||||
| To make the model more convenient to deploy, we recommend wrapping it as
|
| To make the model more convenient to deploy, we recommend wrapping it as
|
||||||
| a Python package, so that you can install it via pip and load it as a
|
| a Python package, so that you can install it via pip and load it as a
|
||||||
| module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]]
|
| module. spaCy comes with a handy #[+api("cli#package") #[code package]]
|
||||||
| CLI command to create all required files and directories.
|
| CLI command to create all required files and directories.
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
|
|
Loading…
Reference in New Issue
Block a user