mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Update usage workflows
This commit is contained in:
		
							parent
							
								
									66088851dc
								
							
						
					
					
						commit
						8b86b08bed
					
				|  | @ -225,7 +225,7 @@ p | ||||||
| p | p | ||||||
|     |  Print a formatted, text-wrapped message with optional title. If a text |     |  Print a formatted, text-wrapped message with optional title. If a text | ||||||
|     |  argument is a #[code Path], it's converted to a string. Should only |     |  argument is a #[code Path], it's converted to a string. Should only | ||||||
|     |  be used for interactive components like the #[+a("/docs/api/cli") CLI]. |     |  be used for interactive components like the #[+api("cli") cli]. | ||||||
| 
 | 
 | ||||||
| +aside-code("Example"). | +aside-code("Example"). | ||||||
|     data_path = Path('/some/path') |     data_path = Path('/some/path') | ||||||
|  |  | ||||||
|  | @ -125,7 +125,7 @@ | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "saving-loading": { |     "saving-loading": { | ||||||
|         "title": "Saving and loading models" |         "title": "Saving, loading and data serialization" | ||||||
|     }, |     }, | ||||||
| 
 | 
 | ||||||
|     "showcase": { |     "showcase": { | ||||||
|  |  | ||||||
|  | @ -538,8 +538,8 @@ p | ||||||
|     |  #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] |     |  #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] | ||||||
|     |  script from the spaCy developer resources. Note that your corpus should |     |  script from the spaCy developer resources. Note that your corpus should | ||||||
|     |  not be preprocessed (i.e. you need punctuation for example). The |     |  not be preprocessed (i.e. you need punctuation for example). The | ||||||
|     |  #[+a("/docs/api/cli#model") #[code model]] command expects a |     |  #[+api("cli#model") #[code model]] command expects a tab-separated word | ||||||
|     |  tab-separated word frequencies file with three columns: |     |  frequencies file with three columns: | ||||||
| 
 | 
 | ||||||
| +list("numbers") | +list("numbers") | ||||||
|     +item The number of times the word occurred in your language sample. |     +item The number of times the word occurred in your language sample. | ||||||
|  | @ -654,13 +654,12 @@ p | ||||||
|     |  If your corpus uses the |     |  If your corpus uses the | ||||||
|     |  #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, |     |  #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, | ||||||
|     |  i.e. files with the extension #[code .conllu], you can use the |     |  i.e. files with the extension #[code .conllu], you can use the | ||||||
|     |  #[+a("/docs/api/cli#convert") #[code convert]] command to convert it to |     |  #[+api("cli#convert") #[code convert]] command to convert it to spaCy's | ||||||
|     |  spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. |     |  #[+a("/docs/api/annotation#json-input") JSON format] for training. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Once you have your UD corpus transformed into JSON, you can train your |     |  Once you have your UD corpus transformed into JSON, you can train your | ||||||
|     |  model using spaCy's |     |  model using spaCy's #[+api("cli#train") #[code train]] command: | ||||||
|     |  #[+a("/docs/api/cli#train") #[code train]] command: |  | ||||||
| 
 | 
 | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] |     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] | ||||||
|  |  | ||||||
|  | @ -1,38 +0,0 @@ | ||||||
| //- 💫 DOCS > USAGE > CUSTOMIZING THE PIPELINE |  | ||||||
| 
 |  | ||||||
| include ../../_includes/_mixins |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  spaCy provides several linguistic annotation functions by default. Each |  | ||||||
|     |  function takes a Doc object, and modifies it in-place. The default |  | ||||||
|     |  pipeline is #[code [nlp.tagger, nlp.entity, nlp.parser]]. spaCy 1.0 |  | ||||||
|     |  introduced the ability to customise this pipeline with arbitrary |  | ||||||
|     |  functions. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     def arbitrary_fixup_rules(doc): |  | ||||||
|         for token in doc: |  | ||||||
|             if token.text == u'bill' and token.tag_ == u'NNP': |  | ||||||
|                 token.tag_ = u'NN' |  | ||||||
| 
 |  | ||||||
|     def custom_pipeline(nlp): |  | ||||||
|         return (nlp.tagger, arbitrary_fixup_rules, nlp.parser, nlp.entity) |  | ||||||
| 
 |  | ||||||
|     nlp = spacy.load('en', create_pipeline=custom_pipeline) |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  The easiest way to customise the pipeline is to pass a |  | ||||||
|     |  #[code create_pipeline] callback to the #[code spacy.load()] function. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  The callback you pass to #[code create_pipeline] should take a single |  | ||||||
|     |  argument, and return a sequence of callables. Each callable in the |  | ||||||
|     |  sequence should accept a #[code Doc] object and modify it in place. |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Instead of passing a callback, you can also write to the |  | ||||||
|     |  #[code .pipeline] attribute directly. |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     nlp = spacy.load('en') |  | ||||||
|     nlp.pipeline = [nlp.tagger] |  | ||||||
|  | @ -291,7 +291,7 @@ p | ||||||
|     |  environment variable, as this can lead to unexpected results, especially |     |  environment variable, as this can lead to unexpected results, especially | ||||||
|     |  when using #[code virtualenv]. Run the command with #[code python -m], |     |  when using #[code virtualenv]. Run the command with #[code python -m], | ||||||
|     |  for example #[code python -m spacy download en]. For more info on this, |     |  for example #[code python -m spacy download en]. For more info on this, | ||||||
|     |  see the #[+a("/docs/api/cli#download") CLI documentation]. |     |  see #[+api("cli#download") download]. | ||||||
| 
 | 
 | ||||||
| +h(3, "module-load") 'module' object has no attribute 'load' | +h(3, "module-load") 'module' object has no attribute 'load' | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -10,14 +10,19 @@ p | ||||||
|     doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') |     doc = nlp(u'Hello, world! A three sentence document.\nWith new lines...') | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  The library should perform equally well with short or long documents. |     |  The library should perform equally well with #[strong short or long documents]. | ||||||
|     |  All algorithms are linear-time in the length of the string, and once the |     |  All algorithms are linear-time in the length of the string, and once the | ||||||
|     |  data is loaded, there's no significant start-up cost to consider. This |     |  data is loaded, there's no significant start-up cost to consider. This | ||||||
|     |  means that you don't have to strategically merge or split your text — |     |  means that you don't have to strategically merge or split your text — | ||||||
|     |  you should feel free to feed in either single tweets or whole novels. |     |  you should feel free to feed in either single tweets or whole novels. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  If you run #[code nlp = spacy.load('en')], the #[code nlp] object will |     |  If you run #[+api("spacy#load") #[code spacy.load('en')]], spaCy will | ||||||
|  |     |  load the #[+a("/docs/usage/models") model] associated with the name | ||||||
|  |     |  #[code 'en']. Each model is a Python package containing an | ||||||
|  |     |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py] | ||||||
|  | 
 | ||||||
|  | the #[code nlp] object will | ||||||
|     |  be an instance of #[code spacy.en.English]. This means that when you run |     |  be an instance of #[code spacy.en.English]. This means that when you run | ||||||
|     |  #[code doc = nlp(text)], you're executing |     |  #[code doc = nlp(text)], you're executing | ||||||
|     |  #[code spacy.en.English.__call__], which is implemented on its parent |     |  #[code spacy.en.English.__call__], which is implemented on its parent | ||||||
|  |  | ||||||
|  | @ -1,5 +1,8 @@ | ||||||
| include ../../_includes/_mixins | include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | +h(2, "models") Saving models | ||||||
|  | 
 | ||||||
| p | p | ||||||
|     |  After training your model, you'll usually want to save its state, and load |     |  After training your model, you'll usually want to save its state, and load | ||||||
|     |  it back later. You can do this with the |     |  it back later. You can do this with the | ||||||
|  | @ -14,28 +17,28 @@ p | ||||||
|     |  will be written out. To make the model more convenient to deploy, we |     |  will be written out. To make the model more convenient to deploy, we | ||||||
|     |  recommend wrapping it as a Python package. |     |  recommend wrapping it as a Python package. | ||||||
| 
 | 
 | ||||||
| +h(2, "generating") Generating a model package | +h(3, "models-generating") Generating a model package | ||||||
| 
 | 
 | ||||||
| +infobox("Important note") | +infobox("Important note") | ||||||
|     |  The model packages are #[strong not suitable] for the public |     |  The model packages are #[strong not suitable] for the public | ||||||
|     |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not |     |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | ||||||
|     |  designed for binary data and files over 50 MB. However, if your company |     |  designed for binary data and files over 50 MB. However, if your company | ||||||
|     |  is running an internal installation of pypi, publishing your models on |     |  is running an #[strong internal installation] of PyPI, publishing your | ||||||
|     |  there can be a convenient solution to share them with your team. |     |  models on there can be a convenient way to share them with your team. | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy comes with a handy CLI command that will create all required files, |     |  spaCy comes with a handy CLI command that will create all required files, | ||||||
|     |  and walk you through generating the meta data. You can also create the |     |  and walk you through generating the meta data. You can also create the | ||||||
|     |  meta.json manually and place it in the model data directory, or supply a |     |  meta.json manually and place it in the model data directory, or supply a | ||||||
|     |  path to it using the #[code --meta] flag. For more info on this, see the |     |  path to it using the #[code --meta] flag. For more info on this, see | ||||||
|     |  #[+a("/docs/api/cli#package") #[code package]] command documentation. |     |  the #[+api("cli#package") #[code package]] docs. | ||||||
| 
 | 
 | ||||||
| +aside-code("meta.json", "json"). | +aside-code("meta.json", "json"). | ||||||
|     { |     { | ||||||
|         "name": "example_model", |         "name": "example_model", | ||||||
|         "lang": "en", |         "lang": "en", | ||||||
|         "version": "1.0.0", |         "version": "1.0.0", | ||||||
|         "spacy_version": ">=1.7.0,<2.0.0", |         "spacy_version": ">=2.0.0,<3.0.0", | ||||||
|         "description": "Example model for spaCy", |         "description": "Example model for spaCy", | ||||||
|         "author": "You", |         "author": "You", | ||||||
|         "email": "you@example.com", |         "email": "you@example.com", | ||||||
|  | @ -58,7 +61,7 @@ p This command will create a model package directory that should look like this: | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  You can also find templates for all files in our |     |  You can also find templates for all files in our | ||||||
|     |  #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. |     |  #[+src(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. | ||||||
|     |  If you're creating the package manually, keep in mind that the directories |     |  If you're creating the package manually, keep in mind that the directories | ||||||
|     |  need to be named according to the naming conventions of |     |  need to be named according to the naming conventions of | ||||||
|     |  #[code [language]_[name]] and #[code [language]_[name]-[version]]. The |     |  #[code [language]_[name]] and #[code [language]_[name]-[version]]. The | ||||||
|  | @ -66,44 +69,49 @@ p | ||||||
|     |  respective #[code Language] class in spaCy, which will later be returned |     |  respective #[code Language] class in spaCy, which will later be returned | ||||||
|     |  by the model's #[code load()] method. |     |  by the model's #[code load()] method. | ||||||
| 
 | 
 | ||||||
| +h(2, "building") Building a model package |  | ||||||
| 
 |  | ||||||
| p | p | ||||||
|     |  To build the package, run the following command from within the |     |  To #[strong build the package], run the following command from within the | ||||||
|     |  directory. This will create a #[code .tar.gz] archive in a directory |     |  directory. This will create a #[code .tar.gz] archive in a directory | ||||||
|     |  #[code /dist]. |     |  #[code /dist]. For more information on building Python packages, see the | ||||||
|  |     |  #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     python setup.py sdist |     python setup.py sdist | ||||||
| 
 | 
 | ||||||
| p | +h(2, "loading") Loading a custom model package | ||||||
|     |  For more information on building Python packages, see the |  | ||||||
|     |  #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +h(2, "loading") Loading a model package |  | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  Model packages can be installed by pointing pip to the model's |     |  To load a model from a data directory, you can use | ||||||
|     |  #[code .tar.gz] archive: |     |  #[+api("spacy#load") #[code spacy.load()]] with the local path: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp = spacy.load('/path/to/model') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  If you have generated a model package, you can also install it by | ||||||
|  |     |  pointing pip to the model's #[code .tar.gz] archive – this is pretty | ||||||
|  |     |  much exactly what spaCy's #[+api("cli#download") #[code download]] | ||||||
|  |     |  command does under the hood. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|     pip install /path/to/en_example_model-1.0.0.tar.gz |     pip install /path/to/en_example_model-1.0.0.tar.gz | ||||||
| 
 | 
 | ||||||
| p You'll then be able to load the model as follows: | +aside-code("Custom model names", "bash"). | ||||||
|  |     # optional: assign custom name to model | ||||||
|  |     python -m spacy link en_example_model my_cool_model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You'll then be able to load the model via spaCy's loader, or by importing | ||||||
|  |     |  it as a module. For larger code bases, we usually recommend native | ||||||
|  |     |  imports, as this will make it easier to integrate models with your | ||||||
|  |     |  existing build process, continuous integration workflow and testing | ||||||
|  |     |  framework. | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|  |     # option 1: import model as module | ||||||
|     import en_example_model |     import en_example_model | ||||||
|     nlp = en_example_model.load() |     nlp = en_example_model.load() | ||||||
| 
 | 
 | ||||||
| p |     # option 2: use spacy.load() | ||||||
|     |  To load the model via #[code spacy.load()], you can also |     nlp = spacy.load('en_example_model') | ||||||
|     |  create a #[+a("/docs/usage/models#usage") shortcut link] that maps the |  | ||||||
|     |  package name to a custom model name of your choice: |  | ||||||
| 
 |  | ||||||
| +code(false, "bash"). |  | ||||||
|     python -m spacy link en_example_model example |  | ||||||
| 
 |  | ||||||
| +code. |  | ||||||
|     import spacy |  | ||||||
|     nlp = spacy.load('example') |  | ||||||
|  |  | ||||||
|  | @ -77,7 +77,7 @@ p | ||||||
| p | p | ||||||
|     |  To make the model more convenient to deploy, we recommend wrapping it as |     |  To make the model more convenient to deploy, we recommend wrapping it as | ||||||
|     |  a Python package, so that you can install it via pip and load it as a |     |  a Python package, so that you can install it via pip and load it as a | ||||||
|     |  module. spaCy comes with a handy #[+a("/docs/api/cli#package") #[code package]] |     |  module. spaCy comes with a handy #[+api("cli#package") #[code package]] | ||||||
|     |  CLI command to create all required files and directories. |     |  CLI command to create all required files and directories. | ||||||
| 
 | 
 | ||||||
| +code(false, "bash"). | +code(false, "bash"). | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user