	Merge branch 'master' into develop
Commit: ad74245be9
@@ -10,7 +10,7 @@ open-source software, released under the MIT license.
 
 📊 **Help us improve the library!** `Take the spaCy user survey <https://survey.spacy.io>`_.
 
-💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.8 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
 
 .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square
     :target: https://travis-ci.org/explosion/spaCy
@@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests:
 =========== ============== ===========
 Version     Date           Description
 =========== ============== ===========
+`v1.8.0`_   ``2017-04-16`` Better NER training, saving and loading
 `v1.7.5`_   ``2017-04-07`` Bug fixes and new CLI commands
 `v1.7.3`_   ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes
 `v1.7.2`_   ``2017-03-20`` Small fixes to beam parser and model linking
@@ -350,6 +351,7 @@ Version     Date           Description
 `v0.93`_    ``2015-09-22`` Bug fixes to word vectors
 =========== ============== ===========
 
+.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0
 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5
 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3
 .. _v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2

@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
 
 
 def iter_texts_from_json_bz2(loc):
-    '''
+    """
     Iterator of unicode strings, one per document (here, a comment).
 
     Expects a path to a BZ2 file, which should be newline-delimited JSON. The
     document text should be in a string field titled 'body'.
 
     This is the data format of the Reddit comments corpus.
-    '''
+    """
     with bz2.BZ2File(loc) as file_:
         for i, line in enumerate(file_):
             yield ujson.loads(line)['body']
@@ -80,7 +81,7 @@ def is_sent_begin(word):
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
     if not path.exists(out_dir):
         path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])

@@ -1,22 +1,45 @@
+#!/usr/bin/env python
+"""
+Example of training an additional entity type
+
+This script shows how to add a new entity type to an existing pre-trained NER
+model. To keep the example short and simple, only four sentences are provided
+as examples. In practice, you'll need many more — a few hundred would be a
+good start. You will also likely need to mix in examples of other entity
+types, which might be obtained by running the entity recognizer over unlabelled
+sentences, and adding their annotations to the training set.
+
+The actual training is performed by looping over the examples, and calling
+`nlp.entity.update()`. The `update()` method steps through the words of the
+input. At each word, it makes a prediction. It then consults the annotations
+provided on the GoldParse instance, to see whether it was right. If it was
+wrong, it adjusts its weights so that the correct action will score higher
+next time.
+
+After training your model, you can save it to a directory. We recommend
+wrapping models as Python packages, for ease of deployment.
+
+For more details, see the documentation:
+* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+
+Developed for: spaCy 1.7.6
+Last tested for: spaCy 1.7.6
+"""
+# coding: utf8
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
+from pathlib import Path
 
 import spacy
 from spacy.pipeline import EntityRecognizer
 from spacy.gold import GoldParse
 from spacy.tagger import Tagger
 
-
-try:
-    unicode
-except:
-    unicode = str
-
 
 def train_ner(nlp, train_data, output_dir):
-    # Add new words to vocab.
+    # Add new words to vocab
     for raw_text, _ in train_data:
         doc = nlp.make_doc(raw_text)
         for word in doc:
@@ -30,11 +53,14 @@ def train_ner(nlp, train_data, output_dir):
             nlp.tagger(doc)
             loss = nlp.entity.update(doc, gold)
     nlp.end_training()
-    nlp.save_to_directory(output_dir)
+    if output_dir:
+        nlp.save_to_directory(output_dir)
 
 
 def main(model_name, output_directory=None):
     nlp = spacy.load(model_name)
+    if output_directory is not None:
+        output_directory = Path(output_directory)
 
     train_data = [
         (
@@ -55,18 +81,18 @@ def main(model_name, output_directory=None):
         )
     ]
     nlp.entity.add_label('ANIMAL')
-    if output_directory is not None:
-        output_directory = pathlib.Path(output_directory)
     ner = train_ner(nlp, train_data, output_directory)
 
+    # Test that the entity is recognized
     doc = nlp('Do you like horses?')
     for ent in doc.ents:
         print(ent.label_, ent.text)
-    nlp2 = spacy.load('en', path=output_directory)
-    nlp2.entity.add_label('ANIMAL')
-    doc2 = nlp2('Do you like horses?')
-    for ent in doc2.ents:
-        print(ent.label_, ent.text)
+    if output_directory:
+        nlp2 = spacy.load('en', path=output_directory)
+        nlp2.entity.add_label('ANIMAL')
+        doc2 = nlp2('Do you like horses?')
+        for ent in doc2.ents:
+            print(ent.label_, ent.text)
 
 
 if __name__ == '__main__':

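The training sentences themselves are elided from this hunk. As a hypothetical illustration of the format the script's train_data list uses, each entry pairs a raw text with character-offset annotations for the new label, which train_ner() then turns into a GoldParse:

    # Hypothetical entry; the real script provides four such sentences.
    train_data = [
        (
            'Horses are too tall and they pretend to care about your feelings',
            [(0, 6, 'ANIMAL')],   # (start_char, end_char, label)
        ),
    ]
    # Inside train_ner(), each pair becomes something like:
    # doc = nlp.make_doc(raw_text)
    # gold = GoldParse(doc, entities=entity_offsets)
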
@@ -11,3 +11,4 @@ ujson>=1.35
 dill>=0.2,<0.3
 requests>=2.13.0,<3.0.0
 regex==2017.4.5
+pytest>=3.0.6,<4.0.0

|  | @ -1,39 +1,40 @@ | ||||||
| # coding: utf8 | # coding: utf8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from pathlib import Path | from . import util | ||||||
| 
 |  | ||||||
| from .util import set_lang_class, get_lang_class, parse_package_meta |  | ||||||
| from .deprecated import resolve_model_name | from .deprecated import resolve_model_name | ||||||
| from .cli import info | from .cli import info | ||||||
| 
 | 
 | ||||||
| from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he | from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| set_lang_class(en.English.lang, en.English) | _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, | ||||||
| set_lang_class(de.German.lang, de.German) |              it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, | ||||||
| set_lang_class(es.Spanish.lang, es.Spanish) |              fi.Finnish, bn.Bengali, he.Hebrew) | ||||||
| set_lang_class(pt.Portuguese.lang, pt.Portuguese) | 
 | ||||||
| set_lang_class(fr.French.lang, fr.French) | 
 | ||||||
| set_lang_class(it.Italian.lang, it.Italian) | for _lang in _languages: | ||||||
| set_lang_class(hu.Hungarian.lang, hu.Hungarian) |     util.set_lang_class(_lang.lang, _lang) | ||||||
| set_lang_class(zh.Chinese.lang, zh.Chinese) |  | ||||||
| set_lang_class(nl.Dutch.lang, nl.Dutch) |  | ||||||
| set_lang_class(sv.Swedish.lang, sv.Swedish) |  | ||||||
| set_lang_class(fi.Finnish.lang, fi.Finnish) |  | ||||||
| set_lang_class(bn.Bengali.lang, bn.Bengali) |  | ||||||
| set_lang_class(he.Hebrew.lang, he.Hebrew) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load(name, **overrides): | def load(name, **overrides): | ||||||
|     data_path = overrides.get('path', util.get_data_path()) |     if overrides.get('path') in (None, False, True): | ||||||
|     model_name = resolve_model_name(name) |         data_path = util.get_data_path() | ||||||
|     meta = parse_package_meta(data_path, model_name, require=False) |         model_name = resolve_model_name(name) | ||||||
|  |         model_path = data_path / model_name | ||||||
|  |         if not model_path.exists(): | ||||||
|  |             lang_name = util.get_lang_class(name).lang | ||||||
|  |             model_path = None | ||||||
|  |             util.print_msg( | ||||||
|  |                 "Only loading the '{}' tokenizer.".format(lang_name), | ||||||
|  |                 title="Warning: no model found for '{}'".format(name)) | ||||||
|  |     else: | ||||||
|  |         model_path = util.ensure_path(overrides['path']) | ||||||
|  |         data_path = model_path.parent | ||||||
|  |         model_name = '' | ||||||
|  |     meta = util.parse_package_meta(data_path, model_name, require=False) | ||||||
|     lang = meta['lang'] if meta and 'lang' in meta else name |     lang = meta['lang'] if meta and 'lang' in meta else name | ||||||
|     cls = get_lang_class(lang) |     cls = util.get_lang_class(lang) | ||||||
|     overrides['meta'] = meta |     overrides['meta'] = meta | ||||||
|     model_path = Path(data_path / model_name) |     overrides['path'] = model_path | ||||||
|     if model_path.exists(): |  | ||||||
|         overrides['path'] = model_path |  | ||||||
| 
 |  | ||||||
|     return cls(**overrides) |     return cls(**overrides) | ||||||
|  |  | ||||||
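With this rewrite, spacy.load() resolves the model path up front and always passes it through the overrides. A usage sketch (the directory path is hypothetical):

    import spacy

    # By shortcut link or package name; falls back to the bare tokenizer
    # with a warning when no model data is installed.
    nlp = spacy.load('en')

    # From an explicit data directory; its meta.json, if present,
    # decides which Language class is constructed.
    nlp = spacy.load('en', path='/home/me/data/en_example_model')
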
@@ -63,15 +63,16 @@ class CLI(object):
     @plac.annotations(
         input_dir=("directory with model data", "positional", None, str),
         output_dir=("output parent directory", "positional", None, str),
+        meta=("path to meta.json", "option", "m", str),
         force=("force overwriting of existing folder in output directory", "flag", "f", bool)
     )
-    def package(self, input_dir, output_dir, force=False):
+    def package(self, input_dir, output_dir, meta=None, force=False):
         """
         Generate Python package for model data, including meta and required
         installation files. A new directory will be created in the specified
         output directory, and model data will be copied over.
         """
-        cli_package(input_dir, output_dir, force)
+        cli_package(input_dir, output_dir, meta, force)
 
 
     @plac.annotations(

@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy'
-__version__ = '1.7.5'
+__version__ = '1.8.0'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'

@@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False):
 
 def symlink(model_path, link_name, force):
     model_path = Path(model_path)
-    if not Path(model_path).exists():
+    if not model_path.exists():
         util.sys_exit(
             "The data should be located in {p}".format(p=model_path),
             title="Can't locate model data")
@@ -48,12 +48,16 @@ def symlink(model_path, link_name, force):
     except:
         # This is quite dirty, but just making sure other errors are caught so
         # users at least see a proper message.
-        util.sys_exit(
-            "Creating a symlink in spacy/data failed. You can still import "
-            "the model as a Python package and call its load() method, or "
-            "create the symlink manually:",
+        util.print_msg(
+            "Creating a symlink in spacy/data failed. Make sure you have the "
+            "required permissions and try re-running the command as admin, or "
+            "use a virtualenv to install spaCy in a user directory, instead of "
+            "doing a system installation.",
+            "You can still import the model as a Python package and call its "
+            "load() method, or create the symlink manually:",
             "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
             title="Error: Couldn't link model to '{l}'".format(l=link_name))
+        raise
 
     util.print_msg(
         "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),

@@ -9,16 +9,24 @@ from ..compat import unicode_, json_dumps
 from .. import util
 
 
-def package(input_dir, output_dir, force):
+def package(input_dir, output_dir, meta_path, force):
     input_path = Path(input_dir)
     output_path = Path(output_dir)
-    check_dirs(input_path, output_path)
+    meta_path = util.ensure_path(meta_path)
+    check_dirs(input_path, output_path, meta_path)
 
     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
     template_init = get_template('en_model_name/__init__.py')
-    meta = generate_meta()
 
+    meta_path = meta_path or input_path / 'meta.json'
+    if meta_path.is_file():
+        util.print_msg(unicode_(meta_path), title="Reading meta.json from file")
+        meta = util.read_json(meta_path)
+    else:
+        meta = generate_meta()
+
+    validate_meta(meta, ['lang', 'name', 'version'])
     model_name = meta['lang'] + '_' + meta['name']
     model_name_v = model_name + '-' + meta['version']
     main_path = output_path / model_name_v
@@ -37,20 +45,23 @@ def package(input_dir, output_dir, force):
         title="Successfully created package {p}".format(p=model_name_v))
 
 
-def check_dirs(input_path, output_path):
+def check_dirs(input_path, output_path, meta_path):
     if not input_path.exists():
         util.sys_exit(unicode_(input_path.as_posix()), title="Model directory not found")
     if not output_path.exists():
         util.sys_exit(unicode_(output_path), title="Output directory not found")
+    if meta_path and not meta_path.exists():
+        util.sys_exit(unicode_(meta_path), title="meta.json not found")
 
 
 def create_dirs(package_path, force):
     if package_path.exists():
         if force:
-            shutil.rmtree(unicode_(package_path.as_posix))
+            shutil.rmtree(unicode_(package_path))
         else:
-            util.sys_exit(unicode_(package_path.as_posix),
-                "Please delete the directory and try again.",
+            util.sys_exit(unicode_(package_path),
+                "Please delete the directory and try again, or use the --force "
+                "flag to overwrite existing directories.",
                 title="Package directory already exists")
     Path.mkdir(package_path, parents=True)
 
@@ -80,6 +91,14 @@ def generate_meta():
     return meta
 
 
+def validate_meta(meta, keys):
+    for key in keys:
+        if key not in meta or meta[key] == '':
+            util.sys_exit(
+                "This setting is required to build your package.",
+                title='No "{k}" setting found in meta.json'.format(k=key))
+
+
 def get_template(filepath):
     url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/'
     r = requests.get(url + filepath)

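A sketch of calling the updated entry point directly, assuming the function lives at spacy.cli.package as the relative imports above suggest (all paths hypothetical):

    from spacy.cli.package import package

    # Reads meta.json from the given path instead of prompting for the
    # meta data interactively; the --force flag maps to force=True.
    package('/home/me/data/en_example_model', '/home/me/my_models',
            meta_path='/home/me/data/meta.json', force=True)
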
@@ -2,8 +2,8 @@
 from __future__ import unicode_literals, division, print_function
 
 import json
-from pathlib import Path
 
+from ..util import ensure_path
 from ..scorer import Scorer
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
@@ -12,9 +12,9 @@ from .. import util
 
 def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner,
           parser_L1):
-    output_path = Path(output_dir)
-    train_path = Path(train_data)
-    dev_path = Path(dev_data)
+    output_path = ensure_path(output_dir)
+    train_path = ensure_path(train_data)
+    dev_path = ensure_path(dev_data)
     check_dirs(output_path, train_path, dev_path)
 
     lang = util.get_lang_class(language)
@@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner
 
 
 def train_config(config):
-    config_path = Path(config)
+    config_path = ensure_path(config)
     if not config_path.is_file():
         util.sys_exit(config_path.as_posix(), title="Config file not found")
     config = json.load(config_path)
@@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_cfg,
                 entity_cfg, n_iter):
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
 
-    with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
+    with Language.train(output_path, train_data,
+                        pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer:
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)

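All three path arguments now go through the same small helper. A sketch of ensure_path()'s behaviour as used here, inferred from its pass-through branch shown later in the util.py hunk:

    from spacy.util import ensure_path

    output_path = ensure_path('/tmp/model-output')  # str becomes a pathlib.Path
    output_path = ensure_path(output_path)          # a Path passes through unchanged
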
@@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides):
 def resolve_model_name(name):
     """
     If spaCy is loaded with 'de', check if symlink already exists. If
-    not, user have upgraded from older version and have old models installed.
+    not, user may have upgraded from older version and have old models installed.
     Check if old model directory exists and if so, return that instead and create
     shortcut link. If English model is found and no shortcut exists, raise error
     and tell user to install new model.

@@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function
 import io
 import re
 import ujson
-from pathlib import Path
 
 from .syntax import nonproj
+from .util import ensure_path
 
 
 def tags_to_entities(tags):
@@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words):
 
 
 def read_json_file(loc, docs_filter=None):
-    loc = Path(loc)
+    loc = ensure_path(loc)
     if loc.is_dir():
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename)
     else:
-        with io.open(loc, 'r', encoding='utf8') as file_:
+        with loc.open('r', encoding='utf8') as file_:
             docs = ujson.load(file_)
         for doc in docs:
             if docs_filter is not None and not docs_filter(doc):

@@ -204,15 +204,18 @@ class Language(object):
     @classmethod
     @contextmanager
     def train(cls, path, gold_tuples, **configs):
-        if parser_cfg['pseudoprojective']:
+        parser_cfg = configs.get('deps', {})
+        if parser_cfg.get('pseudoprojective'):
             # preprocess training data here before ArcEager.get_labels() is called
             gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
 
         for subdir in ('deps', 'ner', 'pos'):
             if subdir not in configs:
                 configs[subdir] = {}
-        configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
-        configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
+        if parser_cfg:
+            configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples)
+        if 'ner' in configs:
+            configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples)
 
         cls.setup_directory(path, **configs)
 
@@ -236,8 +239,7 @@ class Language(object):
         self.pipeline = self.Defaults.create_pipeline(self)
         yield Trainer(self, gold_tuples)
         self.end_training()
-        self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg,
-                               pos=self.tagger.cfg)
+        self.save_to_directory(path)
 
     def __init__(self, **overrides):
         if 'data_dir' in overrides and 'path' not in overrides:

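Language.train() now reads its component configs from keyword arguments, matching the pos=/deps=/ner= call added to the train command above. A minimal sketch of driving the trainer, assuming output_path, gold_tuples and n_iter are already defined and using the English subclass:

    from spacy.en import English

    with English.train(output_path, gold_tuples,
                       pos={}, deps={'pseudoprojective': True}, ner={}) as trainer:
        for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
            for doc, gold in epoch:
                trainer.update(doc, gold)
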
@@ -40,7 +40,7 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse
 
 
-USE_FTRL = False
+USE_FTRL = True
 DEBUG = False
 def set_debug(val):
     global DEBUG

@@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
     assert lemmatizer.noun(text) == set(lemmas)
 
 
+@pytest.mark.xfail
 @pytest.mark.models
 def test_tagger_lemmatizer_base_forms(lemmatizer):
     if lemmatizer is None:

@@ -3,9 +3,8 @@ from __future__ import unicode_literals
 
 from ...vocab import Vocab
 from ...tokenizer import Tokenizer
-from ...util import utf8open
+from ... import util
 
-from os import path
 import pytest
 
 
@@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 
 @pytest.mark.parametrize('file_name', ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
-    loc = path.join(path.dirname(__file__), file_name)
-    text = utf8open(loc).read()
+    loc = util.ensure_path(__file__).parent / file_name
+    text = loc.open('r', encoding='utf8').read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100

@@ -192,6 +192,8 @@ cdef class Token:
     property lemma:
         def __get__(self):
             return self.c.lemma
+        def __set__(self, int lemma):
+            self.c.lemma = lemma
 
     property pos:
         def __get__(self):
@@ -570,6 +572,8 @@ cdef class Token:
     property lemma_:
         def __get__(self):
             return self.vocab.strings[self.c.lemma]
+        def __set__(self, unicode lemma_):
+            self.c.lemma = self.vocab.strings[lemma_]
 
     property pos_:
         def __get__(self):

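The two setters make token lemmas writable from Python, with the string variant storing the hash via the vocab's string store. A quick sketch of what this enables (assumes an installed English model):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I saw geese near the lake')
    doc[2].lemma_ = u'goose'    # assign the string form
    assert doc[2].lemma == nlp.vocab.strings[u'goose']   # hash kept in sync
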
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 
-import io
 import ujson
 import re
 from pathlib import Path
@@ -21,9 +20,11 @@ def set_lang_class(name, cls):
 
 
 def get_lang_class(name):
+    if name in LANGUAGES:
+        return LANGUAGES[name]
     lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
     if lang not in LANGUAGES:
-        raise RuntimeError('Language not supported: %s' % lang)
+        raise RuntimeError('Language not supported: %s' % name)
     return LANGUAGES[lang]
 
 
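The early exit lets exact language codes win before the fallback that splits a name on its first non-alphanumeric character, so full model names still resolve. A sketch of the three paths:

    from spacy.util import get_lang_class

    get_lang_class('en')               # exact hit in LANGUAGES
    get_lang_class('en_core_web_md')   # falls back to the 'en' prefix
    get_lang_class('qq')               # raises RuntimeError, naming the input
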
@@ -46,15 +47,6 @@ def ensure_path(path):
         return path
 
 
-def or_(val1, val2):
-    if val1 is not None:
-        return val1
-    elif callable(val2):
-        return val2()
-    else:
-        return val2
-
-
 def read_regex(path):
     path = ensure_path(path)
     with path.open() as file_:
@@ -103,22 +95,28 @@ def normalize_slice(length, start, stop, step=None):
     return start, stop
 
 
-def utf8open(loc, mode='r'):
-    return io.open(loc, mode, encoding='utf8')
-
-
 def check_renamed_kwargs(renamed, kwargs):
     for old, new in renamed.items():
         if old in kwargs:
             raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
 
 
+def read_json(location):
+    with location.open('r', encoding='utf8') as f:
+        return ujson.load(f)
+
+
 def parse_package_meta(package_path, package, require=True):
+    """
+    Check if a meta.json exists in a package and return its contents as a
+    dictionary. If require is set to True, raise an error if no meta.json found.
+    """
+    # TODO: Allow passing in full model path and only require one argument
+    # instead of path and package name. This lets us avoid passing in an awkward
+    # empty string in spacy.load() if user supplies full model path.
     location = package_path / package / 'meta.json'
     if location.is_file():
-        with location.open('r', encoding='utf8') as f:
-            meta = ujson.load(f)
-            return meta
+        return read_json(location)
     elif require:
         raise IOError("Could not read meta.json from %s" % location)
     else:
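A usage sketch of the helper pair above (the model name is hypothetical):

    from spacy import util

    # Returns the meta.json contents as a dict; with require=False a
    # falsy value comes back instead of an IOError when the file is
    # missing, which is how the new spacy.load() calls it.
    meta = util.parse_package_meta(util.get_data_path(), 'en_core_web_md',
                                   require=False)
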
@@ -126,10 +124,11 @@ def parse_package_meta(package_path, package, require=True):
 
 
 def get_raw_input(description, default=False):
-    """Get user input via raw_input / input and return input value. Takes a
+    """
+    Get user input via raw_input / input and return input value. Takes a
     description for the prompt, and an optional default value that's displayed
-    with the prompt."""
-
+    with the prompt.
+    """
     additional = ' (default: {d})'.format(d=default) if default else ''
     prompt = '    {d}{a}: '.format(d=description, a=additional)
     user_input = input_(prompt)
@@ -137,9 +136,10 @@ def get_raw_input(description, default=False):
 
 
 def print_table(data, **kwargs):
-    """Print data in table format. Can either take a list of tuples or a
-    dictionary, which will be converted to a list of tuples."""
-
+    """
+    Print data in table format. Can either take a list of tuples or a
+    dictionary, which will be converted to a list of tuples.
+    """
     if type(data) == dict:
         data = list(data.items())
 
@@ -155,10 +155,11 @@ def print_table(data, **kwargs):
 
 
 def print_markdown(data, **kwargs):
-    """Print listed data in GitHub-flavoured Markdown format so it can be
+    """
+    Print listed data in GitHub-flavoured Markdown format so it can be
     copy-pasted into issues. Can either take a list of tuples or a dictionary,
-    which will be converted to a list of tuples."""
-
+    which will be converted to a list of tuples.
+    """
     def excl_value(value):
         # don't print value if it contains absolute path of directory (i.e.
         # personal info). Other conditions can be included here if necessary.
@@ -175,16 +176,16 @@ def print_markdown(data, **kwargs):
 
     if 'title' in kwargs and kwargs['title']:
         print(tpl_title.format(msg=kwargs['title']))
-
     print(tpl_msg.format(msg=markdown))
 
 
 def print_msg(*text, **kwargs):
-    """Print formatted message. Each positional argument is rendered as newline-
+    """
+    Print formatted message. Each positional argument is rendered as newline-
     separated paragraph. If kwarg 'title' exists, title is printed above the text
     and highlighted (using ANSI escape sequences manually to avoid unnecessary
-    dependency)."""
-
+    dependency).
+    """
     message = '\n\n'.join([_wrap_text(t) for t in text])
     tpl_msg = '\n{msg}\n'
     tpl_title = '\n\033[93m{msg}\033[0m'
@@ -196,9 +197,10 @@ def print_msg(*text, **kwargs):
 
 
 def _wrap_text(text):
-    """Wrap text at given width using textwrap module. Indent should consist of
-    spaces. Its length is deducted from wrap width to ensure exact wrapping."""
-
+    """
+    Wrap text at given width using textwrap module. Indent should consist of
+    spaces. Its length is deducted from wrap width to ensure exact wrapping.
+    """
     wrap_max = 80
     indent = '    '
     wrap_width = wrap_max - len(indent)
@@ -208,10 +210,11 @@ def _wrap_text(text):
 
 
 def sys_exit(*messages, **kwargs):
-    """Performs SystemExit. For modules used from the command line, like
+    """
+    Performs SystemExit. For modules used from the command line, like
     download and link. To print message, use the same arguments as for
-    print_msg()."""
-
+    print_msg().
+    """
     if messages:
         print_msg(*messages, **kwargs)
     sys.exit(0)

@@ -12,7 +12,7 @@
         "COMPANY_URL": "https://explosion.ai",
         "DEMOS_URL": "https://demos.explosion.ai",
 
-        "SPACY_VERSION": "1.7",
+        "SPACY_VERSION": "1.8",
         "LATEST_NEWS": {
             "url": "https://survey.spacy.io/",
             "title": "Take the spaCy user survey and help us improve the library!"

@@ -20,8 +20,10 @@
             "Word vectors": "word-vectors-similarities",
             "Deep learning": "deep-learning",
             "Custom tokenization": "customizing-tokenizer",
+            "Adding languages": "adding-languages",
             "Training": "training",
-            "Adding languages": "adding-languages"
+            "Training NER": "training-ner",
+            "Saving & loading": "saving-loading"
         },
         "Examples": {
             "Tutorials": "tutorials",
@@ -101,11 +103,21 @@
 
     "customizing-tokenizer": {
         "title": "Customizing the tokenizer",
-        "next": "training"
+        "next": "adding-languages"
     },
 
     "training": {
-        "title": "Training the tagger, parser and entity recognizer"
+        "title": "Training spaCy's statistical models",
+        "next": "saving-loading"
+    },
+
+    "training-ner": {
+        "title": "Training the Named Entity Recognizer",
+        "next": "saving-loading"
+    },
+
+    "saving-loading": {
+        "title": "Saving and loading models"
     },
 
     "pos-tagging": {
@@ -356,6 +368,18 @@
         },
 
         "code": {
+            "Training a new entity type": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
+            "Training an NER system from scratch": {
+                "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py",
+                "author": "Matthew Honnibal",
+                "tags": ["ner", "training"]
+            },
+
             "Information extraction": {
                 "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py",
                 "author": "Matthew Honnibal",

@@ -63,14 +63,16 @@ p
             tag_map = TAG_MAP
             stop_words = STOP_WORDS
 
-p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the #[code set_lang_class()] function, so that you can use #[code spacy.load()].
+p
+    |  Additionally, the new #[code Language] class needs to be added to the
+    |  list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py].
+    |  The languages are then registered using the #[code set_lang_class()] function.
 
 +code("spacy/__init__.py").
     from . import en
     from . import xx
 
-    set_lang_class(en.English.lang, en.English)
-    set_lang_class(xx.Xxxxx.lang, xx.Xxxxx)
+    _languages = (en.English, ..., xx.Xxxxx)
 
 p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]:

@@ -248,15 +248,17 @@ p
     +tag experimental
 
 p
-    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
-    |  from an existing model data directory. All data files are copied over,
-    |  and the meta data can be entered directly from the command line. While
-    |  this feature is still experimental, the required file templates are
-    |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
-    |  This means you need to be connected to the internet to use this command.
+    |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
+    |  from an existing model data directory. All data files are copied over.
+    |  If the path to a meta.json is supplied, or a meta.json is found in the
+    |  input directory, this file is used. Otherwise, the data can be entered
+    |  directly from the command line. While this feature is still experimental,
+    |  the required file templates are downloaded from
+    |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
+    |  you need to be connected to the internet to use this command.
 
 +code(false, "bash").
-    python -m spacy package [input_dir] [output_dir] [--force]
+    python -m spacy package [input_dir] [output_dir] [--meta] [--force]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -269,6 +271,11 @@ p
         +cell positional
         +cell Directory to create package folder in.
 
+    +row
+        +cell #[code meta]
+        +cell option
+        +cell Path to meta.json file (optional).
+
     +row
         +cell #[code --force], #[code -f]
         +cell flag

@@ -137,7 +137,7 @@ p
         return word.ent_type != 0
 
     def count_parent_verb_by_person(docs):
-        counts = defaultdict(defaultdict(int))
+        counts = defaultdict(lambda: defaultdict(int))
         for doc in docs:
             for ent in doc.ents:
                 if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:

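The one-line fix matters because defaultdict needs a zero-argument callable as its factory, and defaultdict(int) is a dict instance, not a callable. A sketch of the difference:

    from collections import defaultdict

    # Broken: raises TypeError ('first argument must be callable'),
    # because the inner defaultdict(int) is evaluated immediately.
    # counts = defaultdict(defaultdict(int))

    # Fixed: the lambda is invoked to build a fresh inner counter
    # for every missing outer key.
    counts = defaultdict(lambda: defaultdict(int))
    counts['pretend']['they'] += 1
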
@@ -235,62 +235,13 @@ p
 
 p
     |  If you've trained your own model, for example for
-    |  #[+a("/docs/usage/adding-languages") additional languages], you can
-    |  create a shortcut link for it by pointing #[code spacy.link] to the
-    |  model's data directory. To allow your model to be downloaded and
-    |  installed via pip, you'll also need to generate a package for it. You can
-    |  do this manually, or via the new
-    |  #[+a("/docs/usage/cli#package") #[code spacy package] command] that will
-    |  create all required files, and walk you through generating the meta data.
+    |  #[+a("/docs/usage/adding-languages") additional languages] or
+    |  #[+a("/docs/usage/train-ner") custom named entities], you can save its
+    |  state using the #[code Language.save_to_directory()] method. To make the
+    |  model more convenient to deploy, we recommend wrapping it as a Python
+    |  package.
 
-
-+infobox("Important note")
-    |  The model packages are #[strong not suitable] for the public
-    |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not
-    |  designed for binary data and files over 50 MB. However, if your company
-    |  is running an internal installation of pypi, publishing your models on
-    |  there can be a convenient solution to share them with your team.
-
-p The model directory should look like this:
-
-+code("Directory structure", "yaml").
-    └── /
-        ├── MANIFEST.in                   # to include meta.json
-        ├── meta.json                     # model meta data
-        ├── setup.py                      # setup file for pip installation
-        └── en_core_web_md                # model directory
-            ├── __init__.py               # init for pip installation
-            └── en_core_web_md-1.2.0      # model data
-
-p
-    |  You can find templates for all files in our
-    |  #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources].
-    |  Unless you want to customise installation and loading, the only file
-    |  you'll need to modify is #[code meta.json], which includes the model's
-    |  meta data. It will later be copied into the package and data directory.
-
-+code("meta.json", "json").
-    {
-        "name": "core_web_md",
-        "lang": "en",
-        "version": "1.2.0",
-        "spacy_version": "1.7.0",
-        "description": "English model for spaCy",
-        "author": "Explosion AI",
-        "email": "contact@explosion.ai",
-        "license": "MIT"
-    }
-
-p
-    |  Keep in mind that the directories need to be named according to the
-    |  naming conventions. The #[code lang] setting is also used to create the
-    |  respective #[code Language] class in spaCy, which will later be returned
-    |  by the model's #[code load()] method.
-
-p
-    |  To generate the package, run the following command from within the
-    |  directory. This will create a #[code .tar.gz] archive in a directory
-    |  #[code /dist].
-
-+code(false, "bash").
-    python setup.py sdist
++infobox("Saving and loading models")
+    |  For more information and a detailed guide on how to package your model,
+    |  see the documentation on
+    |  #[+a("/docs/usage/saving-loading") saving and loading models].

website/docs/usage/saving-loading.jade (new file, 108 lines)
@@ -0,0 +1,108 @@
|  | include ../../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you'll usually want to save its state, and load | ||||||
|  |     |  it back later. You can do this with the #[code Language.save_to_directory()] | ||||||
|  |     |  method: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.save_to_directory('/home/me/data/en_example_model') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The directory will be created if it doesn't exist, and the whole pipeline | ||||||
|  |     |  will be written out. To make the model more convenient to deploy, we | ||||||
|  |     |  recommend wrapping it as a Python package. | ||||||
|  | 
 | ||||||
|  | +h(2, "generating") Generating a model package | ||||||
|  | 
 | ||||||
|  | +infobox("Important note") | ||||||
|  |     |  The model packages are #[strong not suitable] for the public | ||||||
|  |     |  #[+a("https://pypi.python.org") pypi.python.org] directory, which is not | ||||||
|  |     |  designed for binary data and files over 50 MB. However, if your company | ||||||
|  |     |  is running an internal installation of pypi, publishing your models on | ||||||
|  |     |  there can be a convenient solution to share them with your team. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  spaCy comes with a handy CLI command that will create all required | ||||||
|  |     |  files and walk you through generating the meta data. You can also create | ||||||
|  |     |  the #[code meta.json] manually and place it in the model data directory, | ||||||
|  |     |  or supply a path to it using the #[code --meta] flag. For more info on | ||||||
|  |     |  this, see the #[+a("/docs/usage/cli/#package") #[code package] command] documentation. | ||||||
|  | 
 | ||||||
|  | +aside-code("meta.json", "json"). | ||||||
|  |     { | ||||||
|  |         "name": "example_model", | ||||||
|  |         "lang": "en", | ||||||
|  |         "version": "1.0.0", | ||||||
|  |         "spacy_version": ">=1.7.0,<2.0.0", | ||||||
|  |         "description": "Example model for spaCy", | ||||||
|  |         "author": "You", | ||||||
|  |         "email": "you@example.com", | ||||||
|  |         "license": "CC BY-SA 3.0" | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_example_model /home/me/my_models | ||||||
|  | 
 | ||||||
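|  | p | ||||||
|  |     |  If you've already created a #[code meta.json], you can supply its path | ||||||
|  |     |  via the #[code --meta] flag instead. For example, a sketch with a | ||||||
|  |     |  hypothetical path: | ||||||
|  | 
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_example_model /home/me/my_models --meta /home/me/data/meta.json | ||||||
|  | 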
|  | p This command will create a model package directory that should look like this: | ||||||
|  | 
 | ||||||
|  | +code("Directory structure", "yaml"). | ||||||
|  |     └── / | ||||||
|  |         ├── MANIFEST.in                   # to include meta.json | ||||||
|  |         ├── meta.json                     # model meta data | ||||||
|  |         ├── setup.py                      # setup file for pip installation | ||||||
|  |         └── en_example_model              # model directory | ||||||
|  |             ├── __init__.py               # init for pip installation | ||||||
|  |             └── en_example_model-1.0.0    # model data | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You can also find templates for all files in our | ||||||
|  |     |  #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. | ||||||
|  |     |  If you're creating the package manually, keep in mind that the directories | ||||||
|  |     |  need to be named according to the naming conventions of | ||||||
|  |     |  #[code [language]_[type]] and #[code [language]_[type]-[version]]. The | ||||||
|  |     |  #[code lang] setting in the meta.json is also used to create the | ||||||
|  |     |  respective #[code Language] class in spaCy, which will later be returned | ||||||
|  |     |  by the model's #[code load()] method. | ||||||
|  | 
 | ||||||
|  | +h(2, "building") Building a model package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To build the package, run the following command from within the | ||||||
|  |     |  directory. This will create a #[code .tar.gz] archive in a directory | ||||||
|  |     |  #[code /dist]. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python setup.py sdist | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  For more information on building Python packages, see the | ||||||
|  |     |  #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +h(2, "loading") Loading a model package | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Model packages can be installed by pointing pip to the model's | ||||||
|  |     |  #[code .tar.gz] archive: | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     pip install /path/to/en_example_model-1.0.0.tar.gz | ||||||
|  | 
 | ||||||
|  | p You'll then be able to load the model as follows: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import en_example_model | ||||||
|  |     nlp = en_example_model.load() | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To load the model via #[code spacy.load()], you can also | ||||||
|  |     |  create a #[+a("/docs/usage/models#usage") shortcut link] that maps the | ||||||
|  |     |  package name to a custom model name of your choice: | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy link en_example_model example | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import spacy | ||||||
|  |     nlp = spacy.load('example') | ||||||
website/docs/usage/training-ner.jade (new file, 174 lines)
|  | @ -0,0 +1,174 @@ | ||||||
|  | include ../../_includes/_mixins | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All #[+a("/docs/usage/models") spaCy models] support online learning, so | ||||||
|  |     |  you can update a pre-trained model with new examples. You can even add | ||||||
|  |     |  new classes to an existing model, to recognise a new entity type, | ||||||
|  |     |  part-of-speech, or syntactic relation. Updating an existing model is | ||||||
|  |     |  particularly useful as a "quick and dirty solution", if you have only a | ||||||
|  |     |  few corrections or annotations. | ||||||
|  | 
 | ||||||
|  | +h(2, "improving-accuracy") Improving accuracy on existing entity types | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To update the model, you first need to create an instance of | ||||||
|  |     |  #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels | ||||||
|  |     |  you want to learn. You will then pass this instance to the | ||||||
|  |     |  #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]] | ||||||
|  |     |  method. For example: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import spacy | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  | 
|  |     nlp = spacy.load('en') | ||||||
|  |     doc = nlp.make_doc(u'Facebook released React in 2014') | ||||||
|  |     gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) | ||||||
|  |     nlp.tagger(doc)  # apply the tagger first, as in the full example below | ||||||
|  |     nlp.entity.update(doc, gold) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You'll usually need to provide many examples to meaningfully improve the | ||||||
|  |     |  system — a few hundred is a good start, although more is better. You | ||||||
|  |     |  should avoid iterating over the same few examples multiple times, or the | ||||||
|  |     |  model is likely to "forget" how to annotate other examples. If you | ||||||
|  |     |  iterate over the same few examples, you're effectively changing the loss | ||||||
|  |     |  function. The optimizer will find a way to minimize the loss on your | ||||||
|  |     |  examples, without regard for the consequences on the examples it's no | ||||||
|  |     |  longer paying attention to. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  One way to avoid this "catastrophic forgetting" problem is to "remind" | ||||||
|  |     |  the model of other examples by augmenting your annotations with sentences | ||||||
|  |     |  annotated with entities automatically recognised by the original model. | ||||||
|  |     |  Ultimately, this is an empirical process: you'll need to | ||||||
|  |     |  #[strong experiment on your own data] to find a solution that works best | ||||||
|  |     |  for you. | ||||||
|  | 
 | ||||||
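|  | p | ||||||
|  |     |  A minimal sketch of this idea, assuming spaCy v1.x and a hypothetical | ||||||
|  |     |  list of unlabelled #[code extra_texts]: | ||||||
|  | 
|  | +code. | ||||||
|  |     def make_revision_data(nlp, extra_texts): | ||||||
|  |         # hypothetical helper: annotate unlabelled text with the original | ||||||
|  |         # model's own predictions, to "remind" it of other entity types | ||||||
|  |         revision_data = [] | ||||||
|  |         for text in extra_texts: | ||||||
|  |             doc = nlp(text) | ||||||
|  |             entities = [(ent.start_char, ent.end_char, ent.label_) | ||||||
|  |                         for ent in doc.ents] | ||||||
|  |             revision_data.append((text, entities)) | ||||||
|  |         return revision_data | ||||||
|  | 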
|  | +h(2, "adding") Adding a new entity type | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  You can add new entity types to an existing model. Let's say we want to | ||||||
|  |     |  recognise the category #[code TECHNOLOGY]. The new category will include | ||||||
|  |     |  programming languages, frameworks and platforms. First, we need to | ||||||
|  |     |  register the new entity type: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.entity.add_label('TECHNOLOGY') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Next, iterate over your examples, calling #[code entity.update()]. As | ||||||
|  |     |  above, we want to avoid iterating over only a small number of sentences. | ||||||
|  |     |  A useful compromise is to run the model over a number of plain-text | ||||||
|  |     |  sentences, and pass the entities to #[code GoldParse], as "true" | ||||||
|  |     |  annotations. This encourages the optimizer to find a solution that | ||||||
|  |     |  predicts the new category with minimal difference from the previous | ||||||
|  |     |  output. | ||||||
|  | 
 | ||||||
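|  | p | ||||||
|  |     |  A short sketch of this update loop, assuming #[code train_data] holds | ||||||
|  |     |  #[code (text, entity_offsets)] pairs for the new category and | ||||||
|  |     |  #[code revision_data] holds pairs predicted by the original model, as | ||||||
|  |     |  sketched above: | ||||||
|  | 
|  | +code. | ||||||
|  |     import random | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  | 
|  |     # mix the new annotations with the model's own predictions | ||||||
|  |     examples = train_data + revision_data | ||||||
|  |     for itn in range(20): | ||||||
|  |         random.shuffle(examples) | ||||||
|  |         for raw_text, entity_offsets in examples: | ||||||
|  |             doc = nlp.make_doc(raw_text) | ||||||
|  |             gold = GoldParse(doc, entities=entity_offsets) | ||||||
|  |             nlp.tagger(doc) | ||||||
|  |             nlp.entity.update(doc, gold) | ||||||
|  | 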
|  | +h(2, "saving-loading") Saving and loading | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you'll usually want to save its state, and load | ||||||
|  |     |  it back later. You can do this with the #[code Language.save_to_directory()] | ||||||
|  |     |  method: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     nlp.save_to_directory('/home/me/data/en_technology') | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To make the model more convenient to deploy, we recommend wrapping it as | ||||||
|  |     |  a Python package, so that you can install it via pip and load it as a | ||||||
|  |     |  module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] | ||||||
|  |     |  to create all required files and directories. | ||||||
|  | 
 | ||||||
|  | +code(false, "bash"). | ||||||
|  |     python -m spacy package /home/me/data/en_technology /home/me/my_models | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  To build the package and create a #[code .tar.gz] archive, run | ||||||
|  |     |  #[code python setup.py sdist] from within its directory. | ||||||
|  | 
 | ||||||
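|  | p | ||||||
|  |     |  For example, assuming the package was written to the hypothetical | ||||||
|  |     |  directory below: | ||||||
|  | 
|  | +code(false, "bash"). | ||||||
|  |     cd /home/me/my_models/en_technology-1.0.0 | ||||||
|  |     python setup.py sdist | ||||||
|  | 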
|  | +infobox("Saving and loading models") | ||||||
|  |     |  For more information and a detailed guide on how to package your model, | ||||||
|  |     |  see the documentation on | ||||||
|  |     |  #[+a("/docs/usage/saving-loading") saving and loading models]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After you've generated and installed the package, you'll be able to | ||||||
|  |     |  load the model as follows: | ||||||
|  | 
 | ||||||
|  | +code. | ||||||
|  |     import en_technology | ||||||
|  |     nlp = en_technology.load() | ||||||
|  | 
 | ||||||
|  | +h(2, "example") Example: Adding and training an #[code ANIMAL] entity | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This script shows how to add a new entity type to an existing pre-trained | ||||||
|  |     |  NER model. To keep the example short and simple, only four sentences are | ||||||
|  |     |  provided as examples. In practice, you'll need many more — | ||||||
|  |     |  #[strong a few hundred] would be a good start. You will also likely need | ||||||
|  |     |  to mix in #[strong examples of other entity types], which might be | ||||||
|  |     |  obtained by running the entity recognizer over unlabelled sentences, and | ||||||
|  |     |  adding their annotations to the training set. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  For the full, runnable script of this example, see | ||||||
|  |     |  #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py]. | ||||||
|  | 
 | ||||||
|  | +code("Training the entity recognizer"). | ||||||
|  |     import spacy | ||||||
|  |     from spacy.pipeline import EntityRecognizer | ||||||
|  |     from spacy.gold import GoldParse | ||||||
|  |     from spacy.tagger import Tagger | ||||||
|  |     import random | ||||||
|  | 
 | ||||||
|  |     model_name = 'en' | ||||||
|  |     entity_label = 'ANIMAL' | ||||||
|  |     output_directory = '/path/to/model' | ||||||
|  |     train_data = [ | ||||||
|  |         ("Horses are too tall and they pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("horses are too tall and they pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("horses pretend to care about your feelings", | ||||||
|  |         [(0, 6, 'ANIMAL')]), | ||||||
|  |         ("they pretend to care about your feelings, those horses", | ||||||
|  |         [(48, 54, 'ANIMAL')]) | ||||||
|  |     ] | ||||||
|  | 
 | ||||||
|  |     nlp = spacy.load(model_name) | ||||||
|  |     nlp.entity.add_label(entity_label) | ||||||
|  |     ner = train_ner(nlp, train_data, output_directory) | ||||||
|  | 
 | ||||||
|  |     def train_ner(nlp, train_data, output_dir): | ||||||
|  |         # Add new words to vocab | ||||||
|  |         for raw_text, _ in train_data: | ||||||
|  |             doc = nlp.make_doc(raw_text) | ||||||
|  |             for word in doc: | ||||||
|  |                 _ = nlp.vocab[word.orth] | ||||||
|  | 
 | ||||||
|  |         for itn in range(20): | ||||||
|  |             random.shuffle(train_data) | ||||||
|  |             for raw_text, entity_offsets in train_data: | ||||||
|  |                 gold = GoldParse(doc, entities=entity_offsets) | ||||||
|  |                 doc = nlp.make_doc(raw_text) | ||||||
|  |                 nlp.tagger(doc) | ||||||
|  |                 loss = nlp.entity.update(doc, gold) | ||||||
|  |         nlp.end_training() | ||||||
|  |         nlp.save_to_directory(output_dir) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  The actual training is performed by looping over the examples, and | ||||||
|  |     |  calling #[code nlp.entity.update()]. The #[code update()] method steps | ||||||
|  |     |  through the words of the input. At each word, it makes a prediction. It | ||||||
|  |     |  then consults the annotations provided on the #[code GoldParse] instance, | ||||||
|  |     |  to see whether it was right. If it was wrong, it adjusts its weights so | ||||||
|  |     |  that the correct action will score higher next time. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  After training your model, you can | ||||||
|  |     |  #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping | ||||||
|  |     |  models as Python packages, for ease of deployment. | ||||||
|  | @ -1,13 +1,10 @@ | ||||||
| include ../../_includes/_mixins | include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  This tutorial describes how to train new statistical models for spaCy's |     |  This workflow describes how to train new statistical models for spaCy's | ||||||
|     |  part-of-speech tagger, named entity recognizer and dependency parser. |     |  part-of-speech tagger, named entity recognizer and dependency parser. | ||||||
| 
 |     |  Once the model is trained, you can then | ||||||
| p |     |  #[+a("/docs/usage/saving-loading") save and load] it. | ||||||
|     |  I'll start with some quick code examples, that describe how to train |  | ||||||
|     |  each model. I'll then provide a bit of background about the algorithms, |  | ||||||
|     |  and explain how the data and feature templates work. |  | ||||||
| 
 | 
 | ||||||
| +h(2, "train-pos-tagger") Training the part-of-speech tagger | +h(2, "train-pos-tagger") Training the part-of-speech tagger | ||||||
| 
 | 
 | ||||||
|  | @ -48,7 +45,21 @@ p | ||||||
| p | p | ||||||
|     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example |     +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example | ||||||
| 
 | 
 | ||||||
| +h(2, "train-entity") Training the dependency parser | +h(2, "extend-entity") Extending the named entity recognizer | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  All #[+a("/docs/usage/models") spaCy models] support online learning, so | ||||||
|  |     |  you can update a pre-trained model with new examples. You can even add | ||||||
|  |     |  new classes to an existing model, to recognise a new entity type, | ||||||
|  |     |  part-of-speech, or syntactic relation. Updating an existing model is | ||||||
|  |     |  particularly useful as a "quick and dirty solution", if you have only a | ||||||
|  |     |  few corrections or annotations. | ||||||
|  | 
 | ||||||
|  | p.o-inline-list | ||||||
|  |     +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example | ||||||
|  |     +button("/docs/usage/training-ner", false, "secondary") Usage Workflow | ||||||
|  | 
 | ||||||
|  | +h(2, "train-dependency") Training the dependency parser | ||||||
| 
 | 
 | ||||||
| +code. | +code. | ||||||
|     from spacy.vocab import Vocab |     from spacy.vocab import Vocab | ||||||
|  | @ -67,7 +78,7 @@ p | ||||||
| p | p | ||||||
|     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example |     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example | ||||||
| 
 | 
 | ||||||
| +h(2, 'feature-templates') Customizing the feature extraction | +h(2, "feature-templates") Customizing the feature extraction | ||||||
| 
 | 
 | ||||||
| p | p | ||||||
|     |  spaCy currently uses linear models for the tagger, parser and entity |     |  spaCy currently uses linear models for the tagger, parser and entity | ||||||
|  |  | ||||||