From 31fa73293a260a764fda000f72a167a56eab6330 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:03:28 +0200 Subject: [PATCH 01/43] Move read_json out to own util function --- spacy/util.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 219009f17..5fd9d563b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -113,12 +113,15 @@ def check_renamed_kwargs(renamed, kwargs): raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) +def read_json(location): + with location.open('r', encoding='utf8') as f: + return ujson.load(f) + + def parse_package_meta(package_path, package, require=True): location = package_path / package / 'meta.json' if location.is_file(): - with location.open('r', encoding='utf8') as f: - meta = ujson.load(f) - return meta + return read_json(location) elif require: raise IOError("Could not read meta.json from %s" % location) else: From 13c8a42d2b7f4ef630b33ede0239a70a1c0140bc Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:03:58 +0200 Subject: [PATCH 02/43] Fix typos --- spacy/cli/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index fb0de2cc5..d85f1a92a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -47,9 +47,9 @@ def check_dirs(input_path, output_path): def create_dirs(package_path, force): if package_path.exists(): if force: - shutil.rmtree(unicode_(package_path.as_posix)) + shutil.rmtree(unicode_(package_path)) else: - util.sys_exit(unicode_(package_path.as_posix), + util.sys_exit(unicode_(package_path), "Please delete the directory and try again.", title="Package directory already exists") Path.mkdir(package_path, parents=True) From a7574b75728174c7fdab6c74a8f18f0c024489e4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:06:02 +0200 Subject: [PATCH 03/43] Add more options to read in meta data in package command Add meta option to supply path 
to meta.json. If no meta path is set, check if meta.json exists in input directory and use it. Otherwise, prompt for details on the command line. --- spacy/__main__.py | 5 +++-- spacy/cli/package.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 8d511d823..4c065a7e6 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -63,15 +63,16 @@ class CLI(object): @plac.annotations( input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), + meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) - def package(self, input_dir, output_dir, force=False): + def package(self, input_dir, output_dir, meta=None, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ - cli_package(input_dir, output_dir, force) + cli_package(input_dir, output_dir, meta, force) @plac.annotations( diff --git a/spacy/cli/package.py b/spacy/cli/package.py index d85f1a92a..26ce01a18 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -9,15 +9,22 @@ from ..compat import unicode_, json_dumps from .. 
import util -def package(input_dir, output_dir, force): +def package(input_dir, output_dir, meta_path, force): input_path = Path(input_dir) output_path = Path(output_dir) - check_dirs(input_path, output_path) + meta_path = util.ensure_path(meta_path) + check_dirs(input_path, output_path, meta_path) template_setup = get_template('setup.py') template_manifest = get_template('MANIFEST.in') template_init = get_template('en_model_name/__init__.py') - meta = generate_meta() + + meta_path = meta_path or input_path / 'meta.json' + if meta_path.is_file(): + util.print_msg(unicode_(meta_path), title="Reading meta.json from file") + meta = util.read_json(meta_path) + else: + meta = generate_meta() model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -37,11 +44,13 @@ def package(input_dir, output_dir, force): title="Successfully created package {p}".format(p=model_name_v)) -def check_dirs(input_path, output_path): +def check_dirs(input_path, output_path, meta_path): if not input_path.exists(): util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found") if not output_path.exists(): util.sys_exit(unicode_(output_path), title="Output directory not found") + if meta_path and not meta_path.exists(): + util.sys_exit(unicode_(meta_path), title="meta.json not found") def create_dirs(package_path, force): From e3de035814fa5b5bd2561bbbadc037deb8c15298 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:13:17 +0200 Subject: [PATCH 04/43] Add meta validation to check for required settings Complain if no "lang", "name" or "version" is found (those settings are used in directory / package names). Package will still build without, but it'll inevitably fail somewhere down the line. 
--- spacy/cli/package.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 26ce01a18..1abc36837 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -26,6 +26,7 @@ def package(input_dir, output_dir, meta_path, force): else: meta = generate_meta() + validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] main_path = output_path / model_name_v @@ -89,6 +90,14 @@ def generate_meta(): return meta +def validate_meta(meta, keys): + for key in keys: + if key not in meta or meta[key] == '': + util.sys_exit( + "This setting is required to build your package.", + title='No "{k}" setting found in meta.json'.format(k=key)) + + def get_template(filepath): url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/' r = requests.get(url + filepath) From a3ddbc0444c1f9ec49e59d8c58b1bb06fe595e1a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:14:36 +0200 Subject: [PATCH 05/43] Add note about --force flag to error message --- spacy/cli/package.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1abc36837..102b07472 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -60,7 +60,8 @@ def create_dirs(package_path, force): shutil.rmtree(unicode_(package_path)) else: util.sys_exit(unicode_(package_path), - "Please delete the directory and try again.", + "Please delete the directory and try again, or use the --force " + "flag to overwrite existing directories.", title="Package directory already exists") Path.mkdir(package_path, parents=True) From 8191e33cf1a59c20262d795b012fd7077f78f0e4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:32:31 +0200 Subject: [PATCH 06/43] Update link error message with info on permissions --- spacy/cli/link.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 
deletions(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 9abb7bfb4..781adda2c 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -49,9 +49,12 @@ def symlink(model_path, link_name, force): # This is quite dirty, but just making sure other errors are caught so # users at least see a proper message. util.sys_exit( - "Creating a symlink in spacy/data failed. You can still import " - "the model as a Python package and call its load() method, or " - "create the symlink manually:", + "Creating a symlink in spacy/data failed. Make sure you have the " + "required permissions and try re-running the command as admin, or " + "use a virtualenv to install spaCy in a user directory, instead of " + "doing a system installation.", + "You can still import the model as a Python package and call its " + "load() method, or create the symlink manually:", "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)), title="Error: Couldn't link model to '{l}'".format(l=link_name)) From d29c825ca4af79c1da9e7140dc3730fa6dcc4383 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:37:24 +0200 Subject: [PATCH 07/43] Update docs for package command --- website/docs/usage/cli.jade | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index ebd034bb8..5ad8a214d 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -249,14 +249,16 @@ p p | Generate a #[+a("/docs/usage/models#own-models") model Python package] - | from an existing model data directory. All data files are copied over, - | and the meta data can be entered directly from the command line. While - | this feature is still experimental, the required file templates are - | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. - | This means you need to be connected to the internet to use this command. + | from an existing model data directory. 
All data files are copied over. + | If the path to a meta.json is supplied, or a meta.json is found in the + | input directory, this file is used. Otherwise, the data can be entered + | directly from the command line. While this feature is still experimental, + | the required file templates are downloaded from + | #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means + | you need to be connected to the internet to use this command. +code(false, "bash"). - python -m spacy package [input_dir] [output_dir] [--force] + python -m spacy package [input_dir] [output_dir] [--meta] [--force] +table(["Argument", "Type", "Description"]) +row @@ -269,6 +271,11 @@ p +cell positional +cell Directory to create package folder in. + +row + +cell #[code --meta], #[code -m] + +cell option + +cell Path to meta.json file (optional). + +row +cell #[code --force], #[code -f] +cell flag From d10bd0eaf9a1ce355dca11826edd0b9208c13523 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:42:34 +0200 Subject: [PATCH 08/43] Fix formatting --- spacy/util.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 5fd9d563b..f8af8baa3 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -129,10 +129,11 @@ def parse_package_meta(package_path, package, require=True): def get_raw_input(description, default=False): - """Get user input via raw_input / input and return input value. Takes a + """ + Get user input via raw_input / input and return input value. Takes a description for the prompt, and an optional default value that's displayed - with the prompt.""" - + with the prompt. + """ additional = ' (default: {d})'.format(d=default) if default else '' prompt = ' {d}{a}: '.format(d=description, a=additional) user_input = input_(prompt) @@ -140,9 +141,10 @@ def get_raw_input(description, default=False): def print_table(data, **kwargs): - """Print data in table format.
Can either take a list of tuples or a - dictionary, which will be converted to a list of tuples.""" - + """ + Print data in table format. Can either take a list of tuples or a + dictionary, which will be converted to a list of tuples. + """ if type(data) == dict: data = list(data.items()) @@ -158,10 +160,11 @@ def print_table(data, **kwargs): def print_markdown(data, **kwargs): - """Print listed data in GitHub-flavoured Markdown format so it can be + """ + Print listed data in GitHub-flavoured Markdown format so it can be copy-pasted into issues. Can either take a list of tuples or a dictionary, - which will be converted to a list of tuples.""" - + which will be converted to a list of tuples. + """ def excl_value(value): # don't print value if it contains absolute path of directory (i.e. # personal info). Other conditions can be included here if necessary. @@ -178,16 +181,16 @@ def print_markdown(data, **kwargs): if 'title' in kwargs and kwargs['title']: print(tpl_title.format(msg=kwargs['title'])) - print(tpl_msg.format(msg=markdown)) def print_msg(*text, **kwargs): - """Print formatted message. Each positional argument is rendered as newline- + """ + Print formatted message. Each positional argument is rendered as newline- separated paragraph. If kwarg 'title' exist, title is printed above the text and highlighted (using ANSI escape sequences manually to avoid unnecessary - dependency).""" - + dependency). + """ message = '\n\n'.join([_wrap_text(t) for t in text]) tpl_msg = '\n{msg}\n' tpl_title = '\n\033[93m{msg}\033[0m' @@ -199,9 +202,10 @@ def print_msg(*text, **kwargs): def _wrap_text(text): - """Wrap text at given width using textwrap module. Indent should consist of - spaces. Its length is deducted from wrap width to ensure exact wrapping.""" - + """ + Wrap text at given width using textwrap module. Indent should consist of + spaces. Its length is deducted from wrap width to ensure exact wrapping. 
+ """ wrap_max = 80 indent = ' ' wrap_width = wrap_max - len(indent) @@ -211,10 +215,11 @@ def _wrap_text(text): def sys_exit(*messages, **kwargs): - """Performs SystemExit. For modules used from the command line, like + """ + Performs SystemExit. For modules used from the command line, like download and link. To print message, use the same arguments as for - print_msg().""" - + print_msg(). + """ if messages: print_msg(*messages, **kwargs) sys.exit(0) From c7adca58a9c85423d97f859c06b6c92e8aee35ab Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 16:55:01 +0200 Subject: [PATCH 09/43] Tidy up example and only save/test if output_directory is not None --- examples/training/train_new_entity_type.py | 31 ++++++++++------------ 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index af98ef583..cbe2963d3 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,22 +1,16 @@ from __future__ import unicode_literals, print_function -import json -import pathlib + import random +from pathlib import Path import spacy from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse from spacy.tagger import Tagger - -try: - unicode -except: - unicode = str - def train_ner(nlp, train_data, output_dir): - # Add new words to vocab. 
+ # Add new words to vocab for raw_text, _ in train_data: doc = nlp.make_doc(raw_text) for word in doc: @@ -30,11 +24,14 @@ def train_ner(nlp, train_data, output_dir): nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() - nlp.save_to_directory(output_dir) + if output_dir: + nlp.save_to_directory(output_dir) def main(model_name, output_directory=None): nlp = spacy.load(model_name) + if output_directory is not None: + output_directory = Path(output_directory) train_data = [ ( @@ -55,18 +52,18 @@ def main(model_name, output_directory=None): ) ] nlp.entity.add_label('ANIMAL') - if output_directory is not None: - output_directory = pathlib.Path(output_directory) ner = train_ner(nlp, train_data, output_directory) + # Test that the entity is recognized doc = nlp('Do you like horses?') for ent in doc.ents: print(ent.label_, ent.text) - nlp2 = spacy.load('en', path=output_directory) - nlp2.entity.add_label('ANIMAL') - doc2 = nlp2('Do you like horses?') - for ent in doc2.ents: - print(ent.label_, ent.text) + if output_directory: + nlp2 = spacy.load('en', path=output_directory) + nlp2.entity.add_label('ANIMAL') + doc2 = nlp2('Do you like horses?') + for ent in doc2.ents: + print(ent.label_, ent.text) if __name__ == '__main__': From 137b210bcfbb5eced0fd44d56ba9d5cf515b89f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 18:02:42 +0200 Subject: [PATCH 10/43] Restore use of FTRL training --- spacy/syntax/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4473045e9..0bc9cb4ef 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -40,7 +40,7 @@ from ..strings cimport StringStore from ..gold cimport GoldParse -USE_FTRL = False +USE_FTRL = True DEBUG = False def set_debug(val): global DEBUG From 6a4221a6decd92d7656c139f4b297095f3ab4158 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 18:07:53 +0200 Subject: [PATCH 11/43] 
Allow lemma to be set from Python. Re #973 --- spacy/tokens/token.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 94491614c..f146f5cd6 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -192,6 +192,8 @@ cdef class Token: property lemma: def __get__(self): return self.c.lemma + def __set__(self, int lemma): + self.c.lemma = lemma property pos: def __get__(self): @@ -570,6 +572,8 @@ cdef class Token: property lemma_: def __get__(self): return self.vocab.strings[self.c.lemma] + def __set__(self, unicode lemma_): + self.c.lemma = self.vocab.strings[lemma_] property pos_: def __get__(self): From 89a4f262fc7ec88b6deae3e65ca75d5c138a7352 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 13:00:37 -0500 Subject: [PATCH 12/43] Fix training methods --- spacy/cli/train.py | 13 +++++++------ spacy/gold.pyx | 6 +++--- spacy/language.py | 9 ++++++--- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 489430634..3900c7f39 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals, division, print_function import json -from pathlib import Path +from ..util import ensure_path from ..scorer import Scorer from ..gold import GoldParse, merge_sents from ..gold import read_json_file as read_gold_json @@ -12,9 +12,9 @@ from .. 
import util def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner, parser_L1): - output_path = Path(output_dir) - train_path = Path(train_data) - dev_path = Path(dev_data) + output_path = ensure_path(output_dir) + train_path = ensure_path(train_data) + dev_path = ensure_path(dev_data) check_dirs(output_path, train_path, dev_path) lang = util.get_lang_class(language) @@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ne def train_config(config): - config_path = Path(config) + config_path = ensure_path(config) if not config_path.is_file(): util.sys_exit(config_path.as_posix(), title="Config file not found") config = json.load(config_path) @@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ entity_cfg, n_iter): print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") - with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: + with Language.train(output_path, train_data, + pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer: for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 425ad0fe0..1e55075c7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function import io import re import ujson -from pathlib import Path from .syntax import nonproj +from .util import ensure_path def tags_to_entities(tags): @@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words): def read_json_file(loc, docs_filter=None): - loc = Path(loc) + loc = ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): yield from read_json_file(loc / filename) else: - with io.open(loc, 'r', encoding='utf8') as file_: + with loc.open('r', encoding='utf8') as file_: docs = ujson.load(file_) for doc in docs: if docs_filter is not None and not 
docs_filter(doc): diff --git a/spacy/language.py b/spacy/language.py index 4b6c3397d..47408921c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -204,15 +204,18 @@ class Language(object): @classmethod @contextmanager def train(cls, path, gold_tuples, **configs): - if parser_cfg['pseudoprojective']: + parser_cfg = configs.get('deps', {}) + if parser_cfg.get('pseudoprojective'): # preprocess training data here before ArcEager.get_labels() is called gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) for subdir in ('deps', 'ner', 'pos'): if subdir not in configs: configs[subdir] = {} - configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) - configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) + if parser_cfg: + configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) + if 'ner' in configs: + configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) cls.setup_directory(path, **configs) From 90cf6b9429c336663a24b609da02368cd066217f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 19:32:21 +0200 Subject: [PATCH 13/43] Add pytest to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f1f26171b..6212ab3cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 +pytest>=3.0.6,<4.0.0 From 0084466a6695dea92a4b42f9a24b65682d443db0 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 19:51:29 +0200 Subject: [PATCH 14/43] Remove unused utf8open util and replace os.path with ensure_path --- spacy/tests/tokenizer/test_tokenizer.py | 7 +++---- spacy/util.py | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 22afa1f43..da79b43a8 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ 
b/spacy/tests/tokenizer/test_tokenizer.py @@ -3,9 +3,8 @@ from __future__ import unicode_literals from ...vocab import Vocab from ...tokenizer import Tokenizer -from ...util import utf8open +from ... import util -from os import path import pytest @@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize('file_name', ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): - loc = path.join(path.dirname(__file__), file_name) - text = utf8open(loc).read() + loc = util.ensure_path(__file__).parent / file_name + text = loc.open('r', encoding='utf8').read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 diff --git a/spacy/util.py b/spacy/util.py index f8af8baa3..3318725ec 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -103,10 +103,6 @@ def normalize_slice(length, start, stop, step=None): return start, stop -def utf8open(loc, mode='r'): - return io.open(loc, mode, encoding='utf8') - - def check_renamed_kwargs(renamed, kwargs): for old, new in renamed.items(): if old in kwargs: From ed7e19ad685c3e7c615feb39793cd318f1e6580c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 19:54:16 +0200 Subject: [PATCH 15/43] Remove unused import --- spacy/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 3318725ec..24c0d74f1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals, print_function -import io import ujson import re from pathlib import Path From d3759dfb3224a78a93663f1a74b810e4323be86c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:34:37 +0200 Subject: [PATCH 16/43] Fix docstring --- spacy/deprecated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index f481a2502..65053089a 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -107,7 +107,7 @@ def 
fix_glove_vectors_loading(overrides): def resolve_model_name(name): """ If spaCy is loaded with 'de', check if symlink already exists. If - not, user have upgraded from older version and have old models installed. + not, user may have upgraded from older version and have old models installed. Check if old model directory exists and if so, return that instead and create shortcut link. If English model is found and no shortcut exists, raise error and tell user to install new model. From 7670c745b6370fafd29ccedcdcba875156ccd576 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:00 +0200 Subject: [PATCH 17/43] Update spacy.load() and fix path checks --- spacy/__init__.py | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index bc668121f..06e9374ea 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,39 +1,38 @@ # coding: utf8 from __future__ import unicode_literals -from pathlib import Path - -from .util import set_lang_class, get_lang_class, parse_package_meta +from . import util from .deprecated import resolve_model_name from .cli import info from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he -set_lang_class(en.English.lang, en.English) -set_lang_class(de.German.lang, de.German) -set_lang_class(es.Spanish.lang, es.Spanish) -set_lang_class(pt.Portuguese.lang, pt.Portuguese) -set_lang_class(fr.French.lang, fr.French) -set_lang_class(it.Italian.lang, it.Italian) -set_lang_class(hu.Hungarian.lang, hu.Hungarian) -set_lang_class(zh.Chinese.lang, zh.Chinese) -set_lang_class(nl.Dutch.lang, nl.Dutch) -set_lang_class(sv.Swedish.lang, sv.Swedish) -set_lang_class(fi.Finnish.lang, fi.Finnish) -set_lang_class(bn.Bengali.lang, bn.Bengali) -set_lang_class(he.Hebrew.lang, he.Hebrew) +_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, + it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, + fi.Finnish, bn.Bengali, he.Hebrew) + + +for _lang in _languages: + util.set_lang_class(_lang.lang, _lang) def load(name, **overrides): - data_path = overrides.get('path', util.get_data_path()) - model_name = resolve_model_name(name) - meta = parse_package_meta(data_path, model_name, require=False) + if overrides.get('path') in (None, False, True): + data_path = util.get_data_path() + model_name = resolve_model_name(name) + model_path = data_path / model_name + if not model_path.exists(): + model_path = None + util.print_msg( + "Only loading the '{}' tokenizer.".format(name), + title="Warning: no model found for '{}'".format(name)) + else: + model_path = util.ensure_path(overrides['path']) + data_path = model_path.parent + meta = util.parse_package_meta(data_path, model_name, require=False) lang = meta['lang'] if meta and 'lang' in meta else name - cls = get_lang_class(lang) + cls = util.get_lang_class(lang) overrides['meta'] = meta - model_path = Path(data_path / model_name) - if model_path.exists(): - overrides['path'] = model_path - + overrides['path'] = model_path return cls(**overrides) From 1f9f867c70fc76ae1c0b307e38997c17e084c051 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 
20:35:08 +0200 Subject: [PATCH 18/43] Remove unused util function --- spacy/util.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 24c0d74f1..573489682 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -45,15 +45,6 @@ def ensure_path(path): return path -def or_(val1, val2): - if val1 is not None: - return val1 - elif callable(val2): - return val2() - else: - return val2 - - def read_regex(path): path = ensure_path(path) with path.open() as file_: From 5cb17b9f33ae3803a8fc25305f71153acf71374f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:47 +0200 Subject: [PATCH 19/43] Add NER training docs --- website/docs/usage/_data.json | 5 + website/docs/usage/training-ner.jade | 174 +++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 website/docs/usage/training-ner.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index c8c85af1d..f81fb245f 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -22,6 +22,7 @@ "Custom tokenization": "customizing-tokenizer", "Training": "training", "Adding languages": "adding-languages" + "Training NER": "training-ner", }, "Examples": { "Tutorials": "tutorials", @@ -106,6 +107,10 @@ "training": { "title": "Training the tagger, parser and entity recognizer" + "training-ner": { + "title": "Training the Named Entity Recognizer", + "next": "saving-loading" + }, }, "pos-tagging": { diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade new file mode 100644 index 000000000..78eb4905e --- /dev/null +++ b/website/docs/usage/training-ner.jade @@ -0,0 +1,174 @@ +include ../../_includes/_mixins + +p + | All #[+a("/docs/usage/models") spaCy models] support online learning, so + | you can update a pre-trained model with new examples. You can even add + | new classes to an existing model, to recognise a new entity type, + | part-of-speech, or syntactic relation. 
Updating an existing model is + | particularly useful as a "quick and dirty solution", if you have only a + | few corrections or annotations. + ++h(2, "improving-accuracy") Improving accuracy on existing entity types + +p + | To update the model, you first need to create an instance of + | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels + | you want to learn. You will then pass this instance to the + | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]] + | method. For example: + ++code. + import spacy + from spacy.gold import GoldParse + + nlp = spacy.load('en') + doc = nlp.make_doc(u'Facebook released React in 2014') + gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) + nlp.entity.update(doc, gold) + +p + | You'll usually need to provide many examples to meaningfully improve the + | system — a few hundred is a good start, although more is better. You + | should avoid iterating over the same few examples multiple times, or the + | model is likely to "forget" how to annotate other examples. If you + | iterate over the same few examples, you're effectively changing the loss + | function. The optimizer will find a way to minimize the loss on your + | examples, without regard for the consequences on the examples it's no + | longer paying attention to. + +p + | One way to avoid this "catastrophic forgetting" problem is to "remind" + | the model of other examples by augmenting your annotations with sentences + | annotated with entities automatically recognised by the original model. + | Ultimately, this is an empirical process: you'll need to + | #[strong experiment on your own data] to find a solution that works best + | for you. + ++h(2, "adding") Adding a new entity type + +p + | You can add new entity types to an existing model. Let's say we want to + | recognise the category #[code TECHNOLOGY]. The new category will include + | programming languages, frameworks and platforms. 
First, we need to + | register the new entity type: + ++code. + nlp.entity.add_label('TECHNOLOGY') + +p + | Next, iterate over your examples, calling #[code entity.update()]. As + | above, we want to avoid iterating over only a small number of sentences. + | A useful compromise is to run the model over a number of plain-text + | sentences, and pass the entities to #[code GoldParse], as "true" + | annotations. This encourages the optimizer to find a solution that + | predicts the new category with minimal difference from the previous + | output. + ++h(2, "saving-loading") Saving and loading + +p + | After training your model, you'll usually want to save its state, and load + | it back later. You can do this with the #[code Language.save_to_directory()] + | method: + ++code. + nlp.save_to_directory('/home/me/data/en_technology') + +p + | To make the model more convenient to deploy, we recommend wrapping it as + | a Python package, so that you can install it via pip and load it as a + | module. spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] + | to create all required files and directories. + ++code(false, "bash"). + python -m spacy package /home/me/data/en_technology /home/me/my_models + +p + | To build the package and create a #[code .tar.gz] archive, run + | #[code python setup.py sdist] from within its directory. + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading") saving and loading models]. + +p + | After you've generated and installed the package, you'll be able to + | load the model as follows: + ++code. + import en_technology + nlp = en_technology.load() + ++h(2, "example") Example: Adding and training an #[code ANIMAL] entity + +p + | This script shows how to add a new entity type to an existing pre-trained + | NER model. To keep the example short and simple, only four sentences are + | provided as examples.
In practice, you'll need many more — + | #[strong a few hundred] would be a good start. You will also likely need + | to mix in #[strong examples of other entity types], which might be + | obtained by running the entity recognizer over unlabelled sentences, and + | adding their annotations to the training set. + +p + | For the full, runnable script of this example, see + | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py]. + ++code("Training the entity recognizer"). + import spacy + from spacy.pipeline import EntityRecognizer + from spacy.gold import GoldParse + from spacy.tagger import Tagger + import random + + model_name = 'en' + entity_label = 'ANIMAL' + output_directory = '/path/to/model' + train_data = [ + ("Horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("horses pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("they pretend to care about your feelings, those horses", + [(48, 54, 'ANIMAL')]) + ] + + nlp = spacy.load(model_name) + nlp.entity.add_label(entity_label) + ner = train_ner(nlp, train_data, output_directory) + + def train_ner(nlp, train_data, output_dir): + # Add new words to vocab + for raw_text, _ in train_data: + doc = nlp.make_doc(raw_text) + for word in doc: + _ = nlp.vocab[word.orth] + + for itn in range(20): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + nlp.tagger(doc) + loss = nlp.entity.update(doc, gold) + nlp.end_training() + nlp.save_to_directory(output_dir) + +p + +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example + +p + | The actual training is performed by looping over the examples, and + | calling #[code nlp.entity.update()]. 
The #[code update()] method steps + | through the words of the input. At each word, it makes a prediction. It + | then consults the annotations provided on the #[code GoldParse] instance, + | to see whether it was right. If it was wrong, it adjusts its weights so + | that the correct action will score higher next time. + +p + | After training your model, you can + | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping + | models as Python packages, for ease of deployment. From b15bdb5279583ea648cad4aafa976747237b40da Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:56 +0200 Subject: [PATCH 20/43] Update training docs --- website/docs/usage/training.jade | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 39f524829..8a5c111bd 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -1,13 +1,10 @@ include ../../_includes/_mixins p - | This tutorial describes how to train new statistical models for spaCy's + | This workflow describes how to train new statistical models for spaCy's | part-of-speech tagger, named entity recognizer and dependency parser. - -p - | I'll start with some quick code examples, that describe how to train - | each model. I'll then provide a bit of background about the algorithms, - | and explain how the data and feature templates work. + | Once the model is trained, you can then + | #[+a("/docs/usage/saving-loading") save and load] it. +h(2, "train-pos-tagger") Training the part-of-speech tagger @@ -48,7 +45,21 @@ p p +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example -+h(2, "train-entity") Training the dependency parser ++h(2, "extend-entity") Extending the named entity recognizer + +p + | All #[+a("/docs/usage/models") spaCy models] support online learning, so + | you can update a pre-trained model with new examples. 
You can even add + | new classes to an existing model, to recognise a new entity type, + | part-of-speech, or syntactic relation. Updating an existing model is + | particularly useful as a "quick and dirty solution", if you have only a + | few corrections or annotations. + +p.o-inline-list + +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example + +button("/docs/usage/training-ner", false, "secondary") Usage Workflow + ++h(2, "train-dependency") Training the dependency parser +code. from spacy.vocab import Vocab @@ -67,7 +78,7 @@ p p +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example -+h(2, 'feature-templates') Customizing the feature extraction ++h(2, "feature-templates") Customizing the feature extraction p | spaCy currently uses linear models for the tagger, parser and entity From 17e974338860a231054fd3d214e4248e06b594ac Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:09 +0200 Subject: [PATCH 21/43] Add saving & loading models docs --- website/docs/usage/_data.json | 4 + website/docs/usage/saving-loading.jade | 108 +++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 website/docs/usage/saving-loading.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index f81fb245f..edb37bbad 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -23,6 +23,7 @@ "Training": "training", "Adding languages": "adding-languages" "Training NER": "training-ner", + "Saving & loading": "saving-loading" }, "Examples": { "Tutorials": "tutorials", @@ -111,6 +112,9 @@ "title": "Training the Named Entity Recognizer", "next": "saving-loading" }, + + "saving-loading": { + "title": "Saving and loading models" }, "pos-tagging": { diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade new file mode 100644 index 000000000..063c5dc50 --- /dev/null +++ 
b/website/docs/usage/saving-loading.jade @@ -0,0 +1,108 @@ +include ../../_includes/_mixins + +p + | After training your model, you'll usually want to save its state, and load + | it back later. You can do this with the #[code Language.save_to_directory()] + | method: + ++code. + nlp.save_to_directory('/home/me/data/en_example_model') + +p + | The directory will be created if it doesn't exist, and the whole pipeline + | will be written out. To make the model more convenient to deploy, we + | recommend wrapping it as a Python package. + ++h(2, "generating") Generating a model package + ++infobox("Important note") + | The model packages are #[strong not suitable] for the public + | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not + | designed for binary data and files over 50 MB. However, if your company + | is running an internal installation of pypi, publishing your models on + | there can be a convenient solution to share them with your team. + +p + | spaCy comes with a handy CLI command that will create all required files, + | and walk you through generating the meta data. You can also create the + | meta.json manually and place it in the model data directory, or supply a + | path to it using the #[code --meta] flag. For more info on this, see the + | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + ++aside-code("meta.json", "json"). + { + "name": "example_model", + "lang": "en", + "version": "1.0.0", + "spacy_version": ">=1.7.0,<2.0.0", + "description": "Example model for spaCy", + "author": "You", + "email": "you@example.com", + "license": "CC BY-SA 3.0" + } + ++code(false, "bash"). + python -m spacy package /home/me/data/en_example_model /home/me/my_models + +p This command will create a model package directory that should look like this: + ++code("Directory structure", "yaml"). 
+ └── / + ├── MANIFEST.in # to include meta.json + ├── meta.json # model meta data + ├── setup.py # setup file for pip installation + └── en_example_model # model directory + ├── __init__.py # init for pip installation + └── en_example_model-1.0.0 # model data + +p + | You can also find templates for all files in our + | #[+a(gh("spacy-dev-resources", "templates/model")) spaCy dev resources]. + | If you're creating the package manually, keep in mind that the directories + | need to be named according to the naming conventions of + | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The + | #[code lang] setting in the meta.json is also used to create the + | respective #[code Language] class in spaCy, which will later be returned + | by the model's #[code load()] method. + ++h(2, "building") Building a model package + +p + | To build the package, run the following command from within the + | directory. This will create a #[code .tar.gz] archive in a directory + | #[code /dist]. + ++code(false, "bash"). + python setup.py sdist + +p + | For more information on building Python packages, see the + | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + + ++h(2, "loading") Loading a model package + +p + | Model packages can be installed by pointing pip to the model's + | #[code .tar.gz] archive: + ++code(false, "bash"). + pip install /path/to/en_example_model-1.0.0.tar.gz + +p You'll then be able to load the model as follows: + ++code. + import en_example_model + nlp = en_example_model.load() + +p + | To load the model via #[code spacy.load()], you can also + | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the + | package name to a custom model name of your choice: + ++code(false, "bash"). + python -m spacy link en_example_model example + ++code. 
+ import spacy + nlp = spacy.load('example') From 5bbbb7674b93d82d7235a5b183da5506235a0951 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:22 +0200 Subject: [PATCH 22/43] Add training examples to tutorials --- website/docs/usage/_data.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index edb37bbad..dc71ef618 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -365,6 +365,18 @@ }, "code": { + "Training a new entity type": { + "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py", + "author": "Matthew Honnibal", + "tags": ["ner", "training"] + }, + + "Training an NER system from scratch": { + "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py", + "author": "Matthew Honnibal", + "tags": ["ner", "training"] + }, + "Information extraction": { "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py", "author": "Matthew Honnibal", From c365795bf6ef0055364c41344fba56853ecc97ef Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:31 +0200 Subject: [PATCH 23/43] Update navigation --- website/docs/usage/_data.json | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index dc71ef618..2ffbf9d68 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -20,8 +20,8 @@ "Word vectors": "word-vectors-similarities", "Deep learning": "deep-learning", "Custom tokenization": "customizing-tokenizer", + "Adding languages": "adding-languages", "Training": "training", - "Adding languages": "adding-languages" "Training NER": "training-ner", "Saving & loading": "saving-loading" }, @@ -103,11 +103,14 @@ "customizing-tokenizer": { "title": "Customizing the tokenizer", - "next": "training" + "next": "adding-languages" }, 
"training": { - "title": "Training the tagger, parser and entity recognizer" + "title": "Training spaCy's statistical models", + "next": "saving-loading" + }, + "training-ner": { "title": "Training the Named Entity Recognizer", "next": "saving-loading" From dea79224edcb536253c0e74803f112b2891ca4ed Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:51 +0200 Subject: [PATCH 24/43] Remove saving & loading docs and link to new workflow --- website/docs/usage/models.jade | 67 +++++----------------------------- 1 file changed, 9 insertions(+), 58 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 39c271df4..9d50dcbc0 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -235,62 +235,13 @@ p p | If you've trained your own model, for example for - | #[+a("/docs/usage/adding-languages") additional languages], you can - | create a shortuct link for it by pointing #[code spacy.link] to the - | model's data directory. To allow your model to be downloaded and - | installed via pip, you'll also need to generate a package for it. You can - | do this manually, or via the new - | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will - | create all required files, and walk you through generating the meta data. + | #[+a("/docs/usage/adding-languages") additional languages] or + | #[+a("/docs/usage/training-ner") custom named entities], you can save its + | state using the #[code Language.save_to_directory()] method. To make the + | model more convenient to deploy, we recommend wrapping it as a Python + | package. - -+infobox("Important note") - | The model packages are #[strong not suitable] for the public - | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not - | designed for binary data and files over 50 MB. 
However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. - -p The model directory should look like this: - -+code("Directory structure", "yaml"). - └── / - ├── MANIFEST.in # to include meta.json - ├── meta.json # model meta data - ├── setup.py # setup file for pip installation - └── en_core_web_md # model directory - ├── __init__.py # init for pip installation - └── en_core_web_md-1.2.0 # model data - -p - | You can find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. - | Unless you want to customise installation and loading, the only file - | you'll need to modify is #[code meta.json], which includes the model's - | meta data. It will later be copied into the package and data directory. - -+code("meta.json", "json"). - { - "name": "core_web_md", - "lang": "en", - "version": "1.2.0", - "spacy_version": "1.7.0", - "description": "English model for spaCy", - "author": "Explosion AI", - "email": "contact@explosion.ai", - "license": "MIT" - } - -p - | Keep in mind that the directories need to be named according to the - | naming conventions. The #[code lang] setting is also used to create the - | respective #[code Language] class in spaCy, which will later be returned - | by the model's #[code load()] method. - -p - | To generate the package, run the following command from within the - | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. - -+code(false, "bash"). - python setup.py sdist ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading") saving and loading models]. 
From e4dd645c378acd32a5e91b38b51b94427abf3d46 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:59 +0200 Subject: [PATCH 25/43] Update link --- website/docs/usage/cli.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index 5ad8a214d..e4d762615 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -248,7 +248,7 @@ p +tag experimental p - | Generate a #[+a("/docs/usage/models#own-models") model Python package] + | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] | from an existing model data directory. All data files are copied over. | If the path to a meta.json is supplied, or a meta.json is found in the | input directory, this file is used. Otherwise, the data can be entered From 264af6cd17d734efa5cb958f4cded7d852f80237 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:37:29 +0200 Subject: [PATCH 26/43] Add documentation --- examples/training/train_new_entity_type.py | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index cbe2963d3..ef4070153 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,3 +1,32 @@ +#!/usr/bin/env python +""" +Example of training and additional entity type + +This script shows how to add a new entity type to an existing pre-trained NER +model. To keep the example short and simple, only four sentences are provided +as examples. In practice, you'll need many more — a few hundred would be a +good start. You will also likely need to mix in examples of other entity +types, which might be obtained by running the entity recognizer over unlabelled +sentences, and adding their annotations to the training set. + +The actual training is performed by looping over the examples, and calling +`nlp.entity.update()`. 
The `update()` method steps through the words of the +input. At each word, it makes a prediction. It then consults the annotations +provided on the GoldParse instance, to see whether it was right. If it was +wrong, it adjusts its weights so that the correct action will score higher +next time. + +After training your model, you can save it to a directory. We recommend +wrapping models as Python packages, for ease of deployment. + +For more details, see the documentation: +* Training the Named Entity Recognizer: https://spacy.io/docs/usage/training-ner +* Saving and loading models: https://spacy.io/docs/usage/saving-loading + +Developed for: spaCy 1.7.6 +Last tested for: spaCy 1.7.6 +""" +# coding: utf8 from __future__ import unicode_literals, print_function import random From 6145b7c15334f43d093c6bcd7fb2dd145ec0df98 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:53:25 +0200 Subject: [PATCH 27/43] Remove redundant Path --- spacy/cli/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 781adda2c..e5d590e5a 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False): def symlink(model_path, link_name, force): model_path = Path(model_path) - if not Path(model_path).exists(): + if not model_path.exists(): util.sys_exit( "The data should be located in {p}".format(p=model_path), title="Can't locate model data") From 4931c56afc7ec607b76d312b28ee4ea00cb77002 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 13:59:38 -0500 Subject: [PATCH 28/43] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ecdbc1fef..b9b7444a8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.7.5' +__version__ = '1.7.6' 
__summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 13d30b6c01d25472f9b222be3788ef9e1a8ce9e3 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 21:18:39 +0200 Subject: [PATCH 29/43] xfail lemmatizer test that's causing problems (see #546) --- spacy/tests/tagger/test_lemmatizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 3e2933fcd..5db0d0b2c 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas): assert lemmatizer.noun(text) == set(lemmas) +@pytest.mark.xfail @pytest.mark.models def test_tagger_lemmatizer_base_forms(lemmatizer): if lemmatizer is None: From 5c5f8c0a72f43ba2139185786b4e08884096b8fa Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:14:38 +0200 Subject: [PATCH 30/43] Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) 
--- spacy/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 573489682..f807dae9e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,9 +20,11 @@ def set_lang_class(name, cls): def get_lang_class(name): + if name in LANGUAGES: + return LANGUAGES[name] lang = re.split('[^a-zA-Z0-9]', name, 1)[0] if lang not in LANGUAGES: - raise RuntimeError('Language not supported: %s' % lang) + raise RuntimeError('Language not supported: %s' % name) return LANGUAGES[lang] From 97647c46cdbd623e34da4c162ceecd4b97b0946e Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:14:45 +0200 Subject: [PATCH 31/43] Add docstring and todo note --- spacy/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index f807dae9e..0ccdfbd72 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -107,6 +107,13 @@ def read_json(location): def parse_package_meta(package_path, package, require=True): + """ + Check if a meta.json exists in a package and return its contents as a + dictionary. If require is set to True, raise an error if no meta.json found. + """ + # TODO: Allow passing in full model path and only require one argument + # instead of path and package name. This lets us avoid passing in an awkward + # empty string in spacy.load() if user supplies full model path. 
location = package_path / package / 'meta.json' if location.is_file(): return read_json(location) From ad168ba88c8d18b3755f3c49ced4cb6c34248fc7 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:15:51 +0200 Subject: [PATCH 32/43] Set model name to empty string if path override exists Required for parse_package_meta, which composes path of data_path and model_name (needs to be fixed in the future) --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 06e9374ea..22f406771 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -30,6 +30,7 @@ def load(name, **overrides): else: model_path = util.ensure_path(overrides['path']) data_path = model_path.parent + model_name = '' meta = util.parse_package_meta(data_path, model_name, require=False) lang = meta['lang'] if meta and 'lang' in meta else name cls = util.get_lang_class(lang) From 5610fdcc064877174f383654b615c1c97b2ff96d Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:16:47 +0200 Subject: [PATCH 33/43] Get language name first if no model path exists Makes sure spaCy fails early if no tokenizer exists, and allows printing better error message. 
--- spacy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 22f406771..efd6c00c0 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -23,9 +23,10 @@ def load(name, **overrides): model_name = resolve_model_name(name) model_path = data_path / model_name if not model_path.exists(): + lang_name = util.get_lang_class(name).lang model_path = None util.print_msg( - "Only loading the '{}' tokenizer.".format(name), + "Only loading the '{}' tokenizer.".format(lang_name), title="Warning: no model found for '{}'".format(name)) else: model_path = util.ensure_path(overrides['path']) From 17c9fffb9e1a5318cc9e5de320f1ea38d51ee1a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 15:28:16 -0500 Subject: [PATCH 34/43] Fix naked except --- spacy/cli/link.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index e5d590e5a..f2d2fd436 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -48,7 +48,7 @@ def symlink(model_path, link_name, force): except: # This is quite dirty, but just making sure other errors are caught so # users at least see a proper message. - util.sys_exit( + util.print_msg( "Creating a symlink in spacy/data failed. 
Make sure you have the " "required permissions and try re-running the command as admin, or " "use a virtualenv to install spaCy in a user directory, instead of " @@ -57,6 +57,7 @@ def symlink(model_path, link_name, force): "load() method, or create the symlink manually:", "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)), title="Error: Couldn't link model to '{l}'".format(l=link_name)) + raise util.print_msg( "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()), From 4efd6fb9d60f2d36a1f9ae6892c2d6bd4021140c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 15:28:27 -0500 Subject: [PATCH 35/43] Fix training --- spacy/language.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 47408921c..f47b1d0cc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -239,8 +239,7 @@ class Language(object): self.pipeline = self.Defaults.create_pipeline(self) yield Trainer(self, gold_tuples) self.end_training() - self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg, - pos=self.tagger.cfg) + self.save_to_directory(path) def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: From de5062711b5b76d481bf9360a9cad99adf5f080b Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:26:24 +0200 Subject: [PATCH 36/43] Update adding languages workflow to reflect changes in __init__.py --- website/docs/usage/adding-languages.jade | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d1541bc87..0c98cc5ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -63,14 +63,16 @@ p tag_map = TAG_MAP stop_words = STOP_WORDS -p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the 
#[code set_lang_class()] function, so that you can use #[code spacy.load()]. +p + | Additionally, the new #[code Language] class needs to be added to the + | list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py]. + | The languages are then registered using the #[code set_lang_class()] function. +code("spacy/__init__.py"). from . import en from . import xx - set_lang_class(en.English.lang, en.English) - set_lang_class(xx.Xxxxx.lang, xx.Xxxxx) + _languages = (en.English, ..., xx.Xxxxx) p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]: From 16a8521efa4275fc8f87c28e00f06745e5cd40b4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:38:38 +0200 Subject: [PATCH 37/43] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b9b7444a8..5e438c7af 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.7.6' +__version__ = '1.8.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 02e7512b914316a5f96ed2eb1a65bcf4a3f6281a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:39:58 +0200 Subject: [PATCH 38/43] Increment version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index d117f74af..03fcbb956 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.7", + "SPACY_VERSION": "1.8", "LATEST_NEWS": { "url": "https://survey.spacy.io/", "title": "Take the spaCy user survey and help us improve the library!" 
From db7e046faa693ff759afb8449e720043157617c9 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 23:23:59 +0200 Subject: [PATCH 39/43] Update version --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b9cd1d5ce..67a6a6c66 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ open-source software, released under the MIT license. 📊 **Help us improve the library!** `Take the spaCy user survey `_. -💫 **Version 1.7 out now!** `Read the release notes here. `_ +💫 **Version 1.8 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy From cffaf521520c7bf3643475081539713831f49acd Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 23:34:14 +0200 Subject: [PATCH 40/43] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 67a6a6c66..d08860fb8 100644 --- a/README.rst +++ b/README.rst @@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading `v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands `v1.7.3`_ ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes `v1.7.2`_ ``2017-03-20`` Small fixes to beam parser and model linking @@ -350,6 +351,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3 .. 
_v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2 From 734b0a4e4ae0634a24e8f44303d7bccd49fbc31f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 16 Apr 2017 23:42:16 +0200 Subject: [PATCH 41/43] Update train_new_entity_type.py --- examples/training/train_new_entity_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index ef4070153..d5d9492f1 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Example of training and additional entity type +Example of training an additional entity type This script shows how to add a new entity type to an existing pre-trained NER model. To keep the example short and simple, only four sentences are provided From e7ae3b7cc20ea29d02aadac6bc635102f00c4bc9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 16 Apr 2017 23:56:12 +0200 Subject: [PATCH 42/43] Fix formatting and typo (closes #967) --- examples/pos_tag.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pos_tag.py b/examples/pos_tag.py index c61d29636..1dd6add0f 100644 --- a/examples/pos_tag.py +++ b/examples/pos_tag.py @@ -1,7 +1,8 @@ -'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated +""" +Print part-of-speech tagged, true-cased, (very roughly) sentence-separated text, with each "sentence" on a newline, and spaces between tokens. Supports multi-processing. -''' +""" from __future__ import print_function, unicode_literals, division import io import bz2 @@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra): def iter_texts_from_json_bz2(loc): - ''' + """ Iterator of unicode strings, one per document (here, a comment). Expects a a path to a BZ2 file, which should be new-line delimited JSON. The document text should be in a string field titled 'body'. 
This is the data format of the Reddit comments corpus. - ''' + """ with bz2.BZ2File(loc) as file_: for i, line in enumerate(file_): yield ujson.loads(line)['body'] @@ -80,7 +81,7 @@ def is_sent_begin(word): def main(in_loc, out_dir, n_workers=4, batch_size=100000): if not path.exists(out_dir): path.join(out_dir) - texts = partition(batch_size, iter_texts(in_loc)) + texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) From c6c3162c5031933fc26932668772a501fc7869de Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 00:00:27 +0200 Subject: [PATCH 43/43] Fix lightning tour example (closes #889) --- website/docs/usage/lightning-tour.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 31982d516..967d0c61e 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -137,7 +137,7 @@ p return word.ent_type != 0 def count_parent_verb_by_person(docs): - counts = defaultdict(defaultdict(int)) + counts = defaultdict(lambda: defaultdict(int)) for doc in docs: for ent in doc.ents: if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: