From 7f579ae834398b2045bd19e9032e82ee425739c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Sun, 19 Mar 2017 11:40:29 +0100 Subject: [PATCH 01/30] Remove duplicate keys in [en|fi] data dicts --- spacy/en/morph_rules.py | 1 - spacy/fi/tokenizer_exceptions.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/spacy/en/morph_rules.py b/spacy/en/morph_rules.py index 2b8aae823..51a50736e 100644 --- a/spacy/en/morph_rules.py +++ b/spacy/en/morph_rules.py @@ -21,7 +21,6 @@ MORPH_RULES = { "them": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, - "yours": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Poss": "Yes", "Reflex": "Yes"}, "his": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, "hers": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, "its": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, diff --git a/spacy/fi/tokenizer_exceptions.py b/spacy/fi/tokenizer_exceptions.py index 52ea7428a..09775a2f4 100644 --- a/spacy/fi/tokenizer_exceptions.py +++ b/spacy/fi/tokenizer_exceptions.py @@ -193,9 +193,6 @@ TOKENIZER_EXCEPTIONS = { "vm.": [ {ORTH: "vm.", LEMMA: "viimeksi mainittu"} ], - "siht.": [ - {ORTH: "siht.", LEMMA: "sihteeri"} - ], "srk.": [ {ORTH: "srk.", LEMMA: "seurakunta"} ] From 81b28ca606bb956b74849bf7971cb0b381431887 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 18:01:51 +0100 Subject: [PATCH 02/30] Update models docs with info on retraining own models --- website/docs/usage/models.jade | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index d45d8d45e..ae1417a29 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -14,9 +14,12 @@ p | model name. +infobox("Important note") - | Due to improvements in the English lemmatizer in v1.7.0, you need to download the - | new English model. The German model is still compatible and will be - | recognised and linked automatically. + | Due to improvements in the English lemmatizer in v1.7.0, you need to + | #[strong download the new English models]. The German model is still + | compatible. If you've trained statistical models that use spaCy's + | annotations, you should #[strong retrain your models after updating spaCy]. + | If you don't retrain your models, you may suffer train/test skew, which + | might decrease your accuracy. +aside-code("Quickstart"). # Install spaCy and download English model From adbcac65918137634e080e3e4689c3456cde593b Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:48:21 +0100 Subject: [PATCH 03/30] Fix spacing --- spacy/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/util.py b/spacy/util.py index 49c51b436..d1252e41d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -13,6 +13,7 @@ import textwrap from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE + try: basestring except NameError: From a6c036180344899471c715ad5798b78c116fdee5 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:48:32 +0100 Subject: [PATCH 04/30] Handle raw_input vs input in Python 2 and 3 --- spacy/util.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index d1252e41d..f8fc76b05 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,6 +20,12 @@ except NameError: basestring = str +try: + raw_input +except NameError: # Python 3 + raw_input = input + + LANGUAGES = {} _data_path = pathlib.Path(__file__).parent / 'data' From 5aea327a5b5a4fe74b023d6ecad0689a41ee895f Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:48:56 +0100 Subject: [PATCH 05/30] Add util function to get raw user input --- spacy/util.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index f8fc76b05..1f1cdbb6e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -168,6 +168,17 @@ def parse_package_meta(package_path, package, require=True): return None +def get_raw_input(description, default=False): + """Get user input via raw_input / input and return input value. Takes a + description for the prompt, and an optional default value that's displayed + with the prompt.""" + + additional = ' (default: {d})'.format(d=default) if default else '' + prompt = ' {d}{a}: '.format(d=description, a=additional) + user_input = raw_input(prompt) + return user_input + + def print_table(data, **kwargs): """Print data in table format. Can either take a list of tuples or a dictionary, which will be converted to a list of tuples.""" From a54e3c2efe6d388b1cb2fa0bc9fc867165c9025d Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:49:36 +0100 Subject: [PATCH 06/30] Remove empty line --- spacy/__main__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 9addbccde..cf740c8fe 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,5 +1,4 @@ # coding: utf8 -# from __future__ import print_function # NB! This breaks in plac on Python 2!! #from __future__ import unicode_literals, From bf240132d70b497e6c5c57407e5ca1cfdc9b17e3 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:50:13 +0100 Subject: [PATCH 07/30] Add cli.package command to build model packages --- spacy/__main__.py | 17 ++++- spacy/cli/__init__.py | 1 + spacy/cli/package.py | 149 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 spacy/cli/package.py diff --git a/spacy/__main__.py b/spacy/__main__.py index cf740c8fe..ba34c478f 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -7,12 +7,13 @@ import plac from spacy.cli import download as cli_download from spacy.cli import link as cli_link from spacy.cli import info as cli_info +from spacy.cli import package as cli_package class CLI(object): """Command-line interface for spaCy""" - commands = ('download', 'link', 'info') + commands = ('download', 'link', 'info', 'package') @plac.annotations( model=("model to download (shortcut or model name)", "positional", None, str), @@ -58,6 +59,20 @@ class CLI(object): cli_info(model, markdown) + @plac.annotations( + input_dir=("directory with model data", "positional", None, str), + output_dir=("output directory", "positional", None, str) + ) + def package(self, input_dir, output_dir): + """ + Generate Python package for model data, including meta and required + installation files. A new directory will be created in the specified + output directory. + """ + + cli_package(input_dir, output_dir) + + def __missing__(self, name): print("\n Command %r does not exist\n" % name) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 2c45b471a..2383e04b9 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,3 +1,4 @@ from .download import download from .info import info from .link import link +from .package import package diff --git a/spacy/cli/package.py b/spacy/cli/package.py new file mode 100644 index 000000000..9d1ff7183 --- /dev/null +++ b/spacy/cli/package.py @@ -0,0 +1,149 @@ +# coding: utf8 +from __future__ import unicode_literals + +import json +from shutil import copytree +from pathlib import Path + +from .. import about +from .. import util + + +def package(input_dir, output_dir): + input_path = Path(input_dir) + output_path = Path(output_dir) + check_dirs(input_path, output_path) + + meta = generate_meta() + model_name = meta['lang'] + '_' + meta['name'] + model_name_v = model_name + '-' + meta['version'] + main_path = output_path / model_name_v + package_path = main_path / model_name + + Path.mkdir(package_path, parents=True) + copytree(input_path, package_path / model_name_v) + create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) + create_file(main_path / 'setup.py', TEMPLATE_SETUP.strip()) + create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST.strip()) + create_file(package_path / '__init__.py', TEMPLATE_INIT.strip()) + + util.print_msg( + main_path.as_posix(), + "To build the package, run python setup.py sdist in that directory.", + title="Successfully reated package {p}".format(p=model_name_v)) + + +def check_dirs(input_path, output_path): + if not input_path.exists(): + util.sys_exit(input_path.as_poisx(), title="Model directory not found") + if not output_path.exists(): + util.sys_exit(output_path.as_posix(), title="Output directory not found") + + +def create_file(file_path, contents): + file_path.touch() + file_path.write_text(contents, encoding='utf-8') + + +def generate_meta(): + settings = [('lang', 'Model language', 'en'), + ('name', 'Model name', 'model'), + ('version', 'Model version', '0.0.0'), + ('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'), + ('description', 'Model description', False), + ('author', 'Author', False), + ('email', 'Author email', False), + ('url', 'Author website', False), + ('license', 'License', 'MIT')] + + util.print_msg("Enter the package settings for your model.", title="Generating meta.json") + + meta = {} + for setting, desc, default in settings: + response = util.get_raw_input(desc, default) + meta[setting] = default if response == '' and default else response + return meta + + +TEMPLATE_MANIFEST = """ +include meta.json +""" + + +TEMPLATE_SETUP = """ +#!/usr/bin/env python +# coding: utf8 +from __future__ import unicode_literals + +import io +import json +from os import path, walk +from shutil import copy +from setuptools import setup + + +def load_meta(fp): + with io.open(fp, encoding='utf8') as f: + return json.load(f) + + +def list_files(data_dir): + output = [] + for root, _, filenames in walk(data_dir): + for filename in filenames: + if not filename.startswith('.'): + output.append(path.join(root, filename)) + output = [path.relpath(p, path.dirname(data_dir)) for p in output] + output.append('meta.json') + return output + + +def setup_package(): + root = path.abspath(path.dirname(__file__)) + meta_path = path.join(root, 'meta.json') + meta = load_meta(meta_path) + model_name = str(meta['lang'] + '_' + meta['name']) + model_dir = path.join(model_name, model_name + '-' + meta['version']) + + copy(meta_path, path.join(root, model_name)) + copy(meta_path, path.join(root, model_dir)) + + setup( + name=model_name, + description=meta['description'], + author=meta['author'], + author_email=meta['email'], + url=meta['url'], + version=meta['version'], + license=meta['license'], + packages=[model_name], + package_data={model_name: list_files(model_dir)}, + install_requires=['spacy' + meta['spacy_version']], + zip_safe=False, + ) + + +if __name__ == '__main__': + setup_package() +""" + + +TEMPLATE_INIT = """ +from pathlib import Path +from spacy.util import get_lang_class +import pkg_resources +import json + + +def load_meta(): + with (Path(__file__).parent / 'meta.json').open() as f: + return json.load(f) + + +def load(**kwargs): + meta = load_meta() + version = meta['version'] + data_dir = pkg_resources.resource_filename(__name__, __name__ + '-' + version) + lang = get_lang_class(meta['lang']) + return lang(path=Path(data_dir), **kwargs) +""" From b2bcdec0f608dcf64147e94f1547b267baf007b6 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Mar 2017 22:50:55 +0100 Subject: [PATCH 08/30] Update docstring --- spacy/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index ba34c478f..23d87acb3 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -67,7 +67,7 @@ class CLI(object): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified - output directory. + output directory, and model data will be copied over. """ cli_package(input_dir, output_dir) From 8eb9a2b35503a54ac7d3aca403e4d48516a8900b Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 02:05:14 +0100 Subject: [PATCH 09/30] Fix formatting --- spacy/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 23d87acb3..e539ed78d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -32,8 +32,8 @@ class CLI(object): @plac.annotations( origin=("package name or local path to model", "positional", None, str), - link_name=("Name of shortuct link to create", "positional", None, str), - force=("Force overwriting of existing link", "flag", "f", bool) + link_name=("name of shortuct link to create", "positional", None, str), + force=("force overwriting of existing link", "flag", "f", bool) ) def link(self, origin, link_name, force=False): """ From 448a916d0d3e3af2d5fe161cd5ed5828e6517ab7 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 02:05:34 +0100 Subject: [PATCH 10/30] Add --force option to override directory --- spacy/__main__.py | 7 ++++--- spacy/cli/package.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index e539ed78d..a5ba66fee 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -61,16 +61,17 @@ class CLI(object): @plac.annotations( input_dir=("directory with model data", "positional", None, str), - output_dir=("output directory", "positional", None, str) + output_dir=("output directory", "positional", None, str), + force=("force overwriting of existing output directory", "flag", "f", bool) ) - def package(self, input_dir, output_dir): + def package(self, input_dir, output_dir, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ - cli_package(input_dir, output_dir) + cli_package(input_dir, output_dir, force) def __missing__(self, name): diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 9d1ff7183..abd3f6e4e 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -9,7 +9,7 @@ from .. import about from .. import util -def package(input_dir, output_dir): +def package(input_dir, output_dir, force): input_path = Path(input_dir) output_path = Path(output_dir) check_dirs(input_path, output_path) From 64e38f304e1f9374e3b731e3721434fe33867bb4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 02:06:29 +0100 Subject: [PATCH 11/30] Only import shutil --- spacy/cli/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index abd3f6e4e..d23b03821 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import json -from shutil import copytree +import shutil from pathlib import Path from .. import about @@ -21,7 +21,7 @@ def package(input_dir, output_dir, force): package_path = main_path / model_name Path.mkdir(package_path, parents=True) - copytree(input_path, package_path / model_name_v) + shutil.copytree(input_path, package_path / model_name_v) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) create_file(main_path / 'setup.py', TEMPLATE_SETUP.strip()) create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST.strip()) From 46bc3c36b00dfb95f90da5769181d0255f860cd8 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 02:06:37 +0100 Subject: [PATCH 12/30] Fix typo --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index d23b03821..bf424e075 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -30,7 +30,7 @@ def package(input_dir, output_dir, force): util.print_msg( main_path.as_posix(), "To build the package, run python setup.py sdist in that directory.", - title="Successfully reated package {p}".format(p=model_name_v)) + title="Successfully created package {p}".format(p=model_name_v)) def check_dirs(input_path, output_path): From 5230ed5b98e3f1e9c83b1205c87db962c5844804 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 02:06:53 +0100 Subject: [PATCH 13/30] Move directory check and overwriting/creating dirs to own function --- spacy/cli/package.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index bf424e075..59b45ab5f 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -20,7 +20,7 @@ def package(input_dir, output_dir, force): main_path = output_path / model_name_v package_path = main_path / model_name - Path.mkdir(package_path, parents=True) + create_dirs(package_path, force) shutil.copytree(input_path, package_path / model_name_v) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) create_file(main_path / 'setup.py', TEMPLATE_SETUP.strip()) @@ -40,6 +40,17 @@ def check_dirs(input_path, output_path): util.sys_exit(output_path.as_posix(), title="Output directory not found") +def create_dirs(package_path, force): + if package_path.exists(): + if force: + shutil.rmtree(package_path) + else: + util.sys_exit(package_path.as_posix(), + "Please delete the directory and try again.", + title="Package directory already exists") + Path.mkdir(package_path, parents=True) + + def create_file(file_path, contents): file_path.touch() file_path.write_text(contents, encoding='utf-8') From 3f4e3fda1d21a90b5d7b3e3fe70e650120b19c84 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 11:17:36 +0100 Subject: [PATCH 14/30] Update command and fetch file templates from GitHub While feature is still experimental, this allows files to be modified without having to ship a new version of spaCy. --- spacy/cli/package.py | 105 ++++++++----------------------------------- 1 file changed, 18 insertions(+), 87 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 59b45ab5f..6a0f36ff9 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import json import shutil +import requests from pathlib import Path from .. import about @@ -14,7 +15,11 @@ def package(input_dir, output_dir, force): output_path = Path(output_dir) check_dirs(input_path, output_path) + template_setup = get_template('setup.py') + template_init = get_template('en_model_name/__init__.py') + template_manifest = 'include meta.json' meta = generate_meta() + model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] main_path = output_path / model_name_v @@ -23,13 +28,13 @@ def package(input_dir, output_dir, force): create_dirs(package_path, force) shutil.copytree(input_path, package_path / model_name_v) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) - create_file(main_path / 'setup.py', TEMPLATE_SETUP.strip()) - create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST.strip()) - create_file(package_path / '__init__.py', TEMPLATE_INIT.strip()) + create_file(main_path / 'setup.py', template_setup) + create_file(main_path / 'MANIFEST.in', template_manifest) + create_file(package_path / '__init__.py', template_init) util.print_msg( main_path.as_posix(), - "To build the package, run python setup.py sdist in that directory.", + "To build the package, run `python setup.py sdist` in that directory.", title="Successfully created package {p}".format(p=model_name_v)) @@ -60,7 +65,7 @@ def generate_meta(): settings = [('lang', 'Model language', 'en'), ('name', 'Model name', 'model'), ('version', 'Model version', '0.0.0'), - ('spacy_version', 'Required spaCy version', '>=2.0.0,<3.0.0'), + ('spacy_version', 'Required spaCy version', '>=1.7.0,<2.0.0'), ('description', 'Model description', False), ('author', 'Author', False), ('email', 'Author email', False), @@ -76,85 +81,11 @@ def generate_meta(): return meta -TEMPLATE_MANIFEST = """ -include meta.json -""" - - -TEMPLATE_SETUP = """ -#!/usr/bin/env python -# coding: utf8 -from __future__ import unicode_literals - -import io -import json -from os import path, walk -from shutil import copy -from setuptools import setup - - -def load_meta(fp): - with io.open(fp, encoding='utf8') as f: - return json.load(f) - - -def list_files(data_dir): - output = [] - for root, _, filenames in walk(data_dir): - for filename in filenames: - if not filename.startswith('.'): - output.append(path.join(root, filename)) - output = [path.relpath(p, path.dirname(data_dir)) for p in output] - output.append('meta.json') - return output - - -def setup_package(): - root = path.abspath(path.dirname(__file__)) - meta_path = path.join(root, 'meta.json') - meta = load_meta(meta_path) - model_name = str(meta['lang'] + '_' + meta['name']) - model_dir = path.join(model_name, model_name + '-' + meta['version']) - - copy(meta_path, path.join(root, model_name)) - copy(meta_path, path.join(root, model_dir)) - - setup( - name=model_name, - description=meta['description'], - author=meta['author'], - author_email=meta['email'], - url=meta['url'], - version=meta['version'], - license=meta['license'], - packages=[model_name], - package_data={model_name: list_files(model_dir)}, - install_requires=['spacy' + meta['spacy_version']], - zip_safe=False, - ) - - -if __name__ == '__main__': - setup_package() -""" - - -TEMPLATE_INIT = """ -from pathlib import Path -from spacy.util import get_lang_class -import pkg_resources -import json - - -def load_meta(): - with (Path(__file__).parent / 'meta.json').open() as f: - return json.load(f) - - -def load(**kwargs): - meta = load_meta() - version = meta['version'] - data_dir = pkg_resources.resource_filename(__name__, __name__ + '-' + version) - lang = get_lang_class(meta['lang']) - return lang(path=Path(data_dir), **kwargs) -""" +def get_template(filepath): + url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/' + r = requests.get(url + filepath) + if r.status_code != 200: + util.sys_exit( + "Couldn't fetch template files from GitHub.", + title="Server error ({c})".format(c=r.status_code)) + return r.text From 09b24bc5a9dfd69d2f95c0225599c44170659351 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 11:19:21 +0100 Subject: [PATCH 15/30] Add docs for package command --- spacy/__main__.py | 2 +- website/docs/usage/cli.jade | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index a5ba66fee..cde146cba 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -62,7 +62,7 @@ class CLI(object): @plac.annotations( input_dir=("directory with model data", "positional", None, str), output_dir=("output directory", "positional", None, str), - force=("force overwriting of existing output directory", "flag", "f", bool) + force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) def package(self, input_dir, output_dir, force=False): """ diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index 990117542..4a9ba3dd1 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -103,3 +103,40 @@ p +cell #[code --help], #[code -h] +cell flag +cell Show help message and available arguments. + ++h(2, "package") Package + +tag experimental + +p + | Generate a #[+a("/docs/usage/models#own-models") model Python package] + | from an existing model data directory. All data files are copied over, + | and the meta data can be entered directly from the command line. While + | this feature is still experimental, the templates for the + | #[+src(gh("spacy-dev-resources", "templates/model/setup.py")) setup.py] and + | #[+src(gh("spacy-dev-resources", "templates/model/en_morel_name/__init__.py")) __init__.py] + | are downloaded from GitHub. This means you need to be connected to the + | internet to use this command. + ++code(false, "bash"). + python -m spacy package [input_dir] [output_dir] [--force] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code input_dir] + +cell positional + +cell Path to directory containing model data. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to create package folder in. + + +row + +cell #[code --force], #[code -f] + +cell flag + +cell Force overwriting of existing folder in output directory. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. From 49bbfdaac1036f052b38f6991ea65de8efc3478f Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 11:25:01 +0100 Subject: [PATCH 16/30] Add info on CLI to docs on own models --- website/docs/usage/models.jade | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index ae1417a29..39c271df4 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -238,7 +238,11 @@ p | #[+a("/docs/usage/adding-languages") additional languages], you can | create a shortuct link for it by pointing #[code spacy.link] to the | model's data directory. To allow your model to be downloaded and - | installed via pip, you'll also need to generate a package for it. + | installed via pip, you'll also need to generate a package for it. You can + | do this manually, or via the new + | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will + | create all required files, and walk you through generating the meta data. + +infobox("Important note") | The model packages are #[strong not suitable] for the public From cf0094187e356e9b762a1796b092734d4e30d654 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 11:32:38 +0100 Subject: [PATCH 17/30] Fetch MANIFEST.in from GitHub as well --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 6a0f36ff9..5894ec049 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -16,8 +16,8 @@ def package(input_dir, output_dir, force): check_dirs(input_path, output_path) template_setup = get_template('setup.py') + template_manifest = get_template('MANIFEST.in') template_init = get_template('en_model_name/__init__.py') - template_manifest = 'include meta.json' meta = generate_meta() model_name = meta['lang'] + '_' + meta['name'] From fa6e3cefbb482e97d0c21c51d852bcda5f31b089 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 11:35:29 +0100 Subject: [PATCH 18/30] Simplify package command docs --- website/docs/usage/cli.jade | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index 4a9ba3dd1..66be83923 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -111,11 +111,9 @@ p | Generate a #[+a("/docs/usage/models#own-models") model Python package] | from an existing model data directory. All data files are copied over, | and the meta data can be entered directly from the command line. While - | this feature is still experimental, the templates for the - | #[+src(gh("spacy-dev-resources", "templates/model/setup.py")) setup.py] and - | #[+src(gh("spacy-dev-resources", "templates/model/en_morel_name/__init__.py")) __init__.py] - | are downloaded from GitHub. This means you need to be connected to the - | internet to use this command. + | this feature is still experimental, the required file templates are + | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. + | This means you need to be connected to the internet to use this command. +code(false, "bash"). python -m spacy package [input_dir] [output_dir] [--force] From 3e134b5b2b5fdf7866ce63201be764897861a4b7 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 12:15:33 +0100 Subject: [PATCH 19/30] Make sure paths in copytree and rmtree are strings --- spacy/cli/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5894ec049..a5c41adec 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -26,7 +26,7 @@ def package(input_dir, output_dir, force): package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree(input_path, package_path / model_name_v) + shutil.copytree((input_path, package_path / model_name_v).as_posix()) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) @@ -48,7 +48,7 @@ def check_dirs(input_path, output_path): def create_dirs(package_path, force): if package_path.exists(): if force: - shutil.rmtree(package_path) + shutil.rmtree(package_path.as_posix()) else: util.sys_exit(package_path.as_posix(), "Please delete the directory and try again.", From ae466475607540a0b439a808dde6a5d553c6084f Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 12:21:42 +0100 Subject: [PATCH 20/30] Fix brackets --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index a5c41adec..5072e272b 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -26,7 +26,7 @@ def package(input_dir, output_dir, force): package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree((input_path, package_path / model_name_v).as_posix()) + shutil.copytree(input_path, (package_path / model_name_v).as_posix()) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) From 83a999ea83e7298ce9b999d91087dc8556f57956 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 12:24:43 +0100 Subject: [PATCH 21/30] Change default license from MIT to CC --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5072e272b..e2c8000b3 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -70,7 +70,7 @@ def generate_meta(): ('author', 'Author', False), ('email', 'Author email', False), ('url', 'Author website', False), - ('license', 'License', 'MIT')] + ('license', 'License', 'CC BY-NC 3.0')] util.print_msg("Enter the package settings for your model.", title="Generating meta.json") From d74aa428ad6137b585ea000bdd8f2dde3c4da03d Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 12:26:00 +0100 Subject: [PATCH 22/30] Fix path --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e2c8000b3..b4dc76e9a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -26,7 +26,7 @@ def package(input_dir, output_dir, force): package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree(input_path, (package_path / model_name_v).as_posix()) + shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix()) create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) From c3a9f738960356dc395a789b786aa095bdad8e4f Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 21 Mar 2017 12:35:22 +0100 Subject: [PATCH 23/30] Fix writing to file --- spacy/cli/package.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index b4dc76e9a..5cab2b4bc 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -58,7 +58,7 @@ def create_dirs(package_path, force): def create_file(file_path, contents): file_path.touch() - file_path.write_text(contents, encoding='utf-8') + file_path.open('w').write(contents, encoding='utf-8') def generate_meta(): From f332bf05be536cfffc80205269116d7fb0a0e363 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 21 Mar 2017 21:08:54 +0100 Subject: [PATCH 24/30] Remove unused import statements --- spacy/cfile.pyx | 2 +- spacy/gold.pyx | 4 ---- spacy/language.py | 2 -- spacy/matcher.pyx | 7 ++----- spacy/morphology.pyx | 4 ---- spacy/pipeline.pyx | 1 - spacy/tagger.pyx | 6 +----- spacy/tokenizer.pyx | 3 --- spacy/util.py | 3 --- 9 files changed, 4 insertions(+), 28 deletions(-) diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index ceebe2e59..d5d4bf353 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -1,4 +1,4 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from libc.stdio cimport fopen, fclose, fread, fwrite from libc.string cimport memcpy diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 358412fab..471018109 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,16 +1,12 @@ # cython: profile=True from __future__ import unicode_literals, print_function -import numpy import io import json -import random import re import os from os import path -from libc.string cimport memset - import ujson as json from .syntax import nonproj diff --git a/spacy/language.py b/spacy/language.py index 573bb5a86..4542eae3b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,6 +1,5 @@ from __future__ import absolute_import from __future__ import unicode_literals -from warnings import warn import pathlib from contextlib import contextmanager import shutil @@ -33,7 +32,6 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP from .syntax.parser import get_templates from .syntax.nonproj import PseudoProjectivity from .pipeline import DependencyParser, EntityRecognizer -from .pipeline import BeamDependencyParser, BeamEntityRecognizer from .syntax.arc_eager import ArcEager from .syntax.ner import BiluoPushDown diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 5c52ae9d0..1883ae89a 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -2,13 +2,10 @@ # cython: infer_types=True from __future__ import unicode_literals -from os import path - from .typedefs cimport attr_t from .typedefs cimport hash_t from .attrs cimport attr_id_t -from .structs cimport TokenC, LexemeC -from .lexeme cimport Lexeme +from .structs cimport TokenC from cymem.cymem cimport Pool from preshed.maps cimport PreshMap @@ -17,7 +14,7 @@ from libcpp.pair cimport pair from murmurhash.mrmr cimport hash64 from libc.stdint cimport int32_t -from .attrs cimport ID, LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE +from .attrs cimport ID, ENT_TYPE from . import attrs from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 26405e988..e98ef1e92 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,12 +1,8 @@ # cython: infer_types from __future__ import unicode_literals -from os import path - from libc.string cimport memset -from .lemmatizer import Lemmatizer - try: import ujson as json except ImportError: diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 59e1994a9..b2d622329 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -2,7 +2,6 @@ from .syntax.parser cimport Parser from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager -from .vocab cimport Vocab from .tagger import Tagger # TODO: The disorganization here is pretty embarrassing. At least it's only diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 1f6b587c5..4a2ef082a 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,20 +1,16 @@ import json import pathlib from collections import defaultdict -from libc.string cimport memset from cymem.cymem cimport Pool -from thinc.typedefs cimport atom_t, weight_t +from thinc.typedefs cimport atom_t from thinc.extra.eg cimport Example from thinc.structs cimport ExampleC from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linalg cimport VecVec -from .typedefs cimport attr_t from .tokens.doc cimport Doc from .attrs cimport TAG -from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON -from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .gold cimport GoldParse from .attrs cimport * diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 5a4eb844a..42f090cde 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,13 +1,10 @@ # cython: embedsignature=True from __future__ import unicode_literals -import re import pathlib from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc -from cpython cimport Py_UNICODE_ISSPACE - try: import ujson as json diff --git a/spacy/util.py b/spacy/util.py index 49c51b436..b255b92db 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,11 +8,8 @@ import os.path import pathlib import sys -import six import textwrap -from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE - try: basestring except NameError: From 7568cd6bf8a156a37e3c254ea65f5a479102c424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 21 Mar 2017 23:00:13 +0100 Subject: [PATCH 25/30] Split CONLLX file using tabs and not default split separators --- bin/parser/train_ud.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index c87f40680..98a93dd88 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals import plac import json -from os import path -import shutil -import os import random -import io import pathlib from spacy.tokens import Doc from spacy.syntax.nonproj import PseudoProjectivity from spacy.language import Language from spacy.gold import GoldParse -from spacy.vocab import Vocab from spacy.tagger import Tagger from spacy.pipeline import DependencyParser, BeamDependencyParser from spacy.syntax.parser import get_templates @@ -23,7 +18,6 @@ import spacy.attrs import io - def read_conllx(loc, n=0): with io.open(loc, 'r', encoding='utf8') as file_: text = file_.read() @@ -35,7 +29,8 @@ def read_conllx(loc, n=0): lines.pop(0) tokens = [] for line in lines: - id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split() + id_, word, lemma, pos, tag, morph, head, dep, _1, \ + _2 = line.split('\t') if '-' in id_ or '.' in id_: continue try: From 08346dba1a94989c6a286e51a122a0f2661592d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 21 Mar 2017 23:18:54 +0100 Subject: [PATCH 26/30] Use specific language class instead of base Language class --- bin/parser/train_ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 98a93dd88..afc4491cb 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -129,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): random.shuffle(train_sents) scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc)) - nlp = Language(vocab=vocab, tagger=tagger, parser=parser) + nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser) nlp.end_training(model_dir) scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc)) From 07199c3e8b1f7f91c41e7d19f364c902d3e9590b Mon Sep 17 00:00:00 2001 From: Andrew Poliakov Date: Wed, 22 Mar 2017 11:43:22 +0300 Subject: [PATCH 27/30] Fix infinite recursion in spacy.info --- spacy/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 70b3363d6..62ab41c90 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -49,7 +49,3 @@ def load(name, **overrides): overrides['path'] = model_path return cls(**overrides) - - -def info(name, markdown): - info(name, markdown) From ce065e5d65bc2a880d9e1993129b6beff6468c39 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 22 Mar 2017 10:02:14 +0100 Subject: [PATCH 28/30] Fix imports --- spacy/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 70b3363d6..80bd1c539 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,7 +5,7 @@ import json from pathlib import Path from .util import set_lang_class, get_lang_class, parse_package_meta from .deprecated import resolve_model_name -from .cli.info import info +from .cli import info from . import en from . import de @@ -49,7 +49,3 @@ def load(name, **overrides): overrides['path'] = model_path return cls(**overrides) - - -def info(name, markdown): - info(name, markdown) From 4a9a1126a4aabfeb20fe555c042d333b1d6c982f Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 22 Mar 2017 10:02:59 +0100 Subject: [PATCH 29/30] Update syntax highlighting color scheme --- website/assets/css/_variables.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index bfef915be..1c38d114a 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -44,7 +44,7 @@ $color-red: #d9515d $color-green: #3ec930 $color-yellow: #f4c025 -$syntax-highlighting: ( comment: #949e9b, tag: #3ec930, number: #B084EB, selector: #FFB86C, operator: #FF2C6D, function: #09a3d5, keyword: #45A9F9, regex: #f4c025 ) +$syntax-highlighting: ( comment: #949e9b, tag: #b084eb, number: #b084eb, selector: #ffb86c, operator: #ff2c6d, function: #35b3dc, keyword: #45a9f9, regex: #f4c025 ) $pattern: $color-theme url("/assets/img/pattern_#{$theme}.jpg") center top repeat $pattern-overlay: transparent url("/assets/img/pattern_landing.jpg") center -138px no-repeat From 8bc05c2ba97dd51fa9a066def0ab82a97ca55d11 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 23 Mar 2017 11:07:59 +0100 Subject: [PATCH 30/30] Delete old training scripts (resolves #911) --- bin/parser/conll_parse.py | 130 ------------------- bin/parser/nn_train.py | 261 -------------------------------------- 2 files changed, 391 deletions(-) delete mode 100644 bin/parser/conll_parse.py delete mode 100755 bin/parser/nn_train.py diff --git a/bin/parser/conll_parse.py b/bin/parser/conll_parse.py deleted file mode 100644 index 85a81c432..000000000 --- a/bin/parser/conll_parse.py +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import codecs -import random -import time -import gzip - -import plac -import cProfile -import pstats - -import spacy.util -from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir - -from spacy.syntax.parser import GreedyParser -from spacy.syntax.parser import OracleError -from spacy.syntax.util import Config - - -def is_punct_label(label): - return label == 'P' or label.lower() == 'punct' - - -def read_gold(file_): - """Read a standard CoNLL/MALT-style format""" - sents = [] - for sent_str in file_.read().strip().split('\n\n'): - ids = [] - words = [] - heads = [] - labels = [] - tags = [] - for i, line in enumerate(sent_str.split('\n')): - id_, word, pos_string, head_idx, label = _parse_line(line) - words.append(word) - if head_idx == -1: - head_idx = i - ids.append(id_) - heads.append(head_idx) - labels.append(label) - tags.append(pos_string) - text = ' '.join(words) - sents.append((text, [words], ids, words, tags, heads, labels)) - return sents - - -def _parse_line(line): - pieces = line.split() - id_ = int(pieces[0]) - word = pieces[1] - pos = pieces[3] - head_idx = int(pieces[6]) - label = pieces[7] - return id_, word, pos, head_idx, label - - -def iter_data(paragraphs, tokenizer, gold_preproc=False): - for raw, tokenized, ids, words, tags, heads, labels in paragraphs: - assert len(words) == len(heads) - for words in tokenized: - sent_ids = ids[:len(words)] - sent_tags = tags[:len(words)] - sent_heads = heads[:len(words)] - sent_labels = labels[:len(words)] - sent_heads = _map_indices_to_tokens(sent_ids, sent_heads) - tokens = tokenizer.tokens_from_list(words) - yield tokens, sent_tags, sent_heads, sent_labels - ids = ids[len(words):] - tags = tags[len(words):] - heads = heads[len(words):] - labels = labels[len(words):] - - -def _map_indices_to_tokens(ids, heads): - mapped = [] - for head in heads: - if head not in ids: - mapped.append(None) - else: - mapped.append(ids.index(head)) - return mapped - - - -def evaluate(Language, dev_loc, model_dir): - global loss - nlp = Language() - n_corr = 0 - pos_corr = 0 - n_tokens = 0 - total = 0 - skipped = 0 - loss = 0 - with codecs.open(dev_loc, 'r', 'utf8') as file_: - paragraphs = read_gold(file_) - for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer): - assert len(tokens) == len(labels) - nlp.tagger.tag_from_strings(tokens, tag_strs) - nlp.parser(tokens) - for i, token in enumerate(tokens): - try: - pos_corr += token.tag_ == tag_strs[i] - except: - print i, token.orth_, token.tag - raise - n_tokens += 1 - if heads[i] is None: - skipped += 1 - continue - if is_punct_label(labels[i]): - continue - n_corr += token.head.i == heads[i] - total += 1 - print loss, skipped, (loss+skipped + total) - print pos_corr / n_tokens - return float(n_corr) / (total + loss) - - -def main(dev_loc, model_dir): - print evaluate(English, dev_loc, model_dir) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/parser/nn_train.py b/bin/parser/nn_train.py deleted file mode 100755 index 72c9e04f1..000000000 --- a/bin/parser/nn_train.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python -from __future__ import division -from __future__ import unicode_literals - -import os -from os import path -import shutil -import codecs -import random - -import plac -import cProfile -import pstats -import re - -import spacy.util -from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir - -from spacy.syntax.util import Config -from spacy.gold import read_json_file -from spacy.gold import GoldParse - -from spacy.scorer import Scorer - -from spacy.syntax.parser import Parser, get_templates -from spacy._theano import TheanoModel - -import theano -import theano.tensor as T - -from theano.printing import Print - -import numpy -from collections import OrderedDict, defaultdict - - -theano.config.profile = False -theano.config.floatX = 'float32' -floatX = theano.config.floatX - - -def L1(L1_reg, *weights): - return L1_reg * sum(abs(w).sum() for w in weights) - - -def L2(L2_reg, *weights): - return L2_reg * sum((w ** 2).sum() for w in weights) - - -def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6): - updates = OrderedDict() - for param in params: - value = param.get_value(borrow=True) - accu = theano.shared(np.zeros(value.shape, dtype=value.dtype), - broadcastable=param.broadcastable) - - grad = T.grad(loss, param) - accu_new = rho * accu + (1 - rho) * grad ** 2 - updates[accu] = accu_new - updates[param] = param - (eta * grad / T.sqrt(accu_new + eps)) - return updates - - -def relu(x): - return x * (x > 0) - - -def feed_layer(activation, weights, bias, input_): - return activation(T.dot(input_, weights) + bias) - - -def init_weights(n_in, n_out): - rng = numpy.random.RandomState(1235) - - weights = numpy.asarray( - rng.standard_normal(size=(n_in, n_out)) * numpy.sqrt(2.0 / n_in), - dtype=theano.config.floatX - ) - bias = numpy.zeros((n_out,), dtype=theano.config.floatX) - return [wrapper(weights, name='W'), wrapper(bias, name='b')] - - -def compile_model(n_classes, n_hidden, n_in, optimizer): - x = T.vector('x') - costs = T.ivector('costs') - loss = T.scalar('loss') - - maxent_W, maxent_b = init_weights(n_hidden, n_classes) - hidden_W, hidden_b = init_weights(n_in, n_hidden) - - # Feed the inputs forward through the network - p_y_given_x = feed_layer( - T.nnet.softmax, - maxent_W, - maxent_b, - feed_layer( - relu, - hidden_W, - hidden_b, - x)) - - loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8) - - train_model = theano.function( - name='train_model', - inputs=[x, costs], - outputs=[p_y_given_x[0], T.grad(loss, x), loss], - updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]), - on_unused_input='warn' - ) - - evaluate_model = theano.function( - name='evaluate_model', - inputs=[x], - outputs=[ - feed_layer( - T.nnet.softmax, - maxent_W, - maxent_b, - feed_layer( - relu, - hidden_W, - hidden_b, - x - ) - )[0] - ] - ) - return train_model, evaluate_model - - -def score_model(scorer, nlp, annot_tuples, verbose=False): - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold, verbose=verbose) - - -def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', - eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10, - seed=0, n_sents=0, verbose=False): - - dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') - if path.exists(dep_model_dir): - shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) - os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - - Config.write(dep_model_dir, 'config', - seed=seed, - templates=tuple(), - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), - vector_lengths=(nv_word, nv_tag, nv_label), - hidden_nodes=nv_hidden, - eta=eta, - mu=mu - ) - - # Bake-in hyper-parameters - optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps) - nlp = Language(data_dir=model_dir) - n_classes = nlp.parser.model.n_classes - train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer) - nlp.parser.model = TheanoModel(n_classes, input_spec, train, - predict, model_loc) - - if n_sents > 0: - gold_tuples = gold_tuples[:n_sents] - print "Itn.\tP.Loss\tUAS\tTag %\tToken %" - log_loc = path.join(model_dir, 'job.log') - for itn in range(n_iter): - scorer = Scorer() - loss = 0 - for _, sents in gold_tuples: - for annot_tuples, ctnt in sents: - if len(annot_tuples[1]) == 1: - continue - score_model(scorer, nlp, annot_tuples) - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - gold = GoldParse(tokens, annot_tuples, make_projective=True) - assert gold.is_projective - loss += nlp.parser.train(tokens, gold) - nlp.tagger.train(tokens, gold.tags) - random.shuffle(gold_tuples) - logline = '%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, - scorer.tags_acc, - scorer.token_acc) - print logline - with open(log_loc, 'aw') as file_: - file_.write(logline + '\n') - nlp.parser.model.end_training() - nlp.tagger.model.end_training() - nlp.vocab.strings.dump(path.join(model_dir, 'vocab', 'strings.txt')) - return nlp - - -def evaluate(nlp, gold_tuples, gold_preproc=True): - scorer = Scorer() - for raw_text, sents in gold_tuples: - for annot_tuples, brackets in sents: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - gold = GoldParse(tokens, annot_tuples) - scorer.score(tokens, gold) - return scorer - - -@plac.annotations( - train_loc=("Location of training file or directory"), - dev_loc=("Location of development file or directory"), - model_dir=("Location of output model directory",), - eval_only=("Skip training, and only evaluate", "flag", "e", bool), - n_sents=("Number of training sentences", "option", "n", int), - n_iter=("Number of training iterations", "option", "i", int), - verbose=("Verbose error reporting", "flag", "v", bool), - - nv_word=("Word vector length", "option", "W", int), - nv_tag=("Tag vector length", "option", "T", int), - nv_label=("Label vector length", "option", "L", int), - nv_hidden=("Hidden nodes length", "option", "H", int), - eta=("Learning rate", "option", "E", float), - mu=("Momentum", "option", "M", float), -) -def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, verbose=False, - nv_word=10, nv_tag=10, nv_label=10, nv_hidden=10, - eta=0.1, mu=0.9, eval_only=False): - - - - - gold_train = list(read_json_file(train_loc, lambda doc: 'wsj' in doc['id'])) - - nlp = train(English, gold_train, model_dir, - feat_set='embed', - eta=eta, mu=mu, - nv_word=nv_word, nv_tag=nv_tag, nv_label=nv_label, nv_hidden=nv_hidden, - n_sents=n_sents, n_iter=n_iter, - verbose=verbose) - - scorer = evaluate(nlp, list(read_json_file(dev_loc))) - - print 'TOK', 100-scorer.token_acc - print 'POS', scorer.tags_acc - print 'UAS', scorer.uas - print 'LAS', scorer.las - - print 'NER P', scorer.ents_p - print 'NER R', scorer.ents_r - print 'NER F', scorer.ents_f - - -if __name__ == '__main__': - plac.call(main)