From 37c7c85a86bb6472d6918dd87c99ea6a2170037f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 30 Nov 2018 20:16:14 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20New=20JSON=20helpers,=20training?= =?UTF-8?q?=20data=20internals=20&=20CLI=20rewrite=20(#2932)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command --- requirements.txt | 2 + setup.py | 2 + spacy/__main__.py | 49 +-- spacy/cli/__init__.py | 27 +- spacy/cli/_messages.py | 30 ++ spacy/cli/convert.py | 86 +++- spacy/cli/converters/__init__.py | 10 +- spacy/cli/converters/conll_ner2json.py | 46 +-- spacy/cli/converters/conllu2json.py | 85 ++-- spacy/cli/converters/conllubio2json.py | 48 +-- spacy/cli/converters/iob2json.py | 58 ++- spacy/cli/converters/jsonl2json.py | 30 +- spacy/cli/debug_data.py | 398 ++++++++++++++++++ spacy/cli/download.py | 56 +-- spacy/cli/evaluate.py | 142 ++++--- spacy/cli/info.py | 66 +-- spacy/cli/init_model.py | 137 ++++--- spacy/cli/link.py | 31 +- spacy/cli/package.py | 125 +++--- spacy/cli/pretrain.py | 280 ++++++------- spacy/cli/profile.py | 65 +-- spacy/cli/schemas/__init__.py | 51 +++ spacy/cli/schemas/meta.json | 128 ++++++ spacy/cli/schemas/training.json | 146 +++++++ spacy/cli/train.py | 450 ++++++++++++--------- spacy/cli/ud/__init__.py | 2 + spacy/cli/{ => ud}/conll17_ud_eval.py | 3 +- spacy/cli/{ => ud}/ud_run_test.py | 237 ++++++----- spacy/cli/{ => ud}/ud_train.py | 366 ++++++++++------- spacy/cli/validate.py | 119 +++--- spacy/cli/vocab.py | 59 --- spacy/compat.py | 10 - spacy/displacy/__init__.py | 10 +- spacy/errors.py | 6 + spacy/gold.pyx | 113 +++--- spacy/tests/doc/test_doc_api.py | 18 - 
spacy/tests/doc/test_to_json.py | 65 +++ spacy/tests/matcher/test_phrase_matcher.py | 1 - spacy/tests/test_gold.py | 33 -- spacy/tests/test_json_schemas.py | 44 ++ spacy/tests/test_misc.py | 1 - spacy/tokens/doc.pyx | 64 +-- spacy/tokens/printers.py | 74 ---- spacy/util.py | 91 ++--- website/api/_top-level/_util.jade | 31 -- website/api/cli.jade | 120 ++++-- 46 files changed, 2476 insertions(+), 1539 deletions(-) create mode 100644 spacy/cli/debug_data.py create mode 100644 spacy/cli/schemas/__init__.py create mode 100644 spacy/cli/schemas/meta.json create mode 100644 spacy/cli/schemas/training.json create mode 100644 spacy/cli/ud/__init__.py rename spacy/cli/{ => ud}/conll17_ud_eval.py (99%) rename spacy/cli/{ => ud}/ud_run_test.py (55%) rename spacy/cli/{ => ud}/ud_train.py (57%) delete mode 100644 spacy/cli/vocab.py create mode 100644 spacy/tests/doc/test_to_json.py create mode 100644 spacy/tests/test_json_schemas.py delete mode 100644 spacy/tokens/printers.py diff --git a/requirements.txt b/requirements.txt index 545bab989..c6d43ddd7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,8 @@ ujson>=1.35 dill>=0.2,<0.3 regex==2018.01.10 requests>=2.13.0,<3.0.0 +jsonschema>=2.6.0,<3.0.0 +wasabi>=0.0.8,<1.1.0 pathlib==1.0.1; python_version < "3.4" # Development dependencies pytest>=4.0.0,<5.0.0 diff --git a/setup.py b/setup.py index a230fbd4a..8e956f3bd 100755 --- a/setup.py +++ b/setup.py @@ -207,6 +207,8 @@ def setup_package(): "regex==2018.01.10", "dill>=0.2,<0.3", "requests>=2.13.0,<3.0.0", + "jsonschema>=2.6.0,<3.0.0", + "wasabi>=0.0.8,<1.1.0", 'pathlib==1.0.1; python_version < "3.4"', ], setup_requires=["wheel"], diff --git a/spacy/__main__.py b/spacy/__main__.py index 5d712ea15..a1679d7fd 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -1,40 +1,41 @@ # coding: utf8 from __future__ import print_function + # NB! This breaks in plac on Python 2!! 
# from __future__ import unicode_literals -if __name__ == '__main__': +if __name__ == "__main__": import plac import sys + from wasabi import Printer from spacy.cli import download, link, info, package, train, pretrain, convert - from spacy.cli import vocab, init_model, profile, evaluate, validate - from spacy.cli import ud_train, ud_evaluate - from spacy.util import prints + from spacy.cli import init_model, profile, evaluate, validate + from spacy.cli import ud_train, ud_evaluate, debug_data + + msg = Printer() commands = { - 'download': download, - 'link': link, - 'info': info, - 'train': train, - 'pretrain': pretrain, - 'ud-train': ud_train, - 'evaluate': evaluate, - 'ud-evaluate': ud_evaluate, - 'convert': convert, - 'package': package, - 'vocab': vocab, - 'init-model': init_model, - 'profile': profile, - 'validate': validate + "download": download, + "link": link, + "info": info, + "train": train, + "pretrain": pretrain, + "debug-data": debug_data, + "ud-train": ud_train, + "evaluate": evaluate, + "ud-evaluate": ud_evaluate, + "convert": convert, + "package": package, + "init-model": init_model, + "profile": profile, + "validate": validate, } if len(sys.argv) == 1: - prints(', '.join(commands), title="Available commands", exits=1) + msg.info("Available commands", ", ".join(commands), exits=1) command = sys.argv.pop(1) - sys.argv[0] = 'spacy %s' % command + sys.argv[0] = "spacy %s" % command if command in commands: plac.call(commands[command], sys.argv[1:]) else: - prints( - "Available: %s" % ', '.join(commands), - title="Unknown command: %s" % command, - exits=1) + available = "Available: {}".format(", ".join(commands)) + msg.fail("Unknown command: {}".format(command), available, exits=1) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 5497c55ce..4ab1c7c55 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -1,14 +1,13 @@ -from .download import download -from .info import info -from .link import link -from .package import package -from .profile import profile -from .train import train -from .pretrain import pretrain -from .evaluate import evaluate -from .convert import convert -from .vocab import make_vocab as vocab -from .init_model import init_model -from .validate import validate -from .ud_train import main as ud_train -from .conll17_ud_eval import main as ud_evaluate +from .download import download # noqa: F401 +from .info import info # noqa: F401 +from .link import link # noqa: F401 +from .package import package # noqa: F401 +from .profile import profile # noqa: F401 +from .train import train # noqa: F401 +from .pretrain import pretrain # noqa: F401 +from .debug_data import debug_data # noqa: F401 +from .evaluate import evaluate # noqa: F401 +from .convert import convert # noqa: F401 +from .init_model import init_model # noqa: F401 +from .validate import validate # noqa: F401 +from .ud import ud_train, ud_evaluate # noqa: F401 diff --git a/spacy/cli/_messages.py b/spacy/cli/_messages.py index 01ec9dbf6..2ac6599c5 100644 --- a/spacy/cli/_messages.py +++ b/spacy/cli/_messages.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals +# fmt: off + class Messages(object): M001 = ("Download successful but linking failed") M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " @@ -73,3 +75,31 @@ class Messages(object): M052 = ("Not a valid meta.json format") M053 = ("Expected dict but got: {meta_type}") M054 = ("No --lang specified, but tokenization required.") + M055 = ("Training pipeline: {pipeline}") + M056 = ("Starting with base model 
'{model}'") + M057 = ("Starting with blank model '{model}'") + M058 = ("Loading vector from model '{model}'") + M059 = ("Can't use multitask objective without '{pipe}' in the pipeline") + M060 = ("Counting training words (limit={limit})") + M061 = ("\nSaving model...") + M062 = ("Output directory is not empty.") + M063 = ("Incompatible arguments") + M064 = ("The -f and -c arguments are deprecated, and not compatible with " + "the -j argument, which should specify the same information. " + "Either merge the frequencies and clusters data into the " + "JSONL-formatted file (recommended), or use only the -f and -c " + "files, without the other lexical attributes.") + M065 = ("This can lead to unintended side effects when saving the model. " + "Please use an empty directory or a different path instead. If " + "the specified output path doesn't exist, the directory will be " + "created for you.") + M066 = ("Saved model to output directory") + M067 = ("Can't find lexical data") + M068 = ("Sucessfully compiled vocab and vectors, and saved model") + M069 = ("Unknown file type: '{name}'") + M070 = ("Supported file types: '{options}'") + M071 = ("Loaded pretrained tok2vec for: {components}") + M072 = ("Model language ('{model_lang}') doesn't match language specified " + "as `lang` argument ('{lang}') ") + +# fmt: on diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index cfbb9e56a..b41b22036 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -3,49 +3,91 @@ from __future__ import unicode_literals import plac from pathlib import Path +from wasabi import Printer +from ..util import write_jsonl, write_json +from ..compat import json_dumps, path2str from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json from .converters import ner_jsonl2json from ._messages import Messages -from ..util import prints + # Converters are matched by file extension. To add a converter, add a new # entry to this dict with the file extension mapped to the converter function # imported from /converters. CONVERTERS = { - 'conllubio': conllubio2json, - 'conllu': conllu2json, - 'conll': conllu2json, - 'ner': conll_ner2json, - 'iob': iob2json, - 'jsonl': ner_jsonl2json + "conllubio": conllubio2json, + "conllu": conllu2json, + "conll": conllu2json, + "ner": conll_ner2json, + "iob": iob2json, + "jsonl": ner_jsonl2json, } +# File types +FILE_TYPES = ("json", "jsonl") + @plac.annotations( - input_file=("input file", "positional", None, str), - output_dir=("output directory for converted file", "positional", None, str), + input_file=("Input file", "positional", None, str), + output_dir=("Output directory for converted file", "positional", None, str), + file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str), n_sents=("Number of sentences per doc", "option", "n", int), converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), - morphology=("Enable appending morphology to tags", "flag", "m", bool)) -def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto', - lang=None): + morphology=("Enable appending morphology to tags", "flag", "m", bool), +) +def convert( + input_file, + output_dir="-", + file_type="jsonl", + n_sents=1, + morphology=False, + converter="auto", + lang=None, +): """ Convert files into JSON format for use with train command and other - experiment management functions. + experiment management functions. 
If no output_dir is specified, the data + is written to stdout, so you can pipe them forward to a JSONL file: + $ spacy convert some_file.conllu > some_file.jsonl """ + msg = Printer() input_path = Path(input_file) - output_path = Path(output_dir) + if file_type not in FILE_TYPES: + msg.fail( + Messages.M069.format(name=file_type), + Messages.M070.format(options=", ".join(FILE_TYPES)), + exits=1, + ) if not input_path.exists(): - prints(input_path, title=Messages.M028, exits=1) - if not output_path.exists(): - prints(output_path, title=Messages.M029, exits=1) - if converter == 'auto': + msg.fail(Messages.M028, input_path, exits=1) + if output_dir != "-" and not Path(output_dir).exists(): + msg.fail(Messages.M029, output_dir, exits=1) + if converter == "auto": converter = input_path.suffix[1:] if converter not in CONVERTERS: - prints(Messages.M031.format(converter=converter), - title=Messages.M030, exits=1) + msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1) + # Use converter function to convert data func = CONVERTERS[converter] - func(input_path, output_path, - n_sents=n_sents, use_morphology=morphology, lang=lang) + input_data = input_path.open("r", encoding="utf-8").read() + data = func(input_data, n_sents=n_sents, use_morphology=morphology, lang=lang) + if output_dir != "-": + # Export data to a file + suffix = ".{}".format(file_type) + output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) + if file_type == "json": + write_json(output_file, data) + elif file_type == "jsonl": + write_jsonl(output_file, data) + msg.good( + Messages.M032.format(name=path2str(output_file)), + Messages.M033.format(n_docs=len(data)), + ) + else: + # Print to stdout + if file_type == "json": + print(json_dumps(data)) + elif file_type == "jsonl": + for line in data: + print(json_dumps(line)) diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index c6898fa98..c0be857a8 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,5 +1,5 @@ -from .conllu2json import conllu2json -from .conllubio2json import conllubio2json -from .iob2json import iob2json -from .conll_ner2json import conll_ner2json -from .jsonl2json import ner_jsonl2json +from .conllu2json import conllu2json # noqa: F401 +from .conllubio2json import conllubio2json # noqa: F401 +from .iob2json import iob2json # noqa: F401 +from .conll_ner2json import conll_ner2json # noqa: F401 +from .jsonl2json import ner_jsonl2json # noqa: F401 diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py index f53261488..3fd51a140 100644 --- a/spacy/cli/converters/conll_ner2json.py +++ b/spacy/cli/converters/conll_ner2json.py @@ -1,52 +1,38 @@ # coding: utf8 from __future__ import unicode_literals -from .._messages import Messages -from ...compat import json_dumps, path2str -from ...util import prints from ...gold import iob_to_biluo -def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): +def conll_ner2json(input_data, **kwargs): """ Convert files in the CoNLL-2003 NER format into JSON format for use with train cli. 
""" - docs = read_conll_ner(input_path) - - output_filename = input_path.parts[-1].replace(".conll", "") + ".json" - output_filename = input_path.parts[-1].replace(".conll", "") + ".json" - output_file = output_path / output_filename - with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(docs)) - prints(Messages.M033.format(n_docs=len(docs)), - title=Messages.M032.format(name=path2str(output_file))) - - -def read_conll_ner(input_path): - text = input_path.open('r', encoding='utf-8').read() - i = 0 - delimit_docs = '-DOCSTART- -X- O O' + delimit_docs = "-DOCSTART- -X- O O" output_docs = [] - for doc in text.strip().split(delimit_docs): + for doc in input_data.strip().split(delimit_docs): doc = doc.strip() if not doc: continue output_doc = [] - for sent in doc.split('\n\n'): + for sent in doc.split("\n\n"): sent = sent.strip() if not sent: continue - lines = [line.strip() for line in sent.split('\n') if line.strip()] + lines = [line.strip() for line in sent.split("\n") if line.strip()] words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) biluo_ents = iob_to_biluo(iob_ents) - output_doc.append({'tokens': [ - {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in - zip(words, tags, biluo_ents) - ]}) - output_docs.append({ - 'id': len(output_docs), - 'paragraphs': [{'sentences': output_doc}] - }) + output_doc.append( + { + "tokens": [ + {"orth": w, "tag": tag, "ner": ent} + for (w, tag, ent) in zip(words, tags, biluo_ents) + ] + } + ) + output_docs.append( + {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} + ) output_doc = [] return output_docs diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index e26020ad2..f1102a94a 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -1,34 +1,27 @@ # coding: utf8 from __future__ import unicode_literals -from .._messages import Messages -from ...compat import json_dumps, path2str -from ...util import prints -from ...gold import iob_to_biluo import re +from ...gold import iob_to_biluo -def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): +def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None): """ Convert conllu files into JSON format for use with train cli. use_morphology parameter enables appending morphology to tags, which is useful for languages such as Spanish, where UD tags are not so rich. 
- """ - # by @dvsrepo, via #11 explosion/spacy-dev-resources - """ Extract NER tags if available and convert them so that they follow BILUO and the Wikipedia scheme """ + # by @dvsrepo, via #11 explosion/spacy-dev-resources # by @katarkor - docs = [] sentences = [] - conll_tuples = read_conllx(input_path, use_morphology=use_morphology) + conll_tuples = read_conllx(input_data, use_morphology=use_morphology) checked_for_ner = False has_ner_tags = False - for i, (raw_text, tokens) in enumerate(conll_tuples): sentence, brackets = tokens[0] if not checked_for_ner: @@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang= sentences.append(generate_sentence(sentence, has_ner_tags)) # Real-sized documents could be extracted using the comments on the # conluu document - - if(len(sentences) % n_sents == 0): + if len(sentences) % n_sents == 0: doc = create_doc(sentences, i) docs.append(doc) sentences = [] - - output_filename = input_path.parts[-1].replace(".conll", ".json") - output_filename = input_path.parts[-1].replace(".conllu", ".json") - output_file = output_path / output_filename - with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(docs)) - prints(Messages.M033.format(n_docs=len(docs)), - title=Messages.M032.format(name=path2str(output_file))) + return docs def is_ner(tag): - - """ - Check the 10th column of the first token to determine if the file contains - NER tags """ - - tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag) + Check the 10th column of the first token to determine if the file contains + NER tags + """ + tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag) if tag_match: return True elif tag == "O": @@ -67,29 +50,29 @@ def is_ner(tag): else: return False -def read_conllx(input_path, use_morphology=False, n=0): - text = input_path.open('r', encoding='utf-8').read() + +def read_conllx(input_data, use_morphology=False, n=0): i = 0 - for sent in text.strip().split('\n\n'): - lines = sent.strip().split('\n') + for sent in input_data.strip().split("\n\n"): + lines = sent.strip().split("\n") if lines: - while lines[0].startswith('#'): + while lines[0].startswith("#"): lines.pop(0) tokens = [] for line in lines: - parts = line.split('\t') + parts = line.split("\t") id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts - if '-' in id_ or '.' in id_: + if "-" in id_ or "." in id_: continue try: id_ = int(id_) - 1 - head = (int(head) - 1) if head != '0' else id_ - dep = 'ROOT' if dep == 'root' else dep - tag = pos if tag == '_' else tag - tag = tag+'__'+morph if use_morphology else tag + head = (int(head) - 1) if head != "0" else id_ + dep = "ROOT" if dep == "root" else dep + tag = pos if tag == "_" else tag + tag = tag + "__" + morph if use_morphology else tag tokens.append((id_, word, tag, head, dep, iob)) - except: + except: # noqa: E722 print(line) raise tuples = [list(t) for t in zip(*tokens)] @@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0): if n >= 1 and i >= n: break + def simplify_tags(iob): - """ Simplify tags obtained from the dataset in order to follow Wikipedia scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to - 'MISC'. + 'MISC'. 
""" - new_iob = [] for tag in iob: - tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag) + tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag) if tag_match: prefix = tag_match.group(1) suffix = tag_match.group(2) - if suffix == 'GPE_LOC': - suffix = 'LOC' - elif suffix == 'GPE_ORG': - suffix = 'ORG' - elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG': - suffix = 'MISC' - tag = prefix + '-' + suffix + if suffix == "GPE_LOC": + suffix = "LOC" + elif suffix == "GPE_ORG": + suffix = "ORG" + elif suffix != "PER" and suffix != "LOC" and suffix != "ORG": + suffix = "MISC" + tag = prefix + "-" + suffix new_iob.append(tag) return new_iob + def generate_sentence(sent, has_ner_tags): (id_, word, tag, head, dep, iob) = sent sentence = {} @@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags): return sentence -def create_doc(sentences,id): +def create_doc(sentences, id): doc = {} paragraph = {} doc["id"] = id diff --git a/spacy/cli/converters/conllubio2json.py b/spacy/cli/converters/conllubio2json.py index 881b8c533..bd6ee7996 100644 --- a/spacy/cli/converters/conllubio2json.py +++ b/spacy/cli/converters/conllubio2json.py @@ -1,65 +1,54 @@ # coding: utf8 from __future__ import unicode_literals -from ...compat import json_dumps, path2str -from ...util import prints from ...gold import iob_to_biluo -def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): + +def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None): """ Convert conllu files into JSON format for use with train cli. use_morphology parameter enables appending morphology to tags, which is useful for languages such as Spanish, where UD tags are not so rich. """ # by @dvsrepo, via #11 explosion/spacy-dev-resources - docs = [] sentences = [] - conll_tuples = read_conllx(input_path, use_morphology=use_morphology) - + conll_tuples = read_conllx(input_data, use_morphology=use_morphology) for i, (raw_text, tokens) in enumerate(conll_tuples): sentence, brackets = tokens[0] sentences.append(generate_sentence(sentence)) # Real-sized documents could be extracted using the comments on the # conluu document - if(len(sentences) % n_sents == 0): + if len(sentences) % n_sents == 0: doc = create_doc(sentences, i) docs.append(doc) sentences = [] - - output_filename = input_path.parts[-1].replace(".conll", ".json") - output_filename = input_path.parts[-1].replace(".conllu", ".json") - output_file = output_path / output_filename - with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(docs)) - prints("Created %d documents" % len(docs), - title="Generated output file %s" % path2str(output_file)) + return docs -def read_conllx(input_path, use_morphology=False, n=0): - text = input_path.open('r', encoding='utf-8').read() +def read_conllx(input_data, use_morphology=False, n=0): i = 0 - for sent in text.strip().split('\n\n'): - lines = sent.strip().split('\n') + for sent in input_data.strip().split("\n\n"): + lines = sent.strip().split("\n") if lines: - while lines[0].startswith('#'): + while lines[0].startswith("#"): lines.pop(0) tokens = [] for line in lines: - parts = line.split('\t') + parts = line.split("\t") id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts - if '-' in id_ or '.' in id_: + if "-" in id_ or "." 
in id_: continue try: id_ = int(id_) - 1 - head = (int(head) - 1) if head != '0' else id_ - dep = 'ROOT' if dep == 'root' else dep - tag = pos if tag == '_' else tag - tag = tag+'__'+morph if use_morphology else tag - ner = ner if ner else 'O' + head = (int(head) - 1) if head != "0" else id_ + dep = "ROOT" if dep == "root" else dep + tag = pos if tag == "_" else tag + tag = tag + "__" + morph if use_morphology else tag + ner = ner if ner else "O" tokens.append((id_, word, tag, head, dep, ner)) - except: + except: # noqa: E722 print(line) raise tuples = [list(t) for t in zip(*tokens)] @@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0): if n >= 1 and i >= n: break + def generate_sentence(sent): (id_, word, tag, head, dep, ner) = sent sentence = {} @@ -85,7 +75,7 @@ def generate_sentence(sent): return sentence -def create_doc(sentences,id): +def create_doc(sentences, id): doc = {} paragraph = {} doc["id"] = id diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 5a0e9e046..24e78989b 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,26 +1,24 @@ # coding: utf8 from __future__ import unicode_literals -from cytoolz import partition_all, concat -from .._messages import Messages -from ...compat import json_dumps, path2str -from ...util import prints +from cytoolz import partition_all + from ...gold import iob_to_biluo -def iob2json(input_path, output_path, n_sents=10, *a, **k): +def iob2json(input_data, n_sents=10, *args, **kwargs): """ Convert IOB files into JSON format for use with train cli. """ - with input_path.open('r', encoding='utf8') as file_: - sentences = read_iob(file_) - docs = merge_sentences(sentences, n_sents) - output_filename = input_path.parts[-1].replace(".iob", ".json") - output_file = output_path / output_filename - with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(docs)) - prints(Messages.M033.format(n_docs=len(docs)), - title=Messages.M032.format(name=path2str(output_file))) + docs = [] + for group in partition_all(n_sents, docs): + group = list(group) + first = group.pop(0) + to_extend = first["paragraphs"][0]["sentences"] + for sent in group[1:]: + to_extend.extend(sent["paragraphs"][0]["sentences"]) + docs.append(first) + return docs def read_iob(raw_sents): @@ -28,30 +26,20 @@ def read_iob(raw_sents): for line in raw_sents: if not line.strip(): continue - tokens = [t.split('|') for t in line.split()] + tokens = [t.split("|") for t in line.split()] if len(tokens[0]) == 3: words, pos, iob = zip(*tokens) else: words, iob = zip(*tokens) - pos = ['-'] * len(words) + pos = ["-"] * len(words) biluo = iob_to_biluo(iob) - sentences.append([ - {'orth': w, 'tag': p, 'ner': ent} - for (w, p, ent) in zip(words, pos, biluo) - ]) - sentences = [{'tokens': sent} for sent in sentences] - paragraphs = [{'sentences': [sent]} for sent in sentences] - docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] + sentences.append( + [ + {"orth": w, "tag": p, "ner": ent} + for (w, p, ent) in zip(words, pos, biluo) + ] + ) + sentences = [{"tokens": sent} for sent in sentences] + paragraphs = [{"sentences": [sent]} for sent in sentences] + docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs] return docs - -def merge_sentences(docs, n_sents): - counter = 0 - merged = [] - for group in partition_all(n_sents, docs): - group = list(group) - first = group.pop(0) - to_extend = first['paragraphs'][0]['sentences'] - for sent in group[1:]: - 
to_extend.extend(sent['paragraphs'][0]['sentences']) - merged.append(first) - return merged diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 3508a05cd..26fdca302 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -1,33 +1,21 @@ # coding: utf8 from __future__ import unicode_literals -import ujson as json +import ujson + +from ...util import get_lang_class from .._messages import Messages -from ...compat import json_dumps, path2str -from ...util import prints, get_lang_class -from ...gold import docs_to_json -def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False): +def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False): if lang is None: - prints(Messages.M054, exits=True) + raise ValueError(Messages.M054) json_docs = [] - input_tuples = list(read_jsonl(input_path)) + input_tuples = [ujson.loads(line) for line in input_data] nlp = get_lang_class(lang)() for i, (raw_text, ents) in enumerate(input_tuples): doc = nlp.make_doc(raw_text) doc[0].is_sent_start = True - doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']] - json_docs.append(docs_to_json(i, [doc])) - - output_filename = input_path.parts[-1].replace(".jsonl", ".json") - output_loc = output_path / output_filename - with (output_loc).open('w', encoding='utf8') as file_: - file_.write(json_dumps(json_docs)) - prints(Messages.M033.format(n_docs=len(json_docs)), - title=Messages.M032.format(name=path2str(output_loc))) - -def read_jsonl(input_path): - with input_path.open('r', encoding='utf8') as file_: - for line in file_: - yield json.loads(line) + doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]] + json_docs.append(doc.to_json()) + return json_docs diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py new file mode 100644 index 000000000..5bf602828 --- /dev/null +++ b/spacy/cli/debug_data.py @@ -0,0 +1,398 @@ +# coding: utf8 +from __future__ import unicode_literals, print_function + +from pathlib import Path +from collections import Counter +import plac +import sys +from wasabi import Printer, MESSAGES + +from ..gold import GoldCorpus, read_json_object +from ..util import load_model, get_lang_class, read_json, read_jsonl + +# from .schemas import get_schema, validate_json +from ._messages import Messages + + +# Minimum number of expected occurences of label in data to train new label +NEW_LABEL_THRESHOLD = 50 +# Minimum number of expected examples to train a blank model +BLANK_MODEL_MIN_THRESHOLD = 100 +BLANK_MODEL_THRESHOLD = 2000 + + +@plac.annotations( + lang=("model language", "positional", None, str), + train_path=("location of JSON-formatted training data", "positional", None, Path), + dev_path=("location of JSON-formatted development data", "positional", None, Path), + base_model=("name of model to update (optional)", "option", "b", str), + pipeline=( + "Comma-separated names of pipeline components to train", + "option", + "p", + str, + ), + ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), + ignore_validation=( + "Don't exit if JSON format validation fails", + "flag", + "IV", + bool, + ), + verbose=("Print additional information and explanations", "flag", "V", bool), + no_format=("Don't pretty-print the results", "flag", "NF", bool), +) +def debug_data( + lang, + train_path, + dev_path, + base_model=None, + pipeline="tagger,parser,ner", + ignore_warnings=False, + ignore_validation=False, + verbose=False, + 
no_format=False, +): + msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) + + # Make sure all files and paths exist if they are needed + if not train_path.exists(): + msg.fail(Messages.M050, train_path, exits=1) + if not dev_path.exists(): + msg.fail(Messages.M051, dev_path, exits=1) + + # Initialize the model and pipeline + pipeline = [p.strip() for p in pipeline.split(",")] + if base_model: + nlp = load_model(base_model) + else: + lang_cls = get_lang_class(lang) + nlp = lang_cls() + + msg.divider("Data format validation") + # Load the data in one – might take a while but okay in this case + with msg.loading("Loading {}...".format(train_path.parts[-1])): + train_data = _load_file(train_path, msg) + with msg.loading("Loading {}...".format(dev_path.parts[-1])): + dev_data = _load_file(dev_path, msg) + + # Validate data format using the JSON schema + # TODO: update once the new format is ready + # schema = get_schema("training") + train_data_errors = [] # TODO: validate_json(train_data, schema) + dev_data_errors = [] # TODO: validate_json(dev_data, schema) + if not train_data_errors: + msg.good("Training data JSON format is valid") + if not dev_data_errors: + msg.good("Development data JSON format is valid") + for error in train_data_errors: + msg.fail("Training data: {}".format(error)) + for error in dev_data_errors: + msg.fail("Development data: {}".format(error)) + if (train_data_errors or dev_data_errors) and not ignore_validation: + sys.exit(1) + + # Create the gold corpus to be able to better analyze data + with msg.loading("Analyzing corpus..."): + train_data = read_json_object(train_data) + dev_data = read_json_object(dev_data) + corpus = GoldCorpus(train_data, dev_data) + train_docs = list(corpus.train_docs(nlp)) + dev_docs = list(corpus.dev_docs(nlp)) + msg.good("Corpus is loadable") + + # Create all gold data here to avoid iterating over the train_docs constantly + gold_data = _compile_gold(train_docs, pipeline) + train_texts = gold_data["texts"] + dev_texts = set([doc.text for doc, gold in dev_docs]) + + msg.divider("Training stats") + msg.text("Training pipeline: {}".format(", ".join(pipeline))) + for pipe in [p for p in pipeline if p not in nlp.factories]: + msg.fail("Pipeline component '{}' not available in factories".format(pipe)) + if base_model: + msg.text("Starting with base model '{}'".format(base_model)) + else: + msg.text("Starting with blank model '{}'".format(lang)) + msg.text("{} training docs".format(len(train_docs))) + msg.text("{} evaluation docs".format(len(dev_docs))) + + overlap = len(train_texts.intersection(dev_texts)) + if overlap: + msg.warn("{} training examples also in evaluation data".format(overlap)) + else: + msg.good("No overlap between training and evaluation data") + if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD: + text = "Low number of examples to train from a blank model ({})".format( + len(train_docs) + ) + if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD: + msg.fail(text) + else: + msg.warn(text) + msg.text( + "It's recommended to use at least {} examples (minimum {})".format( + BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD + ), + show=verbose, + ) + + msg.divider("Vocab & Vectors") + n_words = gold_data["n_words"] + msg.info( + "{} total {} in the data ({} unique)".format( + n_words, "word" if n_words == 1 else "words", len(gold_data["words"]) + ) + ) + most_common_words = gold_data["words"].most_common(10) + msg.text( + "10 most common words: {}".format( + _format_labels(most_common_words, counts=True) + ), + 
show=verbose, + ) + if len(nlp.vocab.vectors): + msg.info( + "{} vectors ({} unique keys, {} dimensions)".format( + len(nlp.vocab.vectors), + nlp.vocab.vectors.n_keys, + nlp.vocab.vectors_length, + ) + ) + else: + msg.info("No word vectors present in the model") + + if "ner" in pipeline: + # Get all unique NER labels present in the data + labels = set(label for label in gold_data["ner"] if label not in ("O", "-")) + label_counts = gold_data["ner"] + model_labels = _get_labels_from_model(nlp, "ner") + new_labels = [l for l in labels if l not in model_labels] + existing_labels = [l for l in labels if l in model_labels] + has_low_data_warning = False + has_no_neg_warning = False + + msg.divider("Named Entity Recognition") + msg.info( + "{} new {}, {} existing {}".format( + len(new_labels), + "label" if len(new_labels) == 1 else "labels", + len(existing_labels), + "label" if len(existing_labels) == 1 else "labels", + ) + ) + missing_values = label_counts["-"] + msg.text( + "{} missing {} (tokens with '-' label)".format( + missing_values, "value" if missing_values == 1 else "values" + ) + ) + if new_labels: + labels_with_counts = [ + (label, count) + for label, count in label_counts.most_common() + if label != "-" + ] + labels_with_counts = _format_labels(labels_with_counts, counts=True) + msg.text("New: {}".format(labels_with_counts), show=verbose) + if existing_labels: + msg.text( + "Existing: {}".format(_format_labels(existing_labels)), show=verbose + ) + + for label in new_labels: + if label_counts[label] <= NEW_LABEL_THRESHOLD: + msg.warn( + "Low number of examples for new label '{}' ({})".format( + label, label_counts[label] + ) + ) + has_low_data_warning = True + + with msg.loading("Analyzing label distribution..."): + neg_docs = _get_examples_without_label(train_docs, label) + if neg_docs == 0: + msg.warn( + "No examples for texts WITHOUT new label '{}'".format(label) + ) + has_no_neg_warning = True + + if not has_low_data_warning: + msg.good("Good amount of examples for all labels") + if not has_no_neg_warning: + msg.good("Examples without occurrences available for all labels") + + if has_low_data_warning: + msg.text( + "To train a new entity type, your data should include at " + "least {} instances of the new label".format(NEW_LABEL_THRESHOLD), + show=verbose, + ) + if has_no_neg_warning: + msg.text( + "Training data should always include examples of entities " + "in context, as well as examples without a given entity " + "type.", + show=verbose, + ) + + if "textcat" in pipeline: + msg.divider("Text Classification") + labels = [label for label in gold_data["textcat"]] + model_labels = _get_labels_from_model(nlp, "textcat") + new_labels = [l for l in labels if l not in model_labels] + existing_labels = [l for l in labels if l in model_labels] + msg.info( + "Text Classification: {} new label(s), {} existing label(s)".format( + len(new_labels), len(existing_labels) + ) + ) + if new_labels: + labels_with_counts = _format_labels( + gold_data["textcat"].most_common(), counts=True + ) + msg.text("New: {}".format(labels_with_counts), show=verbose) + if existing_labels: + msg.text( + "Existing: {}".format(_format_labels(existing_labels)), show=verbose + ) + + if "tagger" in pipeline: + msg.divider("Part-of-speech Tagging") + labels = [label for label in gold_data["tags"]] + tag_map = nlp.Defaults.tag_map + msg.info( + "{} {} in data ({} {} in tag map)".format( + len(labels), + "label" if len(labels) == 1 else "labels", + len(tag_map), + "label" if len(tag_map) == 1 else "labels", + ) + ) + 
labels_with_counts = _format_labels( + gold_data["tags"].most_common(), counts=True + ) + msg.text(labels_with_counts, show=verbose) + non_tagmap = [l for l in labels if l not in tag_map] + if not non_tagmap: + msg.good("All labels present in tag map for language '{}'".format(nlp.lang)) + for label in non_tagmap: + msg.fail( + "Label '{}' not found in tag map for language '{}'".format( + label, nlp.lang + ) + ) + + if "parser" in pipeline: + msg.divider("Dependency Parsing") + labels = [label for label in gold_data["deps"]] + msg.info( + "{} {} in data".format( + len(labels), "label" if len(labels) == 1 else "labels" + ) + ) + labels_with_counts = _format_labels( + gold_data["deps"].most_common(), counts=True + ) + msg.text(labels_with_counts, show=verbose) + + msg.divider("Summary") + good_counts = msg.counts[MESSAGES.GOOD] + warn_counts = msg.counts[MESSAGES.WARN] + fail_counts = msg.counts[MESSAGES.FAIL] + if good_counts: + msg.good( + "{} {} passed".format( + good_counts, "check" if good_counts == 1 else "checks" + ) + ) + if warn_counts: + msg.warn( + "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings") + ) + if fail_counts: + msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors")) + + if fail_counts: + sys.exit(1) + + +def _load_file(file_path, msg): + file_name = file_path.parts[-1] + if file_path.suffix == ".json": + data = read_json(file_path) + msg.good("Loaded {}".format(file_name)) + return data + elif file_path.suffix == ".jsonl": + data = read_jsonl(file_path) + msg.good("Loaded {}".format(file_name)) + return data + msg.fail( + "Can't load file extension {}".format(file_path.suffix), + "Expected .json or .jsonl", + exits=1, + ) + + +def _compile_gold(train_docs, pipeline): + data = { + "ner": Counter(), + "cats": Counter(), + "tags": Counter(), + "deps": Counter(), + "words": Counter(), + "n_words": 0, + "texts": set(), + } + for doc, gold in train_docs: + data["words"].update(gold.words) + data["n_words"] += len(gold.words) + data["texts"].add(doc.text) + if "ner" in pipeline: + for label in gold.ner: + if label.startswith(("B-", "U-")): + combined_label = label.split("-")[1] + data["ner"][combined_label] += 1 + elif label == "-": + data["ner"]["-"] += 1 + if "textcat" in pipeline: + data["cats"].update(gold.cats) + if "tagger" in pipeline: + data["tags"].update(gold.tags) + if "parser" in pipeline: + data["deps"].update(gold.labels) + return data + + +def _format_labels(labels, counts=False): + if counts: + return ", ".join(["'{}' ({})".format(l, c) for l, c in labels]) + return ", ".join(["'{}'".format(l) for l in labels]) + + +def _get_ner_counts(data): + counter = Counter() + for doc, gold in data: + for label in gold.ner: + if label.startswith(("B-", "U-")): + combined_label = label.split("-")[1] + counter[combined_label] += 1 + elif label == "-": + counter["-"] += 1 + return counter + + +def _get_examples_without_label(data, label): + count = 0 + for doc, gold in data: + labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")] + if label not in labels: + count += 1 + return count + + +def _get_labels_from_model(nlp, pipe_name): + if pipe_name not in nlp.pipe_names: + return set() + pipe = nlp.get_pipe(pipe_name) + return pipe.labels diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 2710d341b..bc725dd16 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -6,34 +6,37 @@ import requests import os import subprocess import sys +from wasabi import Printer from ._messages 
import Messages from .link import link -from ..util import prints, get_package_path +from ..util import get_package_path from .. import about +msg = Printer() + + @plac.annotations( - model=("model to download, shortcut or name", "positional", None, str), - direct=("force direct download. Needs model name with version and won't " - "perform compatibility check", "flag", "d", bool), - pip_args=("additional arguments to be passed to `pip install` when " - "installing the model")) + model=("Model to download (shortcut or name)", "positional", None, str), + direct=("Force direct download of name + version", "flag", "d", bool), + pip_args=("additional arguments to be passed to `pip install` on model install"), +) def download(model, direct=False, *pip_args): """ Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name - with version. + with version. For direct downloads, the compatibility check will be skipped. """ if direct: - dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args) + dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args) else: shortcuts = get_json(about.__shortcuts__, "available shortcuts") model_name = shortcuts.get(model, model) compatibility = get_compatibility() version = get_version(model_name, compatibility) - dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}' - .format(m=model_name, v=version), pip_args) + dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}" + dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args) if dl != 0: # if download subprocess doesn't return 0, exit sys.exit(dl) try: @@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args): # subprocess package_path = get_package_path(model_name) link(model_name, model, force=True, model_path=package_path) - except: + except: # noqa: E722 # Dirty, but since spacy.download and the auto-linking is # mostly a convenience wrapper, it's best to show a success # message and loading instructions, even if linking fails. 
- prints(Messages.M001, title=Messages.M002.format(name=model_name)) + msg.warn(Messages.M002.format(name=model_name), Messages.M001) def get_json(url, desc): r = requests.get(url) if r.status_code != 200: - prints(Messages.M004.format(desc=desc, version=about.__version__), - title=Messages.M003.format(code=r.status_code), exits=1) + msg.fail( + Messages.M003.format(code=r.status_code), + Messages.M004.format(desc=desc, version=about.__version__), + exits=1, + ) return r.json() def get_compatibility(): version = about.__version__ - version = version.rsplit('.dev', 1)[0] + version = version.rsplit(".dev", 1)[0] comp_table = get_json(about.__compatibility__, "compatibility table") - comp = comp_table['spacy'] + comp = comp_table["spacy"] if version not in comp: - prints(Messages.M006.format(version=version), title=Messages.M005, - exits=1) + msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1) return comp[version] def get_version(model, comp): - model = model.rsplit('.dev', 1)[0] + model = model.rsplit(".dev", 1)[0] if model not in comp: - prints(Messages.M007.format(name=model, version=about.__version__), - title=Messages.M005, exits=1) + msg.fail( + Messages.M005, + Messages.M007.format(name=model, version=about.__version__), + exits=1, + ) return comp[model][0] def download_model(filename, user_pip_args=None): - download_url = about.__download_url__ + '/' + filename - pip_args = ['--no-cache-dir', '--no-deps'] + download_url = about.__download_url__ + "/" + filename + pip_args = ["--no-cache-dir", "--no-deps"] if user_pip_args: pip_args.extend(user_pip_args) - cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url] + cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] return subprocess.call(cmd, env=os.environ.copy()) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 0ad319267..459c1419b 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function import plac from timeit import default_timer as timer +from wasabi import Printer from ._messages import Messages from ..gold import GoldCorpus -from ..util import prints from .. import util from .. import displacy @plac.annotations( - model=("model name or path", "positional", None, str), - data_path=("location of JSON-formatted evaluation data", "positional", - None, str), - gold_preproc=("use gold preprocessing", "flag", "G", bool), - gpu_id=("use GPU", "option", "g", int), - displacy_path=("directory to output rendered parses as HTML", "option", - "dp", str), - displacy_limit=("limit of parses to render as HTML", "option", "dl", int)) -def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None, - displacy_limit=25): + model=("Model name or path", "positional", None, str), + data_path=("Location of JSON-formatted evaluation data", "positional", None, str), + gold_preproc=("Use gold preprocessing", "flag", "G", bool), + gpu_id=("Use GPU", "option", "g", int), + displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str), + displacy_limit=("Limit of parses to render as HTML", "option", "dl", int), +) +def evaluate( + model, + data_path, + gpu_id=-1, + gold_preproc=False, + displacy_path=None, + displacy_limit=25, +): """ Evaluate a model. To render a sample of parses in a HTML file, set an output directory as the displacy_path argument. 
""" - + msg = Printer() util.fix_random_seed() if gpu_id >= 0: util.use_gpu(gpu_id) @@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None data_path = util.ensure_path(data_path) displacy_path = util.ensure_path(displacy_path) if not data_path.exists(): - prints(data_path, title=Messages.M034, exits=1) + msg.fail(Messages.M034, data_path, exits=1) if displacy_path and not displacy_path.exists(): - prints(displacy_path, title=Messages.M035, exits=1) + msg.fail(Messages.M035, displacy_path, exits=1) corpus = GoldCorpus(data_path, data_path) nlp = util.load_model(model) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) @@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None scorer = nlp.evaluate(dev_docs, verbose=False) end = timer() nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) - print_results(scorer, time=end - begin, words=nwords, - wps=nwords / (end - begin)) + results = { + "Time": "%.2f s" % end - begin, + "Words": nwords, + "Words/s": "%.0f" % nwords / (end - begin), + "TOK": "%.2f" % scorer.token_acc, + "POS": "%.2f" % scorer.tags_acc, + "UAS": "%.2f" % scorer.uas, + "LAS": "%.2f" % scorer.las, + "NER P": "%.2f" % scorer.ents_p, + "NER R": "%.2f" % scorer.ents_r, + "NER F": "%.2f" % scorer.ents_f, + } + msg.table(results, title="Results") + if displacy_path: docs, golds = zip(*dev_docs) - render_deps = 'parser' in nlp.meta.get('pipeline', []) - render_ents = 'ner' in nlp.meta.get('pipeline', []) - render_parses(docs, displacy_path, model_name=model, - limit=displacy_limit, deps=render_deps, ents=render_ents) - prints(displacy_path, title=Messages.M036.format(n=displacy_limit)) + render_deps = "parser" in nlp.meta.get("pipeline", []) + render_ents = "ner" in nlp.meta.get("pipeline", []) + render_parses( + docs, + displacy_path, + model_name=model, + limit=displacy_limit, + deps=render_deps, + ents=render_ents, + ) + msg.good(Messages.M036.format(n=displacy_limit), displacy_path) -def render_parses(docs, output_path, model_name='', limit=250, deps=True, - ents=True): - docs[0].user_data['title'] = model_name +def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True): + docs[0].user_data["title"] = model_name if ents: - with (output_path / 'entities.html').open('w') as file_: - html = displacy.render(docs[:limit], style='ent', page=True) + with (output_path / "entities.html").open("w") as file_: + html = displacy.render(docs[:limit], style="ent", page=True) file_.write(html) if deps: - with (output_path / 'parses.html').open('w') as file_: - html = displacy.render(docs[:limit], style='dep', page=True, - options={'compact': True}) + with (output_path / "parses.html").open("w") as file_: + html = displacy.render( + docs[:limit], style="dep", page=True, options={"compact": True} + ) file_.write(html) def print_progress(itn, losses, dev_scores, wps=0.0): scores = {} - for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', - 'ents_p', 'ents_r', 'ents_f', 'wps']: + for col in [ + "dep_loss", + "tag_loss", + "uas", + "tags_acc", + "token_acc", + "ents_p", + "ents_r", + "ents_f", + "wps", + ]: scores[col] = 0.0 - scores['dep_loss'] = losses.get('parser', 0.0) - scores['ner_loss'] = losses.get('ner', 0.0) - scores['tag_loss'] = losses.get('tagger', 0.0) + scores["dep_loss"] = losses.get("parser", 0.0) + scores["ner_loss"] = losses.get("ner", 0.0) + scores["tag_loss"] = losses.get("tagger", 0.0) scores.update(dev_scores) - scores['wps'] = wps - 
tpl = '\t'.join(( - '{:d}', - '{dep_loss:.3f}', - '{ner_loss:.3f}', - '{uas:.3f}', - '{ents_p:.3f}', - '{ents_r:.3f}', - '{ents_f:.3f}', - '{tags_acc:.3f}', - '{token_acc:.3f}', - '{wps:.1f}')) + scores["wps"] = wps + tpl = "\t".join( + ( + "{:d}", + "{dep_loss:.3f}", + "{ner_loss:.3f}", + "{uas:.3f}", + "{ents_p:.3f}", + "{ents_r:.3f}", + "{ents_f:.3f}", + "{tags_acc:.3f}", + "{token_acc:.3f}", + "{wps:.1f}", + ) + ) print(tpl.format(itn, **scores)) - - -def print_results(scorer, time, words, wps): - results = { - 'Time': '%.2f s' % time, - 'Words': words, - 'Words/s': '%.0f' % wps, - 'TOK': '%.2f' % scorer.token_acc, - 'POS': '%.2f' % scorer.tags_acc, - 'UAS': '%.2f' % scorer.uas, - 'LAS': '%.2f' % scorer.las, - 'NER P': '%.2f' % scorer.ents_p, - 'NER R': '%.2f' % scorer.ents_r, - 'NER F': '%.2f' % scorer.ents_f} - util.print_table(results, title="Results") diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 2c97eb340..90387f9f7 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import plac import platform from pathlib import Path +from wasabi import Printer from ._messages import Messages from ..compat import path2str @@ -12,56 +13,65 @@ from .. import about @plac.annotations( - model=("optional: shortcut link of model", "positional", None, str), - markdown=("generate Markdown for GitHub issues", "flag", "md", str), - silent=("don't print anything (just return)", "flag", "s")) + model=("Optional shortcut link of model", "positional", None, str), + markdown=("Generate Markdown for GitHub issues", "flag", "md", str), + silent=("Don't print anything (just return)", "flag", "s"), +) def info(model=None, markdown=False, silent=False): - """Print info about spaCy installation. If a model shortcut link is + """ + Print info about spaCy installation. If a model shortcut link is speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. 
""" + msg = Printer() if model: if util.is_package(model): model_path = util.get_package_path(model) else: model_path = util.get_data_path() / model - meta_path = model_path / 'meta.json' + meta_path = model_path / "meta.json" if not meta_path.is_file(): - util.prints(meta_path, title=Messages.M020, exits=1) + msg.fail(Messages.M020, meta_path, exits=1) meta = util.read_json(meta_path) if model_path.resolve() != model_path: - meta['link'] = path2str(model_path) - meta['source'] = path2str(model_path.resolve()) + meta["link"] = path2str(model_path) + meta["source"] = path2str(model_path.resolve()) else: - meta['source'] = path2str(model_path) + meta["source"] = path2str(model_path) if not silent: - print_info(meta, 'model %s' % model, markdown) + title = "Info about model '{}'".format(model) + model_meta = { + k: v for k, v in meta.items() if k not in ("accuracy", "speed") + } + if markdown: + util.print_markdown(model_meta, title=title) + else: + msg.table(model_meta, title=title) return meta - data = {'spaCy version': about.__version__, - 'Location': path2str(Path(__file__).parent.parent), - 'Platform': platform.platform(), - 'Python version': platform.python_version(), - 'Models': list_models()} + data = { + "spaCy version": about.__version__, + "Location": path2str(Path(__file__).parent.parent), + "Platform": platform.platform(), + "Python version": platform.python_version(), + "Models": list_models(), + } if not silent: - print_info(data, 'spaCy', markdown) + title = "Info about spaCy" + if markdown: + util.print_markdown(data, title=title) + else: + msg.table(data, title=title) return data -def print_info(data, title, markdown): - title = 'Info about %s' % title - if markdown: - util.print_markdown(data, title=title) - else: - util.print_table(data, title=title) - - def list_models(): def exclude_dir(dir_name): # exclude common cache directories and hidden directories - exclude = ['cache', 'pycache', '__pycache__'] - return dir_name in exclude or dir_name.startswith('.') + exclude = ("cache", "pycache", "__pycache__") + return dir_name in exclude or dir_name.startswith(".") + data_path = util.get_data_path() if data_path: models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] - return ', '.join([m for m in models if not exclude_dir(m)]) - return '-' + return ", ".join([m for m in models if not exclude_dir(m)]) + return "-" diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 580b2df75..4b3406ab0 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -11,13 +11,12 @@ from preshed.counter import PreshCounter import tarfile import gzip import zipfile -import ujson as json -from spacy.lexeme import intify_attrs +from wasabi import Printer from ._messages import Messages from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning -from ..util import prints, ensure_path, get_lang_class +from ..util import ensure_path, get_lang_class, read_jsonl try: import ftfy @@ -25,121 +24,133 @@ except ImportError: ftfy = None +msg = Printer() + + @plac.annotations( - lang=("model language", "positional", None, str), - output_dir=("model output directory", "positional", None, Path), - freqs_loc=("location of words frequencies file", "option", "f", Path), - jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path), - clusters_loc=("optional: location of brown clusters data", - "option", "c", str), - vectors_loc=("optional: location of vectors file in Word2Vec format " - "(either as .txt or zipped as .zip or .tar.gz)", 
"option", - "v", str), - prune_vectors=("optional: number of vectors to prune to", - "option", "V", int) + lang=("Model language", "positional", None, str), + output_dir=("Model output directory", "positional", None, Path), + freqs_loc=("Location of words frequencies file", "option", "f", Path), + jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path), + clusters_loc=("Optional location of brown clusters data", "option", "c", str), + vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str), + prune_vectors=("Optional number of vectors to prune to", "option", "V", int), ) -def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None, - vectors_loc=None, prune_vectors=-1): +def init_model( + lang, + output_dir, + freqs_loc=None, + clusters_loc=None, + jsonl_loc=None, + vectors_loc=None, + prune_vectors=-1, +): """ Create a new model from raw data, like word frequencies, Brown clusters - and word vectors. + and word vectors. If vectors are provided in Word2Vec format, they can + be either a .txt or zipped as a .zip or .tar.gz. """ if jsonl_loc is not None: if freqs_loc is not None or clusters_loc is not None: - settings = ['-j'] + settings = ["-j"] if freqs_loc: - settings.append('-f') + settings.append("-f") if clusters_loc: - settings.append('-c') - prints(' '.join(settings), - title=( - "The -f and -c arguments are deprecated, and not compatible " - "with the -j argument, which should specify the same information. " - "Either merge the frequencies and clusters data into the " - "jsonl-formatted file (recommended), or use only the -f and " - "-c files, without the other lexical attributes.")) + settings.append("-c") + msg.warn(Messages.M063, Messages.M064) jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = (json.loads(line) for line in jsonl_loc.open()) + lex_attrs = read_jsonl(jsonl_loc) else: clusters_loc = ensure_path(clusters_loc) freqs_loc = ensure_path(freqs_loc) if freqs_loc is not None and not freqs_loc.exists(): - prints(freqs_loc, title=Messages.M037, exits=1) + msg.fail(Messages.M037, freqs_loc, exits=1) lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) - nlp = create_model(lang, lex_attrs) + with msg.loading("Creating model..."): + nlp = create_model(lang, lex_attrs) + msg.good("Successfully created model") if vectors_loc is not None: add_vectors(nlp, vectors_loc, prune_vectors) vec_added = len(nlp.vocab.vectors) lex_added = len(nlp.vocab) - prints(Messages.M039.format(entries=lex_added, vectors=vec_added), - title=Messages.M038) + msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added)) if not output_dir.exists(): output_dir.mkdir() nlp.to_disk(output_dir) return nlp + def open_file(loc): - '''Handle .gz, .tar.gz or unzipped files''' + """Handle .gz, .tar.gz or unzipped files""" loc = ensure_path(loc) - print("Open loc") if tarfile.is_tarfile(str(loc)): - return tarfile.open(str(loc), 'r:gz') - elif loc.parts[-1].endswith('gz'): - return (line.decode('utf8') for line in gzip.open(str(loc), 'r')) - elif loc.parts[-1].endswith('zip'): + return tarfile.open(str(loc), "r:gz") + elif loc.parts[-1].endswith("gz"): + return (line.decode("utf8") for line in gzip.open(str(loc), "r")) + elif loc.parts[-1].endswith("zip"): zip_file = zipfile.ZipFile(str(loc)) names = zip_file.namelist() file_ = zip_file.open(names[0]) - return (line.decode('utf8') for line in file_) + return (line.decode("utf8") for line in file_) else: - return loc.open('r', encoding='utf8') + return loc.open("r", 
encoding="utf8") + def read_attrs_from_deprecated(freqs_loc, clusters_loc): - probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) - clusters = read_clusters(clusters_loc) if clusters_loc else {} + with msg.loading("Counting frequencies..."): + probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) + msg.good("Counted frequencies") + with msg.loading("Reading clusters..."): + clusters = read_clusters(clusters_loc) if clusters_loc else {} + msg.good("Read clusters") lex_attrs = [] sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) for i, (word, prob) in tqdm(enumerate(sorted_probs)): - attrs = {'orth': word, 'id': i, 'prob': prob} + attrs = {"orth": word, "id": i, "prob": prob} # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. See _parse_features.pyx if word in clusters: - attrs['cluster'] = int(clusters[word][::-1], 2) + attrs["cluster"] = int(clusters[word][::-1], 2) else: - attrs['cluster'] = 0 + attrs["cluster"] = 0 lex_attrs.append(attrs) return lex_attrs def create_model(lang, lex_attrs): - print("Creating model...") lang_class = get_lang_class(lang) nlp = lang_class() for lexeme in nlp.vocab: lexeme.rank = 0 lex_added = 0 for attrs in lex_attrs: - if 'settings' in attrs: + if "settings" in attrs: continue - lexeme = nlp.vocab[attrs['orth']] + lexeme = nlp.vocab[attrs["orth"]] lexeme.set_attrs(**attrs) lexeme.is_oov = False lex_added += 1 lex_added += 1 oov_prob = min(lex.prob for lex in nlp.vocab) - nlp.vocab.cfg.update({'oov_prob': oov_prob-1}) + nlp.vocab.cfg.update({"oov_prob": oov_prob - 1}) return nlp + def add_vectors(nlp, vectors_loc, prune_vectors): vectors_loc = ensure_path(vectors_loc) - if vectors_loc and vectors_loc.parts[-1].endswith('.npz'): - nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb'))) + if vectors_loc and vectors_loc.parts[-1].endswith(".npz"): + nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb"))) for lex in nlp.vocab: if lex.rank: nlp.vocab.vectors.add(lex.orth, row=lex.rank) else: - vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None) + if vectors_loc: + with msg.loading("Reading vectors from {}".format(vectors_loc)): + vectors_data, vector_keys = read_vectors(vectors_loc) + msg.good("Loaded vectors from {}".format(vectors_loc)) + else: + vectors_data, vector_keys = (None, None) if vector_keys is not None: for word in vector_keys: if word not in nlp.vocab: @@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors): lexeme.is_oov = False if vectors_data is not None: nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) - nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang'] - nlp.meta['vectors']['name'] = nlp.vocab.vectors.name + nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"] + nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name if prune_vectors >= 1: nlp.vocab.prune_vectors(prune_vectors) + def read_vectors(vectors_loc): - print("Reading vectors from %s" % vectors_loc) f = open_file(vectors_loc) shape = tuple(int(size) for size in next(f).split()) - vectors_data = numpy.zeros(shape=shape, dtype='f') + vectors_data = numpy.zeros(shape=shape, dtype="f") vectors_keys = [] for i, line in enumerate(tqdm(f)): line = line.rstrip() - pieces = line.rsplit(' ', vectors_data.shape[1]+1) + pieces = line.rsplit(" ", vectors_data.shape[1] + 1) word = pieces.pop(0) if len(pieces) != vectors_data.shape[1]: - raise 
ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)) - vectors_data[i] = numpy.asarray(pieces, dtype='f') + msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1) + vectors_data[i] = numpy.asarray(pieces, dtype="f") vectors_keys.append(word) return vectors_data, vectors_keys def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): - print("Counting frequencies...") counts = PreshCounter() total = 0 with freqs_loc.open() as f: for i, line in enumerate(f): - freq, doc_freq, key = line.rstrip().split('\t', 2) + freq, doc_freq, key = line.rstrip().split("\t", 2) freq = int(freq) counts.inc(i + 1, freq) total += freq @@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): probs = {} with freqs_loc.open() as f: for line in tqdm(f): - freq, doc_freq, key = line.rstrip().split('\t', 2) + freq, doc_freq, key = line.rstrip().split("\t", 2) doc_freq = int(doc_freq) freq = int(freq) if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: @@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_clusters(clusters_loc): - print("Reading clusters...") clusters = {} if ftfy is None: user_warning(Warnings.W004) @@ -213,7 +222,7 @@ def read_clusters(clusters_loc): if int(freq) >= 3: clusters[word] = cluster else: - clusters[word] = '0' + clusters[word] = "0" # Expand clusters with re-casing for word, cluster in list(clusters.items()): if word.lower() not in clusters: diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 588304e37..6172dad07 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -3,51 +3,54 @@ from __future__ import unicode_literals import plac from pathlib import Path +from wasabi import Printer from ._messages import Messages from ..compat import symlink_to, path2str -from ..util import prints from .. import util @plac.annotations( origin=("package name or local path to model", "positional", None, str), link_name=("name of shortuct link to create", "positional", None, str), - force=("force overwriting of existing link", "flag", "f", bool)) + force=("force overwriting of existing link", "flag", "f", bool), +) def link(origin, link_name, force=False, model_path=None): """ Create a symlink for models within the spacy/data directory. Accepts either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ + msg = Printer() if util.is_package(origin): model_path = util.get_package_path(origin) else: model_path = Path(origin) if model_path is None else Path(model_path) if not model_path.exists(): - prints(Messages.M009.format(path=path2str(model_path)), - title=Messages.M008, exits=1) + msg.fail( + Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1 + ) data_path = util.get_data_path() if not data_path or not data_path.exists(): spacy_loc = Path(__file__).parent.parent - prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1) + msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1) link_path = util.get_data_path() / link_name if link_path.is_symlink() and not force: - prints(Messages.M013, title=Messages.M012.format(name=link_name), - exits=1) + msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1) elif link_path.is_symlink(): # does a symlink exist? # NB: It's important to check for is_symlink here and not for exists, # because invalid/outdated symlinks would return False otherwise. 
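        # A quick standard-library illustration of that point, using a made-up
        # path: a dangling symlink is still reported by is_symlink(), but not
        # by exists(), because exists() follows the link to its missing target.
        #
        #     >>> import os
        #     >>> from pathlib import Path
        #     >>> os.symlink("/tmp/does-not-exist", "/tmp/dangling-link")
        #     >>> Path("/tmp/dangling-link").is_symlink()
        #     True
        #     >>> Path("/tmp/dangling-link").exists()
        #     False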
link_path.unlink() - elif link_path.exists(): # does it exist otherwise? + elif link_path.exists(): # does it exist otherwise? # NB: Check this last because valid symlinks also "exist". - prints(Messages.M015, link_path, - title=Messages.M014.format(name=link_name), exits=1) - msg = "%s --> %s" % (path2str(model_path), path2str(link_path)) + msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1) + details = "%s --> %s" % (path2str(model_path), path2str(link_path)) try: symlink_to(link_path, model_path) - except: + except: # noqa: E722 # This is quite dirty, but just making sure other errors are caught. - prints(Messages.M017, msg, title=Messages.M016.format(name=link_name)) + msg.fail(Messages.M016.format(name=link_name), Messages.M017) + msg.text(details) raise - prints(msg, Messages.M019.format(name=link_name), title=Messages.M018) + msg.good(Messages.M018, details) + msg.text(Messages.M019.format(name=link_name)) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 35861224f..84288ac72 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -4,109 +4,106 @@ from __future__ import unicode_literals import plac import shutil from pathlib import Path +from wasabi import Printer, get_raw_input from ._messages import Messages from ..compat import path2str, json_dumps -from ..util import prints from .. import util from .. import about @plac.annotations( - input_dir=("directory with model data", "positional", None, str), - output_dir=("output parent directory", "positional", None, str), - meta_path=("path to meta.json", "option", "m", str), - create_meta=("create meta.json, even if one exists in directory – if " - "existing meta is found, entries are shown as defaults in " - "the command line prompt", "flag", "c", bool), - force=("force overwriting of existing model directory in output directory", - "flag", "f", bool)) -def package(input_dir, output_dir, meta_path=None, create_meta=False, - force=False): + input_dir=("Directory with model data", "positional", None, str), + output_dir=("Output parent directory", "positional", None, str), + meta_path=("Path to meta.json", "option", "m", str), + create_meta=("Create meta.json, even if one exists", "flag", "c", bool), + force=("Force overwriting existing model in output directory", "flag", "f", bool), +) +def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified - output directory, and model data will be copied over. + output directory, and model data will be copied over. If --create-meta is + set and a meta.json already exists in the output directory, the existing + values will be used as the defaults in the command-line prompt. 
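    A typical invocation might look like this (the paths are placeholders):

        python -m spacy package /path/to/model /path/to/parent/dir --create-meta

    The resulting directory contains a setup.py, so the package can then be
    built with 'python setup.py sdist' from inside it.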
""" + msg = Printer() input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) meta_path = util.ensure_path(meta_path) if not input_path or not input_path.exists(): - prints(input_path, title=Messages.M008, exits=1) + msg.fail(Messages.M008, input_path, exits=1) if not output_path or not output_path.exists(): - prints(output_path, title=Messages.M040, exits=1) + msg.fail(Messages.M040, output_path, exits=1) if meta_path and not meta_path.exists(): - prints(meta_path, title=Messages.M020, exits=1) + msg.fail(Messages.M020, meta_path, exits=1) - meta_path = meta_path or input_path / 'meta.json' + meta_path = meta_path or input_path / "meta.json" if meta_path.is_file(): meta = util.read_json(meta_path) - if not create_meta: # only print this if user doesn't want to overwrite - prints(meta_path, title=Messages.M041) + if not create_meta: # only print if user doesn't want to overwrite + msg.good(Messages.M041, meta_path) else: - meta = generate_meta(input_dir, meta) - meta = validate_meta(meta, ['lang', 'name', 'version']) - model_name = meta['lang'] + '_' + meta['name'] - model_name_v = model_name + '-' + meta['version'] + meta = generate_meta(input_dir, meta, msg) + for key in ("lang", "name", "version"): + if key not in meta or meta[key] == "": + msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1) + model_name = meta["lang"] + "_" + meta["name"] + model_name_v = model_name + "-" + meta["version"] main_path = output_path / model_name_v package_path = main_path / model_name - create_dirs(package_path, force) - shutil.copytree(path2str(input_path), - path2str(package_path / model_name_v)) - create_file(main_path / 'meta.json', json_dumps(meta)) - create_file(main_path / 'setup.py', TEMPLATE_SETUP) - create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST) - create_file(package_path / '__init__.py', TEMPLATE_INIT) - prints(main_path, Messages.M043, - title=Messages.M042.format(name=model_name_v)) - - -def create_dirs(package_path, force): if package_path.exists(): if force: shutil.rmtree(path2str(package_path)) else: - prints(package_path, Messages.M045, title=Messages.M044, exits=1) + msg.fail( + Messages.M044, + Messages.M045.format(path=path2str(package_path)), + exits=1, + ) Path.mkdir(package_path, parents=True) + shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) + create_file(main_path / "meta.json", json_dumps(meta)) + create_file(main_path / "setup.py", TEMPLATE_SETUP) + create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) + create_file(package_path / "__init__.py", TEMPLATE_INIT) + msg.good(Messages.M042.format(name=model_name_v), main_path) + msg.text(Messages.M043) def create_file(file_path, contents): file_path.touch() - file_path.open('w', encoding='utf-8').write(contents) + file_path.open("w", encoding="utf-8").write(contents) -def generate_meta(model_path, existing_meta): +def generate_meta(model_path, existing_meta, msg): meta = existing_meta or {} - settings = [('lang', 'Model language', meta.get('lang', 'en')), - ('name', 'Model name', meta.get('name', 'model')), - ('version', 'Model version', meta.get('version', '0.0.0')), - ('spacy_version', 'Required spaCy version', - '>=%s,<3.0.0' % about.__version__), - ('description', 'Model description', - meta.get('description', False)), - ('author', 'Author', meta.get('author', False)), - ('email', 'Author email', meta.get('email', False)), - ('url', 'Author website', meta.get('url', False)), - ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))] + settings 
= [ + ("lang", "Model language", meta.get("lang", "en")), + ("name", "Model name", meta.get("name", "model")), + ("version", "Model version", meta.get("version", "0.0.0")), + ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__), + ("description", "Model description", meta.get("description", False)), + ("author", "Author", meta.get("author", False)), + ("email", "Author email", meta.get("email", False)), + ("url", "Author website", meta.get("url", False)), + ("license", "License", meta.get("license", "CC BY-SA 3.0")), + ] nlp = util.load_model_from_path(Path(model_path)) - meta['pipeline'] = nlp.pipe_names - meta['vectors'] = {'width': nlp.vocab.vectors_length, - 'vectors': len(nlp.vocab.vectors), - 'keys': nlp.vocab.vectors.n_keys} - prints(Messages.M047, title=Messages.M046) + meta["pipeline"] = nlp.pipe_names + meta["vectors"] = { + "width": nlp.vocab.vectors_length, + "vectors": len(nlp.vocab.vectors), + "keys": nlp.vocab.vectors.n_keys, + } + msg.divider(Messages.M046) + msg.text(Messages.M047) for setting, desc, default in settings: - response = util.get_raw_input(desc, default) - meta[setting] = default if response == '' and default else response - if about.__title__ != 'spacy': - meta['parent_package'] = about.__title__ - return meta - - -def validate_meta(meta, keys): - for key in keys: - if key not in meta or meta[key] == '': - prints(Messages.M049, title=Messages.M048.format(key=key), exits=1) + response = get_raw_input(desc, default) + meta[setting] = default if response == "" and default else response + if about.__title__ != "spacy": + meta["parent_package"] = about.__title__ return meta diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index ada224b3b..80e60a871 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -1,66 +1,148 @@ -'''This script is experimental. - -Try pre-training the CNN component of the text categorizer using a cheap -language modelling-like objective. Specifically, we load pre-trained vectors -(from something like word2vec, GloVe, FastText etc), and use the CNN to -predict the tokens' pre-trained vectors. This isn't as easy as it sounds: -we're not merely doing compression here, because heavy dropout is applied, -including over the input words. This means the model must often (50% of the time) -use the context in order to predict the word. - -To evaluate the technique, we're pre-training with the 50k texts from the IMDB -corpus, and then training with only 100 labels. Note that it's a bit dirty to -pre-train with the development data, but also not *so* terrible: we're not using -the development labels, after all --- only the unlabelled text. -''' +# coding: utf8 from __future__ import print_function, unicode_literals + import plac import random import numpy import time -import ujson as json -from pathlib import Path +import ujson import sys from collections import Counter - -import spacy -from spacy.tokens import Doc -from spacy.attrs import ID, HEAD -from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path -from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer +from pathlib import Path from thinc.v2v import Affine, Maxout from thinc.api import wrap from thinc.misc import LayerNorm as LN +from thinc.neural.util import prefer_gpu +from wasabi import Printer + +from ..tokens import Doc +from ..attrs import ID, HEAD +from ..compat import json_dumps +from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer +from .. 
import util -def prefer_gpu(): - used = spacy.util.use_gpu(0) - if used is None: - return False - else: - import cupy.random - cupy.random.seed(0) - return True +@plac.annotations( + texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str), + vectors_model=("Name or path to vectors model to learn from"), + output_dir=("Directory to write models each epoch", "positional", None, str), + width=("Width of CNN layers", "option", "cw", int), + depth=("Depth of CNN layers", "option", "cd", int), + embed_rows=("Embedding rows", "option", "er", int), + use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), + dropout=("Dropout", "option", "d", float), + seed=("Seed for random number generators", "option", "s", float), + nr_iter=("Number of iterations to pretrain", "option", "i", int), +) +def pretrain( + texts_loc, + vectors_model, + output_dir, + width=96, + depth=4, + embed_rows=2000, + use_vectors=False, + dropout=0.2, + nr_iter=1000, + seed=0, +): + """ + Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, + using an approximate language-modelling objective. Specifically, we load + pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict + vectors which match the pre-trained ones. The weights are saved to a directory + after each epoch. You can then pass a path to one of these pre-trained weights + files to the 'spacy train' command. + This technique may be especially helpful if you have little labelled data. + However, it's still quite experimental, so your mileage may vary. -def load_texts(path): - '''Load inputs from a jsonl file. - - Each line should be a dict like {"text": "..."} - ''' - path = ensure_path(path) - with path.open('r', encoding='utf8') as file_: - texts = [json.loads(line) for line in file_] - random.shuffle(texts) - return texts + To load the weights back in during 'spacy train', you need to ensure + all settings are the same between pretraining and training. The API and + errors around this need some improvement. + """ + config = dict(locals()) + msg = Printer() + util.fix_random_seed(seed) + + has_gpu = prefer_gpu() + msg.info("Using GPU" if has_gpu else "Not using GPU") + + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + msg.good("Created output directory") + util.write_json(output_dir / "config.json", config) + msg.good("Saved settings to config.json") + + # Load texts from file or stdin + if texts_loc != "-": # reading from a file + texts_loc = Path(texts_loc) + if not texts_loc.exists(): + msg.fail("Input text file doesn't exist", texts_loc, exits=1) + with msg.loading("Loading input texts..."): + texts = list(util.read_jsonl(texts_loc)) + msg.good("Loaded input texts") + random.shuffle(texts) + else: # reading from stdin + msg.text("Reading input text from stdin...") + texts = stream_texts() + + with msg.loading("Loading model '{}'...".format(vectors_model)): + nlp = util.load_model(vectors_model) + msg.good("Loaded model '{}'".format(vectors_model)) + pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name + model = create_pretraining_model( + nlp, + Tok2Vec( + width, + embed_rows, + conv_depth=depth, + pretrained_vectors=pretrained_vectors, + bilstm_depth=0, # Requires PyTorch. Experimental. + cnn_maxout_pieces=2, # You can try setting this higher + subword_features=True, + ), + ) # Set to False for character models, e.g. 
Chinese + optimizer = create_default_optimizer(model.ops) + tracker = ProgressTracker() + msg.divider("Pre-training tok2vec layer") + row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} + msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) + for epoch in range(nr_iter): + for batch in util.minibatch_by_words( + ((text, None) for text in texts), size=5000 + ): + docs = make_docs(nlp, [text for (text, _) in batch]) + loss = make_update(model, docs, optimizer, drop=dropout) + progress = tracker.update(epoch, loss, docs) + if progress: + msg.row(progress, **row_settings) + if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: + break + with model.use_params(optimizer.averages): + with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_: + file_.write(model.tok2vec.to_bytes()) + log = { + "nr_word": tracker.nr_word, + "loss": tracker.loss, + "epoch_loss": tracker.epoch_loss, + "epoch": epoch, + } + with (output_dir / "log.jsonl").open("a") as file_: + file_.write(json_dumps(log) + "\n") + tracker.epoch_loss = 0.0 + if texts_loc != "-": + # Reshuffle the texts if texts were loaded from a file + random.shuffle(texts) def stream_texts(): for line in sys.stdin: - yield json.loads(line) + yield ujson.loads(line) -def make_update(model, docs, optimizer, drop=0.): +def make_update(model, docs, optimizer, drop=0.0): """Perform an update over a single batch of documents. docs (iterable): A batch of `Doc` objects. @@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.): # Don't want to return a cupy object here # The gradients are modified in-place by the BERT MLM, # so we get an accurate loss - loss = float((gradients**2).mean()) + loss = float((gradients ** 2).mean()) return loss @@ -98,7 +180,7 @@ def make_docs(nlp, batch): def get_vectors_loss(ops, docs, prediction): """Compute a mean-squared error loss between the documents' vectors and - the prediction. + the prediction. Note that this is ripe for customization! We could compute the vectors in some other word, e.g. with an LSTM language model, or use some other @@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction): def create_pretraining_model(nlp, tok2vec): - '''Define a network for the pretraining. We simply add an output layer onto + """Define a network for the pretraining. We simply add an output layer onto the tok2vec input model. The tok2vec input model needs to be a model that takes a batch of Doc objects (as a list), and returns a list of arrays. Each array in the output needs to have one row per token in the doc. - ''' + """ output_size = nlp.vocab.vectors.data.shape[1] output_layer = chain( - LN(Maxout(300, pieces=3)), - zero_init(Affine(output_size, drop_factor=0.0)) + LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0)) ) # This is annoying, but the parser etc have the flatten step after # the tok2vec. To load the weights in cleanly, we need to match # the shape of the models' components exactly. So what we cann # "tok2vec" has to be the same set of processes as what the components do. 
tok2vec = chain(tok2vec, flatten) - model = chain( - tok2vec, - output_layer - ) + model = chain(tok2vec, output_layer) model = masked_language_model(nlp.vocab, model) model.tok2vec = tok2vec model.output_layer = output_layer - model.begin_training([nlp.make_doc('Give it a doc to infer shapes')]) + model.begin_training([nlp.make_doc("Give it a doc to infer shapes")]) return model def masked_language_model(vocab, model, mask_prob=0.15): - '''Convert a model into a BERT-style masked language model''' + """Convert a model into a BERT-style masked language model""" random_words = RandomWords(vocab) - def mlm_forward(docs, drop=0.): + + def mlm_forward(docs, drop=0.0): mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) output, backprop = model.begin_update(docs, drop=drop) def mlm_backward(d_output, sgd=None): - d_output *= 1-mask + d_output *= 1 - mask return backprop(d_output, sgd=sgd) return output, mlm_backward @@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15): def apply_mask(docs, random_words, mask_prob=0.15): N = sum(len(doc) for doc in docs) - mask = numpy.random.uniform(0., 1.0, (N,)) + mask = numpy.random.uniform(0.0, 1.0, (N,)) mask = mask >= mask_prob i = 0 masked_docs = [] @@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15): return mask, masked_docs -def replace_word(word, random_words, mask='[MASK]'): +def replace_word(word, random_words, mask="[MASK]"): roll = random.random() if roll < 0.8: return mask @@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'): else: return word + class RandomWords(object): def __init__(self, vocab): self.words = [lex.text for lex in vocab if lex.prob != 0.0] self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.words = self.words[:10000] self.probs = self.probs[:10000] - self.probs = numpy.exp(numpy.array(self.probs, dtype='f')) + self.probs = numpy.exp(numpy.array(self.probs, dtype="f")) self.probs /= self.probs.sum() self._cache = [] def next(self): if not self._cache: - self._cache.extend(numpy.random.choice(len(self.words), 10000, - p=self.probs)) + self._cache.extend( + numpy.random.choice(len(self.words), 10000, p=self.probs) + ) index = self._cache.pop() return self.words[index] - + class ProgressTracker(object): def __init__(self, frequency=1000000): @@ -245,76 +326,3 @@ class ProgressTracker(object): return status else: return None - - -@plac.annotations( - texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str), - vectors_model=("Name or path to vectors model to learn from"), - output_dir=("Directory to write models each epoch", "positional", None, str), - width=("Width of CNN layers", "option", "cw", int), - depth=("Depth of CNN layers", "option", "cd", int), - embed_rows=("Embedding rows", "option", "er", int), - use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), - dropout=("Dropout", "option", "d", float), - seed=("Seed for random number generators", "option", "s", float), - nr_iter=("Number of iterations to pretrain", "option", "i", int), -) -def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4, - embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0): - """ - Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, - using an approximate language-modelling objective. 
Specifically, we load - pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict - vectors which match the pre-trained ones. The weights are saved to a directory - after each epoch. You can then pass a path to one of these pre-trained weights - files to the 'spacy train' command. - - This technique may be especially helpful if you have little labelled data. - However, it's still quite experimental, so your mileage may vary. - - To load the weights back in during 'spacy train', you need to ensure - all settings are the same between pretraining and training. The API and - errors around this need some improvement. - """ - config = dict(locals()) - output_dir = ensure_path(output_dir) - random.seed(seed) - numpy.random.seed(seed) - if not output_dir.exists(): - output_dir.mkdir() - with (output_dir / 'config.json').open('w') as file_: - file_.write(json.dumps(config)) - has_gpu = prefer_gpu() - print("Use GPU?", has_gpu) - nlp = spacy.load(vectors_model) - pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name - model = create_pretraining_model(nlp, - Tok2Vec(width, embed_rows, - conv_depth=depth, - pretrained_vectors=pretrained_vectors, - bilstm_depth=0, # Requires PyTorch. Experimental. - cnn_maxout_pieces=2, # You can try setting this higher - subword_features=True)) # Set to False for character models, e.g. Chinese - optimizer = create_default_optimizer(model.ops) - tracker = ProgressTracker() - print('Epoch', '#Words', 'Loss', 'w/s') - texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc) - for epoch in range(nr_iter): - for batch in minibatch_by_words(((text, None) for text in texts), size=5000): - docs = make_docs(nlp, [text for (text, _) in batch]) - loss = make_update(model, docs, optimizer, drop=dropout) - progress = tracker.update(epoch, loss, docs) - if progress: - print(*progress) - if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7: - break - with model.use_params(optimizer.averages): - with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: - file_.write(model.tok2vec.to_bytes()) - with (output_dir / 'log.jsonl').open('a') as file_: - file_.write(json.dumps({'nr_word': tracker.nr_word, - 'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss, - 'epoch': epoch}) + '\n') - tracker.epoch_loss = 0.0 - if texts_loc != '-': - texts = load_texts(texts_loc) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 7c12406ea..506e55871 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -6,45 +6,64 @@ from pathlib import Path import ujson import cProfile import pstats - -import spacy import sys import tqdm import cytoolz import thinc.extra.datasets +from wasabi import Printer - -def read_inputs(loc): - if loc is None: - file_ = sys.stdin - file_ = (line.encode('utf8') for line in file_) - else: - file_ = Path(loc).open() - for line in file_: - data = ujson.loads(line) - text = data['text'] - yield text +from ..util import load_model @plac.annotations( - lang=("model/language", "positional", None, str), - inputs=("Location of input file", "positional", None, read_inputs)) -def profile(lang, inputs=None): + model=("Model to load", "positional", None, str), + inputs=("Location of input file. '-' for stdin.", "positional", None, str), + n_texts=("Maximum number of texts to use if available", "option", "n", int), +) +def profile(model, inputs=None, n_texts=10000): """ Profile a spaCy pipeline, to find out which functions take the most time. 
+ Input should be formatted as one JSON object per line with a key "text". + It can either be provided as a JSONL file, or be read from sys.sytdin. + If no input file is specified, the IMDB dataset is loaded via Thinc. """ + msg = Printer() + if inputs is not None: + inputs = _read_inputs(inputs, msg) if inputs is None: - imdb_train, _ = thinc.extra.datasets.imdb() - inputs, _ = zip(*imdb_train) - inputs = inputs[:25000] - nlp = spacy.load(lang) - texts = list(cytoolz.take(10000, inputs)) - cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), - "Profile.prof") + n_inputs = 25000 + with msg.loading("Loading IMDB dataset via Thinc..."): + imdb_train, _ = thinc.extra.datasets.imdb() + inputs, _ = zip(*imdb_train) + msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs)) + inputs = inputs[:n_inputs] + with msg.loading("Loading model '{}'...".format(model)): + nlp = load_model(model) + msg.good("Loaded model '{}'".format(model)) + texts = list(cytoolz.take(n_texts, inputs)) + cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") + msg.divider("Profile stats") s.strip_dirs().sort_stats("time").print_stats() def parse_texts(nlp, texts): for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass + + +def _read_inputs(loc, msg): + if loc == "-": + msg.info("Reading input from sys.stdin") + file_ = sys.stdin + file_ = (line.encode("utf8") for line in file_) + else: + input_path = Path(loc) + if not input_path.exists() or not input_path.is_file(): + msg.fail("Not a valid input data file", loc, exits=1) + msg.info("Using data from {}".format(input_path.parts[-1])) + file_ = input_path.open() + for line in file_: + data = ujson.loads(line) + text = data["text"] + yield text diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py new file mode 100644 index 000000000..f478c7a9a --- /dev/null +++ b/spacy/cli/schemas/__init__.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from pathlib import Path +from jsonschema import Draft4Validator + +from ...errors import Errors +from ...util import read_json + + +SCHEMAS = {} + + +def get_schema(name): + """Get the JSON schema for a given name. Looks for a .json file in + spacy.cli.schemas, validates the schema and raises ValueError if not found. + + EXAMPLE: + >>> schema = get_schema('training') + + name (unicode): The name of the schema. + RETURNS (dict): The JSON schema. + """ + if name not in SCHEMAS: + schema_path = Path(__file__).parent / "{}.json".format(name) + if not schema_path.exists(): + raise ValueError(Errors.E104.format(name=name)) + schema = read_json(schema_path) + # TODO: replace with (stable) Draft6Validator, if available + validator = Draft4Validator(schema) + validator.check_schema(schema) + SCHEMAS[name] = schema + return SCHEMAS[name] + + +def validate_json(data, schema): + """Validate data against a given JSON schema (see https://json-schema.org). + + data: JSON-serializable data to validate. + schema (dict): The JSON schema. + RETURNS (list): A list of error messages, if available. 
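    EXAMPLE (illustrative; the record below is made up and is missing the
    required "text" key, so jsonschema reports roughly the following):
        >>> schema = get_schema('training')
        >>> validate_json([{"ents": []}], schema)
        ["'text' is a required property [0]"]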
+ """ + validator = Draft4Validator(schema) + errors = [] + for err in sorted(validator.iter_errors(data), key=lambda e: e.path): + if err.path: + err_path = "[{}]".format(" -> ".join([str(p) for p in err.path])) + else: + err_path = "" + errors.append(err.message + " " + err_path) + return errors diff --git a/spacy/cli/schemas/meta.json b/spacy/cli/schemas/meta.json new file mode 100644 index 000000000..36ee1282f --- /dev/null +++ b/spacy/cli/schemas/meta.json @@ -0,0 +1,128 @@ +{ + "$schema": "http://json-schema.org/draft-06/schema", + "type": "object", + "properties": { + "lang": { + "title": "Two-letter language code, e.g. 'en'", + "type": "string", + "minLength": 2, + "maxLength": 2, + "pattern": "^[a-z]*$" + }, + "name": { + "title": "Model name", + "type": "string", + "minLength": 1, + "pattern": "^[a-z_]*$" + }, + "version": { + "title": "Model version", + "type": "string", + "minLength": 1, + "pattern": "^[0-9a-z.-]*$" + }, + "spacy_version": { + "title": "Compatible spaCy version identifier", + "type": "string", + "minLength": 1, + "pattern": "^[0-9a-z.-><=]*$" + }, + "parent_package": { + "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly", + "type": "string", + "minLength": 1, + "default": "spacy" + }, + "pipeline": { + "title": "Names of pipeline components", + "type": "array", + "items": { + "type": "string", + "minLength": 1 + } + }, + "description": { + "title": "Model description", + "type": "string" + }, + "license": { + "title": "Model license", + "type": "string" + }, + "author": { + "title": "Model author name", + "type": "string" + }, + "email": { + "title": "Model author email", + "type": "string", + "format": "email" + }, + "url": { + "title": "Model author URL", + "type": "string", + "format": "uri" + }, + "sources": { + "title": "Training data sources", + "type": "array", + "items": { + "type": "string" + } + }, + "vectors": { + "title": "Included word vectors", + "type": "object", + "properties": { + "keys": { + "title": "Number of unique keys", + "type": "integer", + "minimum": 0 + }, + "vectors": { + "title": "Number of unique vectors", + "type": "integer", + "minimum": 0 + }, + "width": { + "title": "Number of dimensions", + "type": "integer", + "minimum": 0 + } + } + }, + "accuracy": { + "title": "Accuracy numbers", + "type": "object", + "patternProperties": { + "*": { + "type": "number", + "minimum": 0.0 + } + } + }, + "speed": { + "title": "Speed evaluation numbers", + "type": "object", + "patternProperties": { + "*": { + "oneOf": [ + { + "type": "number", + "minimum": 0.0 + }, + { + "type": "integer", + "minimum": 0 + } + ] + } + } + } + }, + "required": [ + "lang", + "name", + "version" + ] +} diff --git a/spacy/cli/schemas/training.json b/spacy/cli/schemas/training.json new file mode 100644 index 000000000..d80ce5c7e --- /dev/null +++ b/spacy/cli/schemas/training.json @@ -0,0 +1,146 @@ +{ + "$schema": "http://json-schema.org/draft-06/schema", + "title": "Training data for spaCy models", + "type": "array", + "items": { + "type": "object", + "properties": { + "text": { + "title": "The text of the training example", + "type": "string", + "minLength": 1 + }, + "ents": { + "title": "Named entity spans in the text", + "type": "array", + "items": { + "type": "object", + "properties": { + "start": { + "title": "Start character offset of the span", + "type": "integer", + "minimum": 0 + }, + "end": { + "title": "End character offset of the span", + "type": "integer", + "minimum": 0 + }, + "label": { + "title": "Entity label", + "type": "string", + 
"minLength": 1, + "pattern": "^[A-Z0-9]*$" + } + }, + "required": [ + "start", + "end", + "label" + ] + } + }, + "sents": { + "title": "Sentence spans in the text", + "type": "array", + "items": { + "type": "object", + "properties": { + "start": { + "title": "Start character offset of the span", + "type": "integer", + "minimum": 0 + }, + "end": { + "title": "End character offset of the span", + "type": "integer", + "minimum": 0 + } + }, + "required": [ + "start", + "end" + ] + } + }, + "cats": { + "title": "Text categories for the text classifier", + "type": "object", + "patternProperties": { + "*": { + "title": "A text category", + "oneOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "minimum": 0 + } + ] + } + }, + "propertyNames": { + "pattern": "^[A-Z0-9]*$", + "minLength": 1 + } + }, + "tokens": { + "title": "The tokens in the text", + "type": "array", + "items": { + "type": "object", + "minProperties": 1, + "properties": { + "id": { + "title": "Token ID, usually token index", + "type": "integer", + "minimum": 0 + }, + "start": { + "title": "Start character offset of the token", + "type": "integer", + "minimum": 0 + }, + "end": { + "title": "End character offset of the token", + "type": "integer", + "minimum": 0 + }, + "pos": { + "title": "Coarse-grained part-of-speech tag", + "type": "string", + "minLength": 1 + }, + "tag": { + "title": "Fine-grained part-of-speech tag", + "type": "string", + "minLength": 1 + }, + "dep": { + "title": "Dependency label", + "type": "string", + "minLength": 1 + }, + "head": { + "title": "Index of the token's head", + "type": "integer", + "minimum": 0 + } + }, + "required": [ + "start", + "end" + ] + } + }, + "_": { + "title": "Custom user space", + "type": "object" + } + }, + "required": [ + "text" + ] + } +} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 01c8cb199..d49b94e44 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -6,213 +6,296 @@ from pathlib import Path import tqdm from thinc.neural._classes.model import Model from timeit import default_timer as timer -import json import shutil +from wasabi import Printer from ._messages import Messages +from .._ml import create_default_optimizer from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..gold import GoldCorpus -from ..util import prints, minibatch, minibatch_by_words from .. import util from .. import about -from .. import displacy -from ..compat import json_dumps + + +# Take dropout and batch size as generators of values -- dropout +# starts high and decays sharply, to force the optimizer to explore. +# Batch size starts at 1 and grows, so that we make updates quickly +# at the beginning of training. 
+dropout_rates = util.decaying( + util.env_opt("dropout_from", 0.2), + util.env_opt("dropout_to", 0.2), + util.env_opt("dropout_decay", 0.0), +) +batch_sizes = util.compounding( + util.env_opt("batch_from", 1000), + util.env_opt("batch_to", 1000), + util.env_opt("batch_compound", 1.001), +) @plac.annotations( - lang=("model language", "positional", None, str), - output_dir=("output directory to store model in", "positional", None, str), - train_data=("location of JSON-formatted training data", "positional", - None, str), - dev_data=("location of JSON-formatted development data (optional)", - "positional", None, str), - n_iter=("number of iterations", "option", "n", int), - n_sents=("number of sentences", "option", "ns", int), + lang=("Model language", "positional", None, str), + output_path=("Output directory to store model in", "positional", None, Path), + train_path=("Location of JSON-formatted training data", "positional", None, Path), + dev_path=("Location of JSON-formatted development data", "positional", None, Path), + base_model=("Name of model to update (optional)", "option", "b", str), + pipeline=("Comma-separated names of pipeline components", "option", "p", str), + vectors=("Model to load vectors from", "option", "v", str), + n_iter=("Number of iterations", "option", "n", int), + n_examples=("Number of examples", "option", "ns", int), use_gpu=("Use GPU", "option", "g", int), - vectors=("Model to load vectors from", "option", "v"), - no_tagger=("Don't train tagger", "flag", "T", bool), - no_parser=("Don't train parser", "flag", "P", bool), - no_entities=("Don't train NER", "flag", "N", bool), - parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str), - noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float), - entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str), - gold_preproc=("Use gold preprocessing", "flag", "G", bool), version=("Model version", "option", "V", str), - meta_path=("Optional path to meta.json. All relevant properties will be " - "overwritten.", "option", "m", Path), - init_tok2vec=("Path to pretrained weights for the token-to-vector parts " - "of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), - verbose=("Display more information for debug", "option", None, bool)) -def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, - parser_multitasks='', entity_multitasks='', init_tok2vec=None, - use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0, - no_parser=False, no_entities=False, gold_preproc=False, - version="0.0.0", meta_path=None, verbose=False): + meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), + init_tok2vec=( + "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", + "option", + "t2v", + Path, + ), + parser_multitasks=( + "Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", + "option", + "pt", + str, + ), + entity_multitasks=( + "Side objectives for NER CNN, e.g. 
'dep' or 'dep,tag'", + "option", + "et", + str, + ), + noise_level=("Amount of corruption for data augmentation", "option", "nl", float), + gold_preproc=("Use gold preprocessing", "flag", "G", bool), + learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), + verbose=("Display more information for debug", "flag", "VV", bool), + debug=("Run data diagnostics before training", "flag", "D", bool), +) +def train( + lang, + output_path, + train_path, + dev_path, + base_model=None, + pipeline="tagger,parser,ner", + vectors=None, + n_iter=30, + n_examples=0, + use_gpu=-1, + version="0.0.0", + meta_path=None, + init_tok2vec=None, + parser_multitasks="", + entity_multitasks="", + noise_level=0.0, + gold_preproc=False, + learn_tokens=False, + verbose=False, + debug=False, +): """ - Train a model. Expects data in spaCy's JSON format. + Train or update a spaCy model. Requires data to be formatted in spaCy's + JSON format. To convert data from other formats, use the `spacy convert` + command. """ + msg = Printer() util.fix_random_seed() - util.set_env_log(True) - n_sents = n_sents or None - output_path = util.ensure_path(output_dir) - train_path = util.ensure_path(train_data) - dev_path = util.ensure_path(dev_data) + util.set_env_log(verbose) + + # Make sure all files and paths exists if they are needed + train_path = util.ensure_path(train_path) + dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) - if not train_path.exists(): - prints(train_path, title=Messages.M050, exits=1) - if dev_path and not dev_path.exists(): - prints(dev_path, title=Messages.M051, exits=1) + if not train_path or not train_path.exists(): + msg.fail(Messages.M050, train_path, exits=1) + if not dev_path or not dev_path.exists(): + msg.fail(Messages.M051, dev_path, exits=1) if meta_path is not None and not meta_path.exists(): - prints(meta_path, title=Messages.M020, exits=1) + msg.fail(Messages.M020, meta_path, exits=1) meta = util.read_json(meta_path) if meta_path else {} if not isinstance(meta, dict): - prints(Messages.M053.format(meta_type=type(meta)), - title=Messages.M052, exits=1) - meta.setdefault('lang', lang) - meta.setdefault('name', 'unnamed') - + msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1) + if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: + msg.fail(Messages.M062, Messages.M065) if not output_path.exists(): output_path.mkdir() - print("Counting training words (limit=%s" % n_sents) - corpus = GoldCorpus(train_path, dev_path, limit=n_sents) - n_train_words = corpus.count_train() - print(n_train_words) - pipeline = ['tagger', 'parser', 'ner'] - if no_tagger and 'tagger' in pipeline: - pipeline.remove('tagger') - if no_parser and 'parser' in pipeline: - pipeline.remove('parser') - if no_entities and 'ner' in pipeline: - pipeline.remove('ner') + # Set up the base model and pipeline. If a base model is specified, load + # the model and make sure the pipeline matches the pipeline setting. If + # training starts from a blank model, intitalize the language class. 
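    # For example (model names and paths below are placeholders), updating an
    # existing model for just two components might look like:
    #
    #     python -m spacy train en /output train.json dev.json \
    #         --base-model en_core_web_sm --pipeline tagger,parser
    #
    # whereas leaving out --base-model trains a blank 'en' pipeline from
    # scratch.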
+ pipeline = [p.strip() for p in pipeline.split(",")] + msg.text(Messages.M055.format(pipeline=pipeline)) + if base_model: + msg.text(Messages.M056.format(model=base_model)) + nlp = util.load_model(base_model) + if nlp.lang != lang: + msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1) + other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline] + nlp.disable_pipes(*other_pipes) + for pipe in pipeline: + if pipe not in nlp.pipe_names: + nlp.add_pipe(nlp.create_pipe(pipe)) + else: + msg.text(Messages.M057.format(model=lang)) + lang_cls = util.get_lang_class(lang) + nlp = lang_cls() + for pipe in pipeline: + nlp.add_pipe(nlp.create_pipe(pipe)) + + if learn_tokens: + nlp.add_pipe(nlp.create_pipe("merge_subtokens")) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. - dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1), - util.env_opt('dropout_to', 0.1), - util.env_opt('dropout_decay', 0.0)) - batch_sizes = util.compounding(util.env_opt('batch_from', 750), - util.env_opt('batch_to', 750), - util.env_opt('batch_compound', 1.001)) + dropout_rates = util.decaying( + util.env_opt("dropout_from", 0.1), + util.env_opt("dropout_to", 0.1), + util.env_opt("dropout_decay", 0.0), + ) + batch_sizes = util.compounding( + util.env_opt("batch_from", 750), + util.env_opt("batch_to", 750), + util.env_opt("batch_compound", 1.001), + ) lang_class = util.get_lang_class(lang) nlp = lang_class() - meta['pipeline'] = pipeline + meta["pipeline"] = pipeline nlp.meta.update(meta) if vectors: - print("Load vectors model", vectors) - util.load_model(vectors, vocab=nlp.vocab) - for lex in nlp.vocab: - values = {} - for attr, func in nlp.vocab.lex_attr_getters.items(): - # These attrs are expected to be set by data. Others should - # be set by calling the language functions. 
- if attr not in (CLUSTER, PROB, IS_OOV, LANG): - values[lex.vocab.strings[attr]] = func(lex.orth_) - lex.set_attrs(**values) - lex.is_oov = False - for name in pipeline: - nlp.add_pipe(nlp.create_pipe(name), name=name) - nlp.add_pipe(nlp.create_pipe('merge_subtokens')) - if parser_multitasks: - for objective in parser_multitasks.split(','): - nlp.parser.add_multitask_objective(objective) - if entity_multitasks: - for objective in entity_multitasks.split(','): - nlp.entity.add_multitask_objective(objective) - optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) - if init_tok2vec is not None: - loaded = _load_pretrained_tok2vec(nlp, init_tok2vec) - print("Loaded pretrained tok2vec for:", loaded) + msg.text(Messages.M058.format(model=vectors)) + _load_vectors(nlp, vectors) + + # Multitask objectives + multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] + for pipe_name, multitasks in multitask_options: + if multitasks: + if pipe_name not in pipeline: + msg.fail(Messages.M059.format(pipe=pipe_name)) + pipe = nlp.get_pipe(pipe_name) + for objective in multitasks.split(","): + pipe.add_multitask_objective(objective) + + # Prepare training corpus + msg.text(Messages.M060.format(limit=n_examples)) + corpus = GoldCorpus(train_path, dev_path, limit=n_examples) + n_train_words = corpus.count_train() + + if base_model: + # Start with an existing model, use default optimizer + optimizer = create_default_optimizer(Model.ops) + else: + # Start with a blank model, call begin_training + optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) nlp._optimizer = None - print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS") + # Load in pre-trained weights + if init_tok2vec is not None: + components = _load_pretrained_tok2vec(nlp, init_tok2vec) + msg.text(Messages.M071.format(components=components)) + + print( + "\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. 
Tag % Token % CPU WPS GPU WPS" + ) try: for i in range(n_iter): - train_docs = corpus.train_docs(nlp, noise_level=noise_level, - gold_preproc=gold_preproc, max_length=0) + train_docs = corpus.train_docs( + nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 + ) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} - for batch in minibatch_by_words(train_docs, size=batch_sizes): + for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) - nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) + nlp.update( + docs, + golds, + sgd=optimizer, + drop=next(dropout_rates), + losses=losses, + ) pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) - epoch_model_path = output_path / ('model%d' % i) + epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) - dev_docs = list(corpus.dev_docs( - nlp_loaded, - gold_preproc=gold_preproc)) + dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() - scorer = nlp_loaded.evaluate(dev_docs, verbose) + scorer = nlp_loaded.evaluate(dev_docs, debug) end_time = timer() if use_gpu < 0: gpu_wps = None - cpu_wps = nwords/(end_time-start_time) + cpu_wps = nwords / (end_time - start_time) else: - gpu_wps = nwords/(end_time-start_time) - with Model.use_device('cpu'): + gpu_wps = nwords / (end_time - start_time) + with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path(epoch_model_path) - dev_docs = list(corpus.dev_docs( - nlp_loaded, gold_preproc=gold_preproc)) + dev_docs = list( + corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) + ) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs) end_time = timer() - cpu_wps = nwords/(end_time-start_time) - acc_loc = (output_path / ('model%d' % i) / 'accuracy.json') - with acc_loc.open('w') as file_: - file_.write(json_dumps(scorer.scores)) - meta_loc = output_path / ('model%d' % i) / 'meta.json' - meta['accuracy'] = scorer.scores - meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps, - 'gpu': gpu_wps} - meta['vectors'] = {'width': nlp.vocab.vectors_length, - 'vectors': len(nlp.vocab.vectors), - 'keys': nlp.vocab.vectors.n_keys} - meta['lang'] = nlp.lang - meta['pipeline'] = pipeline - meta['spacy_version'] = '>=%s' % about.__version__ - meta.setdefault('name', 'model%d' % i) - meta.setdefault('version', version) + cpu_wps = nwords / (end_time - start_time) + acc_loc = output_path / ("model%d" % i) / "accuracy.json" + util.write_json(acc_loc, scorer.scores) - with meta_loc.open('w') as file_: - file_.write(json_dumps(meta)) - util.set_env_log(True) - print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, - gpu_wps=gpu_wps) + # Update model meta.json + meta["lang"] = nlp.lang + meta["pipeline"] = nlp.pipe_names + meta["spacy_version"] = ">=%s" % about.__version__ + meta["accuracy"] = scorer.scores + meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps} + meta["vectors"] = { + "width": nlp.vocab.vectors_length, + "vectors": len(nlp.vocab.vectors), + "keys": nlp.vocab.vectors.n_keys, + } + meta.setdefault("name", "model%d" % i) + meta.setdefault("version", version) + meta_loc = output_path / ("model%d" % i) / "meta.json" + util.write_json(meta_loc, meta) + + util.set_env_log(verbose) + + 
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps) finally: - print("Saving model...") - with nlp.use_params(optimizer.averages): - final_model_path = output_path / 'model-final' - nlp.to_disk(final_model_path) - components = [] - if not no_parser: - components.append('parser') - if not no_tagger: - components.append('tagger') - if not no_entities: - components.append('ner') - _collate_best_model(meta, output_path, components) + with msg.loading(Messages.M061): + with nlp.use_params(optimizer.averages): + final_model_path = output_path / "model-final" + nlp.to_disk(final_model_path) + msg.good(Messages.M066, util.path2str(final_model_path)) + + _collate_best_model(meta, output_path, nlp.pipe_names) + + +def _load_vectors(nlp, vectors): + util.load_model(vectors, vocab=nlp.vocab) + for lex in nlp.vocab: + values = {} + for attr, func in nlp.vocab.lex_attr_getters.items(): + # These attrs are expected to be set by data. Others should + # be set by calling the language functions. + if attr not in (CLUSTER, PROB, IS_OOV, LANG): + values[lex.vocab.strings[attr]] = func(lex.orth_) + lex.set_attrs(**values) + lex.is_oov = False def _load_pretrained_tok2vec(nlp, loc): """Load pre-trained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. """ - with loc.open('rb') as file_: + with loc.open("rb") as file_: weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): + if hasattr(component, "model") and hasattr(component.model, "tok2vec"): component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded @@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components): bests = {} for component in components: bests[component] = _find_best(output_path, component) - best_dest = output_path / 'model-best' - shutil.copytree(output_path / 'model-final', best_dest) + best_dest = output_path / "model-best" + shutil.copytree(output_path / "model-final", best_dest) for component, best_component_src in bests.items(): shutil.rmtree(best_dest / component) shutil.copytree(best_component_src / component, best_dest / component) - with (best_component_src / 'accuracy.json').open() as file_: - accs = json.load(file_) + accs = util.read_json(best_component_src / "accuracy.json") for metric in _get_metrics(component): - meta['accuracy'][metric] = accs[metric] - with (best_dest / 'meta.json').open('w') as file_: - file_.write(json_dumps(meta)) + meta["accuracy"][metric] = accs[metric] + util.write_json(best_dest / "meta.json", meta) def _find_best(experiment_dir, component): accuracies = [] for epoch_model in experiment_dir.iterdir(): if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = json.load((epoch_model / "accuracy.json").open()) + accs = util.read_json(epoch_model / "accuracy.json") scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] accuracies.append((scores, epoch_model)) if accuracies: @@ -247,6 +328,7 @@ def _find_best(experiment_dir, component): else: return None + def _get_metrics(component): if component == "parser": return ("las", "uas", "token_acc") @@ -257,50 +339,40 @@ def _get_metrics(component): return ("token_acc",) -def _render_parses(i, to_render): - to_render[0].user_data['title'] = "Batch %d" % i - with Path('/tmp/entities.html').open('w') as file_: - html = displacy.render(to_render[:5], style='ent', page=True) - file_.write(html) - with 
Path('/tmp/parses.html').open('w') as file_: - html = displacy.render(to_render[:5], style='dep', page=True) - file_.write(html) - - def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): scores = {} - for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', - 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: + for col in [ + "dep_loss", + "tag_loss", + "uas", + "tags_acc", + "token_acc", + "ents_p", + "ents_r", + "ents_f", + "cpu_wps", + "gpu_wps", + ]: scores[col] = 0.0 - scores['dep_loss'] = losses.get('parser', 0.0) - scores['ner_loss'] = losses.get('ner', 0.0) - scores['tag_loss'] = losses.get('tagger', 0.0) + scores["dep_loss"] = losses.get("parser", 0.0) + scores["ner_loss"] = losses.get("ner", 0.0) + scores["tag_loss"] = losses.get("tagger", 0.0) scores.update(dev_scores) - scores['cpu_wps'] = cpu_wps - scores['gpu_wps'] = gpu_wps or 0.0 - tpl = ''.join(( - '{:<6d}', - '{dep_loss:<10.3f}', - '{ner_loss:<10.3f}', - '{uas:<8.3f}', - '{ents_p:<8.3f}', - '{ents_r:<8.3f}', - '{ents_f:<8.3f}', - '{tags_acc:<8.3f}', - '{token_acc:<9.3f}', - '{cpu_wps:<9.1f}', - '{gpu_wps:.1f}', - )) + scores["cpu_wps"] = cpu_wps + scores["gpu_wps"] = gpu_wps or 0.0 + tpl = "".join( + ( + "{:<6d}", + "{dep_loss:<10.3f}", + "{ner_loss:<10.3f}", + "{uas:<8.3f}", + "{ents_p:<8.3f}", + "{ents_r:<8.3f}", + "{ents_f:<8.3f}", + "{tags_acc:<8.3f}", + "{token_acc:<9.3f}", + "{cpu_wps:<9.1f}", + "{gpu_wps:.1f}", + ) + ) print(tpl.format(itn, **scores)) - - -def print_results(scorer): - results = { - 'TOK': '%.2f' % scorer.token_acc, - 'POS': '%.2f' % scorer.tags_acc, - 'UAS': '%.2f' % scorer.uas, - 'LAS': '%.2f' % scorer.las, - 'NER P': '%.2f' % scorer.ents_p, - 'NER R': '%.2f' % scorer.ents_r, - 'NER F': '%.2f' % scorer.ents_f} - util.print_table(results, title="Results") diff --git a/spacy/cli/ud/__init__.py b/spacy/cli/ud/__init__.py new file mode 100644 index 000000000..119c46ba4 --- /dev/null +++ b/spacy/cli/ud/__init__.py @@ -0,0 +1,2 @@ +from .conll17_ud_eval import main as ud_evaluate # noqa: F401 +from .ud_train import main as ud_train # noqa: F401 diff --git a/spacy/cli/conll17_ud_eval.py b/spacy/cli/ud/conll17_ud_eval.py similarity index 99% rename from spacy/cli/conll17_ud_eval.py rename to spacy/cli/ud/conll17_ud_eval.py index 3a41f99dc..2f8e632f0 100644 --- a/spacy/cli/conll17_ud_eval.py +++ b/spacy/cli/ud/conll17_ud_eval.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# flake8: noqa # CoNLL 2017 UD Parsing evaluation script. # @@ -214,7 +215,7 @@ def load_conllu(file): start, end = map(int, columns[ID].split("-")) except: raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) - + for _ in range(start, end + 1): word_line = file.readline().rstrip("\r\n") word_columns = word_line.split("\t") diff --git a/spacy/cli/ud_run_test.py b/spacy/cli/ud/ud_run_test.py similarity index 55% rename from spacy/cli/ud_run_test.py rename to spacy/cli/ud/ud_run_test.py index eed0ab1ce..f36df2f80 100644 --- a/spacy/cli/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -1,7 +1,9 @@ -'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes +# flake8: noqa +"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes .conllu format for development data, allowing the official scorer to be used. 
-''' +""" from __future__ import unicode_literals + import plac import tqdm from pathlib import Path @@ -11,15 +13,17 @@ import json import spacy import spacy.util -from ..tokens import Token, Doc -from ..gold import GoldParse -from ..util import compounding, minibatch_by_words -from ..syntax.nonproj import projectivize -from ..matcher import Matcher -#from ..morphology import Fused_begin, Fused_inside -from .. import displacy +from ...tokens import Token, Doc +from ...gold import GoldParse +from ...util import compounding, minibatch_by_words +from ...syntax.nonproj import projectivize +from ...matcher import Matcher + +# from ...morphology import Fused_begin, Fused_inside +from ... import displacy from collections import defaultdict, Counter from timeit import default_timer as timer + Fused_begin = None Fused_inside = None @@ -30,43 +34,45 @@ import cytoolz from . import conll17_ud_eval -from .. import lang -from .. import lang -from ..lang import zh -from ..lang import ja -from ..lang import ru +from ... import lang +from ...lang import zh +from ...lang import ja +from ...lang import ru ################ # Data reading # ################ -space_re = re.compile('\s+') +space_re = re.compile("\s+") + + def split_text(text): - return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')] - + return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] + ############## # Evaluation # ############## + def read_conllu(file_): docs = [] sent = [] doc = [] for line in file_: - if line.startswith('# newdoc'): + if line.startswith("# newdoc"): if doc: docs.append(doc) doc = [] - elif line.startswith('#'): + elif line.startswith("#"): continue elif not line.strip(): if sent: doc.append(sent) sent = [] else: - sent.append(list(line.strip().split('\t'))) + sent.append(list(line.strip().split("\t"))) if len(sent[-1]) != 10: print(repr(line)) raise ValueError @@ -78,7 +84,7 @@ def read_conllu(file_): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith('.conllu'): + if text_loc.parts[-1].endswith(".conllu"): docs = [] with text_loc.open() as file_: for conllu_doc in read_conllu(file_): @@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): for name, component in nlp.pipeline: docs = list(component.pipe(docs)) else: - with text_loc.open('r', encoding='utf8') as text_file: + with text_loc.open("r", encoding="utf8") as text_file: texts = split_text(text_file.read()) docs = list(nlp.pipe(texts)) - with sys_loc.open('w', encoding='utf8') as out_file: + with sys_loc.open("w", encoding="utf8") as out_file: write_conllu(docs, out_file) - with gold_loc.open('r', encoding='utf8') as gold_file: + with gold_loc.open("r", encoding="utf8") as gold_file: gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open('r', encoding='utf8') as sys_file: + with sys_loc.open("r", encoding="utf8") as sys_file: sys_ud = conll17_ud_eval.load_conllu(sys_file) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) return docs, scores @@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def write_conllu(docs, file_): merger = Matcher(docs[0].vocab) - merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) + merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) for i, doc in enumerate(docs): matches = merger(doc) - spans = [doc[start:end+1] for _, start, end in matches] + spans = [doc[start : end + 1] for _, start, end in matches] offsets = [(span.start_char, span.end_char) for span in spans] for start_char, 
end_char in offsets: doc.merge(start_char, end_char) # TODO: This shuldn't be necessary? Should be handled in merge for word in doc: if word.i == word.head.i: - word.dep_ = 'ROOT' + word.dep_ = "ROOT" file_.write("# newdoc id = {i}\n".format(i=i)) for j, sent in enumerate(doc.sents): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# text = {text}\n".format(text=sent.text)) for k, token in enumerate(sent): - file_.write(_get_token_conllu(token, k, len(sent)) + '\n') - file_.write('\n') + file_.write(_get_token_conllu(token, k, len(sent)) + "\n") + file_.write("\n") for word in sent: - if word.head.i == word.i and word.dep_ == 'ROOT': + if word.head.i == word.i and word.dep_ == "ROOT": break else: print("Rootless sentence!") @@ -134,24 +140,34 @@ def write_conllu(docs, file_): def _get_token_conllu(token, k, sent_len): - if token.check_morph(Fused_begin) and (k+1 < sent_len): + if token.check_morph(Fused_begin) and (k + 1 < sent_len): n = 1 text = [token.text] while token.nbor(n).check_morph(Fused_inside): text.append(token.nbor(n).text) n += 1 - id_ = '%d-%d' % (k+1, (k+n)) - fields = [id_, ''.join(text)] + ['_'] * 8 - lines = ['\t'.join(fields)] + id_ = "%d-%d" % (k + 1, (k + n)) + fields = [id_, "".join(text)] + ["_"] * 8 + lines = ["\t".join(fields)] else: lines = [] if token.head.i == token.i: head = 0 else: head = k + (token.head.i - token.i) + 1 - fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_', - str(head), token.dep_.lower(), '_', '_'] - if token.check_morph(Fused_begin) and (k+1 < sent_len): + fields = [ + str(k + 1), + token.text, + token.lemma_, + token.pos_, + token.tag_, + "_", + str(head), + token.dep_.lower(), + "_", + "_", + ] + if token.check_morph(Fused_begin) and (k + 1 < sent_len): if k == 0: fields[1] = token.norm_[0].upper() + token.norm_[1:] else: @@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len): split_end = token._.split_end split_len = (split_end.i - split_start.i) + 1 n_in_split = token.i - split_start.i - subtokens = guess_fused_orths(split_start.text, [''] * split_len) + subtokens = guess_fused_orths(split_start.text, [""] * split_len) fields[1] = subtokens[n_in_split] - lines.append('\t'.join(fields)) - return '\n'.join(lines) + lines.append("\t".join(fields)) + return "\n".join(lines) def guess_fused_orths(word, ud_forms): - '''The UD data 'fused tokens' don't necessarily expand to keys that match + """The UD data 'fused tokens' don't necessarily expand to keys that match the form. We need orths that exact match the string. Here we make a best - effort to divide up the word.''' - if word == ''.join(ud_forms): + effort to divide up the word.""" + if word == "".join(ud_forms): # Happy case: we get a perfect split, with each letter accounted for. return ud_forms elif len(word) == sum(len(subtoken) for subtoken in ud_forms): @@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms): remain = word for subtoken in ud_forms: assert len(subtoken) >= 1 - output.append(remain[:len(subtoken)]) - remain = remain[len(subtoken):] + output.append(remain[: len(subtoken)]) + remain = remain[len(subtoken) :] assert len(remain) == 0, (word, ud_forms, remain) return output else: # Let's say word is 6 long, and there are three subtokens. The orths # *must* equal the original string. 
Arbitrarily, split [4, 1, 1] - first = word[:len(word)-(len(ud_forms)-1)] + first = word[: len(word) - (len(ud_forms) - 1)] output = [first] - remain = word[len(first):] + remain = word[len(first) :] for i in range(1, len(ud_forms)): assert remain output.append(remain[:1]) @@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms): return output - def print_results(name, ud_scores): fields = {} if ud_scores is not None: - fields.update({ - 'words': ud_scores['Words'].f1 * 100, - 'sents': ud_scores['Sentences'].f1 * 100, - 'tags': ud_scores['XPOS'].f1 * 100, - 'uas': ud_scores['UAS'].f1 * 100, - 'las': ud_scores['LAS'].f1 * 100, - }) + fields.update( + { + "words": ud_scores["Words"].f1 * 100, + "sents": ud_scores["Sentences"].f1 * 100, + "tags": ud_scores["XPOS"].f1 * 100, + "uas": ud_scores["UAS"].f1 * 100, + "las": ud_scores["LAS"].f1 * 100, + } + ) else: - fields.update({ - 'words': 0.0, - 'sents': 0.0, - 'tags': 0.0, - 'uas': 0.0, - 'las': 0.0 - }) - tpl = '\t'.join(( - name, - '{las:.1f}', - '{uas:.1f}', - '{tags:.1f}', - '{sents:.1f}', - '{words:.1f}', - )) + fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0}) + tpl = "\t".join( + (name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}") + ) print(tpl.format(**fields)) return fields def get_token_split_start(token): - if token.text == '': + if token.text == "": assert token.i != 0 i = -1 - while token.nbor(i).text == '': + while token.nbor(i).text == "": i -= 1 return token.nbor(i) - elif (token.i+1) < len(token.doc) and token.nbor(1).text == '': + elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "": return token else: return None def get_token_split_end(token): - if (token.i+1) == len(token.doc): - return token if token.text == '' else None - elif token.text != '' and token.nbor(1).text != '': + if (token.i + 1) == len(token.doc): + return token if token.text == "" else None + elif token.text != "" and token.nbor(1).text != "": return None i = 1 - while (token.i+i) < len(token.doc) and token.nbor(i).text == '': + while (token.i + i) < len(token.doc) and token.nbor(i).text == "": i += 1 - return token.nbor(i-1) - + return token.nbor(i - 1) + ################## # Initialization # @@ -262,54 +268,73 @@ def get_token_split_end(token): def load_nlp(experiments_dir, corpus): - nlp = spacy.load(experiments_dir / corpus / 'best-model') + nlp = spacy.load(experiments_dir / corpus / "best-model") return nlp + def initialize_pipeline(nlp, docs, golds, config, device): - nlp.add_pipe(nlp.create_pipe('parser')) + nlp.add_pipe(nlp.create_pipe("parser")) return nlp @plac.annotations( - test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path), + test_data_dir=( + "Path to Universal Dependencies test data", + "positional", + None, + Path, + ), experiment_dir=("Parent directory with output model", "positional", None, Path), - corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str), + corpus=( + "UD corpus to evaluate, e.g. 
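A couple of worked cases make the branches of guess_fused_orths easier to follow; the inputs are illustrative fused tokens, not pulled from any particular treebank:

    # Perfect split: the UD forms join back to the surface form exactly.
    assert guess_fused_orths("cannot", ["can", "not"]) == ["can", "not"]
    # No exact character split exists for "im" -> ["in", "dem"], so the
    # fallback branch keeps as much as possible in the first orth and
    # gives one character to each remaining form.
    assert guess_fused_orths("im", ["in", "dem"]) == ["i", "m"]
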
UD_English, UD_Spanish, etc", + "positional", + None, + str, + ), ) def main(test_data_dir, experiment_dir, corpus): - Token.set_extension('split_start', getter=get_token_split_start) - Token.set_extension('split_end', getter=get_token_split_end) - Token.set_extension('begins_fused', default=False) - Token.set_extension('inside_fused', default=False) + Token.set_extension("split_start", getter=get_token_split_start) + Token.set_extension("split_end", getter=get_token_split_end) + Token.set_extension("begins_fused", default=False) + Token.set_extension("inside_fused", default=False) lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False lang.ru.Russian.Defaults.use_pymorphy2 = False nlp = load_nlp(experiment_dir, corpus) - - treebank_code = nlp.meta['treebank'] - for section in ('test', 'dev'): - if section == 'dev': - section_dir = 'conll17-ud-development-2017-03-19' - else: - section_dir = 'conll17-ud-test-2017-05-09' - text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt') - udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu') - gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu') - header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] - print('\t'.join(header)) - inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path} - for input_type in ('udp', 'raw'): + treebank_code = nlp.meta["treebank"] + for section in ("test", "dev"): + if section == "dev": + section_dir = "conll17-ud-development-2017-03-19" + else: + section_dir = "conll17-ud-test-2017-05-09" + text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt") + udpipe_path = ( + test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu") + ) + gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu") + + header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"] + print("\t".join(header)) + inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path} + for input_type in ("udp", "raw"): input_path = inputs[input_type] - output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section) + output_path = ( + experiment_dir / corpus / "{section}.conllu".format(section=section) + ) parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path) accuracy = print_results(input_type, test_scores) - acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section) - with open(acc_path, 'w') as file_: + acc_path = ( + experiment_dir + / corpus + / "{section}-accuracy.json".format(section=section) + ) + with open(acc_path, "w") as file_: file_.write(json.dumps(accuracy, indent=2)) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/spacy/cli/ud_train.py b/spacy/cli/ud/ud_train.py similarity index 57% rename from spacy/cli/ud_train.py rename to spacy/cli/ud/ud_train.py index 424fa6e2e..746607be0 100644 --- a/spacy/cli/ud_train.py +++ b/spacy/cli/ud/ud_train.py @@ -1,7 +1,9 @@ -'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes +# flake8: noqa +"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes .conllu format for development data, allowing the official scorer to be used. 
-''' +""" from __future__ import unicode_literals + import plac import tqdm from pathlib import Path @@ -11,12 +13,12 @@ import json import spacy import spacy.util -from ..tokens import Token, Doc -from ..gold import GoldParse -from ..util import compounding, minibatch, minibatch_by_words -from ..syntax.nonproj import projectivize -from ..matcher import Matcher -from .. import displacy +from ...tokens import Token, Doc +from ...gold import GoldParse +from ...util import compounding, minibatch, minibatch_by_words +from ...syntax.nonproj import projectivize +from ...matcher import Matcher +from ... import displacy from collections import defaultdict, Counter from timeit import default_timer as timer @@ -27,10 +29,9 @@ import cytoolz from . import conll17_ud_eval -from .. import lang -from .. import lang -from ..lang import zh -from ..lang import ja +from ... import lang +from ...lang import zh +from ...lang import ja try: import torch @@ -42,17 +43,26 @@ except ImportError: # Data reading # ################ -space_re = re.compile('\s+') -def split_text(text): - return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')] - +space_re = re.compile("\s+") -def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, - max_doc_length=None, limit=None): - '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, + +def split_text(text): + return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")] + + +def read_data( + nlp, + conllu_file, + text_file, + raw_text=True, + oracle_segments=False, + max_doc_length=None, + limit=None, +): + """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, include Doc objects created using nlp.make_doc and then aligned against the gold-standard sequences. If oracle_segments=True, include Doc objects - created from the gold-standard segments. At least one must be True.''' + created from the gold-standard segments. At least one must be True.""" if not raw_text and not oracle_segments: raise ValueError("At least one of raw_text or oracle_segments must be True") paragraphs = split_text(text_file.read()) @@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, for cs in cd: sent = defaultdict(list) for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: - if '.' in id_: + if "." 
in id_: continue - if '-' in id_: + if "-" in id_: continue - id_ = int(id_)-1 - head = int(head)-1 if head != '0' else id_ - sent['words'].append(word) - sent['tags'].append(tag) - sent['heads'].append(head) - sent['deps'].append('ROOT' if dep == 'root' else dep) - sent['spaces'].append(space_after == '_') - sent['entities'] = ['-'] * len(sent['words']) - sent['heads'], sent['deps'] = projectivize(sent['heads'], - sent['deps']) + id_ = int(id_) - 1 + head = int(head) - 1 if head != "0" else id_ + sent["words"].append(word) + sent["tags"].append(tag) + sent["heads"].append(head) + sent["deps"].append("ROOT" if dep == "root" else dep) + sent["spaces"].append(space_after == "_") + sent["entities"] = ["-"] * len(sent["words"]) + sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"]) if oracle_segments: - docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])) + docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) golds.append(GoldParse(docs[-1], **sent)) sent_annots.append(sent) @@ -107,18 +116,18 @@ def read_conllu(file_): sent = [] doc = [] for line in file_: - if line.startswith('# newdoc'): + if line.startswith("# newdoc"): if doc: docs.append(doc) doc = [] - elif line.startswith('#'): + elif line.startswith("#"): continue elif not line.strip(): if sent: doc.append(sent) sent = [] else: - sent.append(list(line.strip().split('\t'))) + sent.append(list(line.strip().split("\t"))) if len(sent[-1]) != 10: print(repr(line)) raise ValueError @@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): flat = defaultdict(list) sent_starts = [] for sent in sent_annots: - flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) - for field in ['words', 'tags', 'deps', 'entities', 'spaces']: + flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"]) + for field in ["words", "tags", "deps", "entities", "spaces"]: flat[field].extend(sent[field]) sent_starts.append(True) - sent_starts.extend([False] * (len(sent['words'])-1)) + sent_starts.extend([False] * (len(sent["words"]) - 1)) # Construct text if necessary - assert len(flat['words']) == len(flat['spaces']) + assert len(flat["words"]) == len(flat["spaces"]) if text is None: - text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) + text = "".join( + word + " " * space for word, space in zip(flat["words"], flat["spaces"]) + ) doc = nlp.make_doc(text) - flat.pop('spaces') + flat.pop("spaces") gold = GoldParse(doc, **flat) gold.sent_starts = sent_starts for i in range(len(gold.heads)): @@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0): return doc, gold + ############################# # Data transforms for spaCy # ############################# + def golds_to_gold_tuples(docs, golds): - '''Get out the annoying 'tuples' format used by begin_training, given the - GoldParse objects.''' + """Get out the annoying 'tuples' format used by begin_training, given the + GoldParse objects.""" tuples = [] for doc, gold in zip(docs, golds): text = doc.text @@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds): # Evaluation # ############## + def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): - if text_loc.parts[-1].endswith('.conllu'): + if text_loc.parts[-1].endswith(".conllu"): docs = [] with text_loc.open() as file_: for conllu_doc in read_conllu(file_): @@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): for name, component in nlp.pipeline: docs = 
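The head bookkeeping in read_data above is the part that is easiest to get wrong: CoNLL-U heads are 1-based with "0" marking the root, while the GoldParse format wants 0-based token indices with the root pointing at itself. A small self-contained illustration with invented values:

    # (ID, HEAD) columns for a three-token sentence; token 2 is the root.
    conllu_cols = [("1", "2"), ("2", "0"), ("3", "2")]
    heads = []
    for id_, head in conllu_cols:
        id_ = int(id_) - 1
        heads.append(int(head) - 1 if head != "0" else id_)
    assert heads == [1, 1, 1]  # all three tokens now point at index 1
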
list(component.pipe(docs)) else: - with text_loc.open('r', encoding='utf8') as text_file: + with text_loc.open("r", encoding="utf8") as text_file: texts = split_text(text_file.read()) docs = list(nlp.pipe(texts)) - with sys_loc.open('w', encoding='utf8') as out_file: + with sys_loc.open("w", encoding="utf8") as out_file: write_conllu(docs, out_file) - with gold_loc.open('r', encoding='utf8') as gold_file: + with gold_loc.open("r", encoding="utf8") as gold_file: gold_ud = conll17_ud_eval.load_conllu(gold_file) - with sys_loc.open('r', encoding='utf8') as sys_file: + with sys_loc.open("r", encoding="utf8") as sys_file: sys_ud = conll17_ud_eval.load_conllu(sys_file) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) return docs, scores @@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def write_conllu(docs, file_): merger = Matcher(docs[0].vocab) - merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) + merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}]) for i, doc in enumerate(docs): matches = merger(doc) - spans = [doc[start:end+1] for _, start, end in matches] + spans = [doc[start : end + 1] for _, start, end in matches] offsets = [(span.start_char, span.end_char) for span in spans] for start_char, end_char in offsets: doc.merge(start_char, end_char) @@ -213,65 +227,82 @@ def write_conllu(docs, file_): file_.write("# text = {text}\n".format(text=sent.text)) for k, token in enumerate(sent): if token.head.i > sent[-1].i or token.head.i < sent[0].i: - for word in doc[sent[0].i-10 : sent[0].i]: + for word in doc[sent[0].i - 10 : sent[0].i]: print(word.i, word.head.i, word.text, word.dep_) for word in sent: print(word.i, word.head.i, word.text, word.dep_) - for word in doc[sent[-1].i : sent[-1].i+10]: + for word in doc[sent[-1].i : sent[-1].i + 10]: print(word.i, word.head.i, word.text, word.dep_) - raise ValueError("Invalid parse: head outside sentence (%s)" % token.text) - file_.write(token._.get_conllu_lines(k) + '\n') - file_.write('\n') + raise ValueError( + "Invalid parse: head outside sentence (%s)" % token.text + ) + file_.write(token._.get_conllu_lines(k) + "\n") + file_.write("\n") def print_progress(itn, losses, ud_scores): fields = { - 'dep_loss': losses.get('parser', 0.0), - 'tag_loss': losses.get('tagger', 0.0), - 'words': ud_scores['Words'].f1 * 100, - 'sents': ud_scores['Sentences'].f1 * 100, - 'tags': ud_scores['XPOS'].f1 * 100, - 'uas': ud_scores['UAS'].f1 * 100, - 'las': ud_scores['LAS'].f1 * 100, + "dep_loss": losses.get("parser", 0.0), + "tag_loss": losses.get("tagger", 0.0), + "words": ud_scores["Words"].f1 * 100, + "sents": ud_scores["Sentences"].f1 * 100, + "tags": ud_scores["XPOS"].f1 * 100, + "uas": ud_scores["UAS"].f1 * 100, + "las": ud_scores["LAS"].f1 * 100, } - header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] + header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] if itn == 0: - print('\t'.join(header)) - tpl = '\t'.join(( - '{:d}', - '{dep_loss:.1f}', - '{las:.1f}', - '{uas:.1f}', - '{tags:.1f}', - '{sents:.1f}', - '{words:.1f}', - )) + print("\t".join(header)) + tpl = "\t".join( + ( + "{:d}", + "{dep_loss:.1f}", + "{las:.1f}", + "{uas:.1f}", + "{tags:.1f}", + "{sents:.1f}", + "{words:.1f}", + ) + ) print(tpl.format(itn, **fields)) -#def get_sent_conllu(sent, sent_id): + +# def get_sent_conllu(sent, sent_id): # lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] + def get_token_conllu(token, i): if token._.begins_fused: n = 1 while token.nbor(n)._.inside_fused: n += 1 - id_ = '%d-%d' % (i, 
i+n) - lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_'] + id_ = "%d-%d" % (i, i + n) + lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"] else: lines = [] if token.head.i == token.i: head = 0 else: head = i + (token.head.i - token.i) + 1 - fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_', - str(head), token.dep_.lower(), '_', '_'] - lines.append('\t'.join(fields)) - return '\n'.join(lines) + fields = [ + str(i + 1), + token.text, + token.lemma_, + token.pos_, + token.tag_, + "_", + str(head), + token.dep_.lower(), + "_", + "_", + ] + lines.append("\t".join(fields)) + return "\n".join(lines) -Token.set_extension('get_conllu_lines', method=get_token_conllu) -Token.set_extension('begins_fused', default=False) -Token.set_extension('inside_fused', default=False) + +Token.set_extension("get_conllu_lines", method=get_token_conllu) +Token.set_extension("begins_fused", default=False) +Token.set_extension("inside_fused", default=False) ################## @@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False) def load_nlp(corpus, config, vectors=None): - lang = corpus.split('_')[0] + lang = corpus.split("_")[0] nlp = spacy.blank(lang) if config.vectors: - if not vectors: - raise ValueError("config asks for vectors, but no vectors " - "directory set on command line (use -v)") + if not vectors: + raise ValueError( + "config asks for vectors, but no vectors " + "directory set on command line (use -v)" + ) if (Path(vectors) / corpus).exists(): - nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab') - nlp.meta['treebank'] = corpus + nlp.vocab.from_disk(Path(vectors) / corpus / "vocab") + nlp.meta["treebank"] = corpus return nlp - + def initialize_pipeline(nlp, docs, golds, config, device): - nlp.add_pipe(nlp.create_pipe('tagger')) - nlp.add_pipe(nlp.create_pipe('parser')) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.add_pipe(nlp.create_pipe("parser")) if config.multitask_tag: - nlp.parser.add_multitask_objective('tag') + nlp.parser.add_multitask_objective("tag") if config.multitask_sent: - nlp.parser.add_multitask_objective('sent_start') + nlp.parser.add_multitask_objective("sent_start") for gold in golds: for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) if torch is not None and device != -1: - torch.set_default_tensor_type('torch.cuda.FloatTensor') + torch.set_default_tensor_type("torch.cuda.FloatTensor") optimizer = nlp.begin_training( - lambda: golds_to_gold_tuples(docs, golds), device=device, - subword_features=config.subword_features, conv_depth=config.conv_depth, - bilstm_depth=config.bilstm_depth) + lambda: golds_to_gold_tuples(docs, golds), + device=device, + subword_features=config.subword_features, + conv_depth=config.conv_depth, + bilstm_depth=config.bilstm_depth, + ) if config.pretrained_tok2vec: _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec) return optimizer @@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc): """Load pre-trained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. 
""" - with Path(loc).open('rb') as file_: + with Path(loc).open("rb") as file_: weights_data = file_.read() loaded = [] for name, component in nlp.pipeline: - if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): + if hasattr(component, "model") and hasattr(component.model, "tok2vec"): component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded - ######################## # Command line helpers # ######################## + class Config(object): - def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False, - multitask_sent=False, multitask_dep=False, multitask_vectors=None, - bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750, - batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True, - vectors_dir=None, pretrained_tok2vec=None): + def __init__( + self, + vectors=None, + max_doc_length=10, + multitask_tag=False, + multitask_sent=False, + multitask_dep=False, + multitask_vectors=None, + bilstm_depth=0, + nr_epoch=30, + min_batch_size=100, + max_batch_size=1000, + batch_by_words=True, + dropout=0.2, + conv_depth=4, + subword_features=True, + vectors_dir=None, + pretrained_tok2vec=None, + ): if vectors_dir is not None: if vectors is None: vectors = True @@ -346,13 +396,13 @@ class Config(object): multitask_vectors = True for key, value in locals().items(): setattr(self, key, value) - + @classmethod def load(cls, loc, vectors_dir=None): - with Path(loc).open('r', encoding='utf8') as file_: + with Path(loc).open("r", encoding="utf8") as file_: cfg = json.load(file_) if vectors_dir is not None: - cfg['vectors_dir'] = vectors_dir + cfg["vectors_dir"] = vectors_dir return cls(**cfg) @@ -364,43 +414,59 @@ class Dataset(object): self.text = None for file_path in self.path.iterdir(): name = file_path.parts[-1] - if section in name and name.endswith('conllu'): + if section in name and name.endswith("conllu"): self.conllu = file_path - elif section in name and name.endswith('txt'): + elif section in name and name.endswith("txt"): self.text = file_path if self.conllu is None: msg = "Could not find .txt file in {path} for {section}" raise IOError(msg.format(section=section, path=path)) if self.text is None: msg = "Could not find .txt file in {path} for {section}" - self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0] + self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0] class TreebankPaths(object): def __init__(self, ud_path, treebank, **cfg): - self.train = Dataset(ud_path / treebank, 'train') - self.dev = Dataset(ud_path / treebank, 'dev') + self.train = Dataset(ud_path / treebank, "train") + self.dev = Dataset(ud_path / treebank, "dev") self.lang = self.train.lang @plac.annotations( ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), - corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc", - "positional", None, str), + corpus=( + "UD corpus to train and evaluate on, e.g. en, es_ancora, etc", + "positional", + None, + str, + ), parses_dir=("Directory to write the development parses", "positional", None, Path), config=("Path to json formatted config file", "option", "C", Path), limit=("Size limit", "option", "n", int), gpu_device=("Use GPU", "option", "g", int), use_oracle_segments=("Use oracle segments", "flag", "G", int), - vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/", - "option", "v", Path), + vectors_dir=( + "Path to directory with pre-trained vectors, named e.g. 
en/", + "option", + "v", + Path, + ), ) -def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None, - use_oracle_segments=False): +def main( + ud_dir, + parses_dir, + corpus, + config=None, + limit=0, + gpu_device=-1, + vectors_dir=None, + use_oracle_segments=False, +): spacy.util.fix_random_seed() lang.zh.Chinese.Defaults.use_jieba = False lang.ja.Japanese.Defaults.use_janome = False - + if config is not None: config = Config.load(config, vectors_dir=vectors_dir) else: @@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector print("Train and evaluate", corpus, "using lang", paths.lang) nlp = load_nlp(paths.lang, config, vectors=vectors_dir) - docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), - max_doc_length=config.max_doc_length, - limit=limit) + docs, golds = read_data( + nlp, + paths.train.conllu.open(), + paths.train.text.open(), + max_doc_length=config.max_doc_length, + limit=limit, + ) optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device) batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) beam_prob = compounding(0.2, 0.8, 1.001) for i in range(config.nr_epoch): - docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), - max_doc_length=config.max_doc_length, limit=limit, - oracle_segments=use_oracle_segments, - raw_text=not use_oracle_segments) + docs, golds = read_data( + nlp, + paths.train.conllu.open(), + paths.train.text.open(), + max_doc_length=config.max_doc_length, + limit=limit, + oracle_segments=use_oracle_segments, + raw_text=not use_oracle_segments, + ) Xs = list(zip(docs, golds)) random.shuffle(Xs) if config.batch_by_words: @@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector for batch in batches: batch_docs, batch_gold = zip(*batch) pbar.update(sum(len(doc) for doc in batch_docs)) - nlp.parser.cfg['beam_update_prob'] = next(beam_prob) - nlp.update(batch_docs, batch_gold, sgd=optimizer, - drop=config.dropout, losses=losses) - - out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i) + nlp.parser.cfg["beam_update_prob"] = next(beam_prob) + nlp.update( + batch_docs, + batch_gold, + sgd=optimizer, + drop=config.dropout, + losses=losses, + ) + + out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) with nlp.use_params(optimizer.averages): if use_oracle_segments: - parsed_docs, scores = evaluate(nlp, paths.dev.conllu, - paths.dev.conllu, out_path) + parsed_docs, scores = evaluate( + nlp, paths.dev.conllu, paths.dev.conllu, out_path + ) else: - parsed_docs, scores = evaluate(nlp, paths.dev.text, - paths.dev.conllu, out_path) + parsed_docs, scores = evaluate( + nlp, paths.dev.text, paths.dev.conllu, out_path + ) print_progress(i, losses, scores) def _render_parses(i, to_render): - to_render[0].user_data['title'] = "Batch %d" % i - with Path('/tmp/parses.html').open('w') as file_: - html = displacy.render(to_render[:5], style='dep', page=True) + to_render[0].user_data["title"] = "Batch %d" % i + with Path("/tmp/parses.html").open("w") as file_: + html = displacy.render(to_render[:5], style="dep", page=True) file_.write(html) -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index 6b0765c3e..caeaf5ca9 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function import pkg_resources 
from pathlib import Path import sys -import ujson import requests +from wasabi import Printer from ._messages import Messages -from ..compat import path2str, locale_escape -from ..util import prints, get_data_path, read_json +from ..compat import path2str +from ..util import get_data_path, read_json from .. import about def validate(): - """Validate that the currently installed version of spaCy is compatible + """ + Validate that the currently installed version of spaCy is compatible with the installed models. Should be run after `pip install -U spacy`. """ - r = requests.get(about.__compatibility__) - if r.status_code != 200: - prints(Messages.M021, title=Messages.M003.format(code=r.status_code), - exits=1) - compat = r.json()['spacy'] + msg = Printer() + with msg.loading("Loading compatibility table..."): + r = requests.get(about.__compatibility__) + if r.status_code != 200: + msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1) + msg.good("Loaded compatibility table") + compat = r.json()["spacy"] current_compat = compat.get(about.__version__) if not current_compat: - prints(about.__compatibility__, exits=1, - title=Messages.M022.format(version=about.__version__)) + msg.fail( + Messages.M022.format(version=about.__version__), + about.__compatibility__, + exits=1, + ) all_models = set() for spacy_v, models in dict(compat).items(): all_models.update(models.keys()) @@ -33,33 +39,38 @@ def validate(): compat[spacy_v][model] = [reformat_version(v) for v in model_vs] model_links = get_model_links(current_compat) model_pkgs = get_model_pkgs(current_compat, all_models) - incompat_links = {l for l, d in model_links.items() if not d['compat']} - incompat_models = {d['name'] for _, d in model_pkgs.items() - if not d['compat']} - incompat_models.update([d['name'] for _, d in model_links.items() - if not d['compat']]) + incompat_links = {l for l, d in model_links.items() if not d["compat"]} + incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]} + incompat_models.update( + [d["name"] for _, d in model_links.items() if not d["compat"]] + ) na_models = [m for m in incompat_models if m not in current_compat] update_models = [m for m in incompat_models if m in current_compat] + spacy_dir = Path(__file__).parent.parent + + msg.divider(Messages.M023.format(version=about.__version__)) + msg.info("spaCy installation: {}".format(path2str(spacy_dir))) - prints(path2str(Path(__file__).parent.parent), - title=Messages.M023.format(version=about.__version__)) if model_links or model_pkgs: - print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) + header = ("TYPE", "NAME", "MODEL", "VERSION", "") + rows = [] for name, data in model_pkgs.items(): - print(get_model_row(current_compat, name, data, 'package')) + rows.append(get_model_row(current_compat, name, data, msg)) for name, data in model_links.items(): - print(get_model_row(current_compat, name, data, 'link')) + rows.append(get_model_row(current_compat, name, data, msg, "link")) + msg.table(rows, header=header) else: - prints(Messages.M024, exits=0) + msg.text(Messages.M024, exits=0) if update_models: - cmd = ' python -m spacy download {}' - print("\n " + Messages.M025) - print('\n'.join([cmd.format(pkg) for pkg in update_models])) + msg.divider("Install updates") + cmd = "python -m spacy download {}" + print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") if na_models: - prints(Messages.M025.format(version=about.__version__, - models=', '.join(na_models))) + msg.text( + 
Messages.M025.format(version=about.__version__, models=", ".join(na_models)) + ) if incompat_links: - prints(Messages.M027.format(path=path2str(get_data_path()))) + msg.text(Messages.M027.format(path=path2str(get_data_path()))) if incompat_models or incompat_links: sys.exit(1) @@ -70,50 +81,48 @@ def get_model_links(compat): if data_path: models = [p for p in data_path.iterdir() if is_model_path(p)] for model in models: - meta_path = Path(model) / 'meta.json' + meta_path = Path(model) / "meta.json" if not meta_path.exists(): continue meta = read_json(meta_path) link = model.parts[-1] - name = meta['lang'] + '_' + meta['name'] - links[link] = {'name': name, 'version': meta['version'], - 'compat': is_compat(compat, name, meta['version'])} + name = meta["lang"] + "_" + meta["name"] + links[link] = { + "name": name, + "version": meta["version"], + "compat": is_compat(compat, name, meta["version"]), + } return links def get_model_pkgs(compat, all_models): pkgs = {} for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): - package = pkg_name.replace('-', '_') + package = pkg_name.replace("-", "_") if package in all_models: version = pkg_data.version - pkgs[pkg_name] = {'name': package, 'version': version, - 'compat': is_compat(compat, package, version)} + pkgs[pkg_name] = { + "name": package, + "version": version, + "compat": is_compat(compat, package, version), + } return pkgs -def get_model_row(compat, name, data, type='package'): - tpl_red = '\x1b[38;5;1m{}\x1b[0m' - tpl_green = '\x1b[38;5;2m{}\x1b[0m' - if data['compat']: - comp = tpl_green.format(locale_escape('✔', errors='ignore')) - version = tpl_green.format(data['version']) +def get_model_row(compat, name, data, msg, model_type="package"): + if data["compat"]: + comp = msg.text("", color="green", icon="good", no_print=True) + version = msg.text(data["version"], color="green", no_print=True) else: - comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0]) - version = tpl_red.format(data['version']) - return get_row(type, name, data['name'], version, comp) - - -def get_row(*args): - tpl_row = ' {:<10}' + (' {:<20}' * 4) - return tpl_row.format(*args) + version = msg.text(data["version"], color="red", no_print=True) + comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0]) + return (model_type, name, data["name"], version, comp) def is_model_path(model_path): - exclude = ['cache', 'pycache', '__pycache__'] + exclude = ["cache", "pycache", "__pycache__"] name = model_path.parts[-1] - return (model_path.is_dir() and name not in exclude - and not name.startswith('.')) + return model_path.is_dir() and name not in exclude and not name.startswith(".") def is_compat(compat, name, version): @@ -122,6 +131,6 @@ def is_compat(compat, name, version): def reformat_version(version): """Hack to reformat old versions ending on '-alpha' to match pip format.""" - if version.endswith('-alpha'): - return version.replace('-alpha', 'a0') - return version.replace('-alpha', 'a') + if version.endswith("-alpha"): + return version.replace("-alpha", "a0") + return version.replace("-alpha", "a") diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py deleted file mode 100644 index 62cf94d23..000000000 --- a/spacy/cli/vocab.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import plac -import json -import spacy -import numpy -from pathlib import Path - -from ..vectors import Vectors -from ..util import prints, ensure_path - - -@plac.annotations( - lang=("model language", "positional", None, str), - 
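All of the console output in validate() now goes through wasabi's Printer (pinned as wasabi>=0.0.8,<1.1.0 in requirements.txt). A small sketch of the calls this command relies on; the model name, version and installation path are placeholders:

    from wasabi import Printer

    msg = Printer()
    msg.divider("Installed models (spaCy v2.x)")
    msg.info("spaCy installation: /path/to/spacy")
    msg.good("Loaded compatibility table")
    msg.table([("package", "en_core_web_sm", "2.0.0")],
              header=("TYPE", "NAME", "VERSION"))
    # no_print=True returns the formatted string instead of printing it,
    # which is how get_model_row() colours individual table cells above.
    cell = msg.text("2.0.0", color="green", no_print=True)
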
output_dir=("model output directory", "positional", None, Path), - lexemes_loc=("location of JSONL-formatted lexical data", "positional", - None, Path), - vectors_loc=("optional: location of vectors data, as numpy .npz", - "positional", None, str), - prune_vectors=("optional: number of vectors to prune to.", - "option", "V", int) -) -def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1): - """Compile a vocabulary from a lexicon jsonl file and word vectors.""" - if not lexemes_loc.exists(): - prints(lexemes_loc, title="Can't find lexical data", exits=1) - vectors_loc = ensure_path(vectors_loc) - nlp = spacy.blank(lang) - for word in nlp.vocab: - word.rank = 0 - lex_added = 0 - with lexemes_loc.open() as file_: - for line in file_: - if line.strip(): - attrs = json.loads(line) - if 'settings' in attrs: - nlp.vocab.cfg.update(attrs['settings']) - else: - lex = nlp.vocab[attrs['orth']] - lex.set_attrs(**attrs) - assert lex.rank == attrs['id'] - lex_added += 1 - if vectors_loc is not None: - vector_data = numpy.load(vectors_loc.open('rb')) - nlp.vocab.vectors = Vectors(data=vector_data) - for word in nlp.vocab: - if word.rank: - nlp.vocab.vectors.add(word.orth, row=word.rank) - - if prune_vectors >= 1: - remap = nlp.vocab.prune_vectors(prune_vectors) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - vec_added = len(nlp.vocab.vectors) - prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir, - title="Sucessfully compiled vocab and vectors, and saved model") - return nlp diff --git a/spacy/compat.py b/spacy/compat.py index 1f47971ec..f00e2c417 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -5,7 +5,6 @@ import os import sys import ujson import itertools -import locale from thinc.neural.util import copy_array @@ -136,12 +135,3 @@ def import_file(name, loc): module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module - - -def locale_escape(string, errors="replace"): - """ - Mangle non-supported characters, for savages with ascii terminals. - """ - encoding = locale.getpreferredencoding() - string = string.encode(encoding, errors).decode("utf8") - return string diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 222f37464..9a4b4f5d8 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -5,7 +5,7 @@ from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span from ..compat import b_to_str from ..errors import Errors, Warnings, user_warning -from ..util import prints, is_in_jupyter +from ..util import is_in_jupyter _html = {} @@ -72,14 +72,12 @@ def serve( render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server("0.0.0.0", port, app) - prints( - "Using the '{}' visualizer".format(style), - title="Serving on port {}...".format(port), - ) + print("\nUsing the '{}' visualizer".format(style)) + print("Serving on port {}...\n".format(port)) try: httpd.serve_forever() except KeyboardInterrupt: - prints("Shutting down server on port {}.".format(port)) + print("Shutting down server on port {}.".format(port)) finally: httpd.server_close() diff --git a/spacy/errors.py b/spacy/errors.py index cede519c9..e1cc8a819 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -278,6 +278,12 @@ class Errors(object): E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. 
A token" " can only be part of one entity, so make sure the entities you're " "setting don't overlap.") + E104 = ("Can't find JSON schema for '{name}'.") + E105 = ("The Doc.print_tree() method is now deprecated. Please use " + "Doc.json() instead.") + E106 = ("Can't find doc._.{attr} attribute specified in the underscore " + "settings: {opts}") + E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 63d8e0733..0f25d7f53 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -15,7 +15,7 @@ import json import ujson -from . import _align +from . import _align from .syntax import nonproj from .tokens import Doc from .errors import Errors @@ -172,7 +172,7 @@ class GoldCorpus(object): def dev_tuples(self): locs = (self.tmp_dir / 'dev').iterdir() yield from self.read_tuples(locs, limit=self.limit) - + @property def train_tuples(self): locs = (self.tmp_dir / 'train').iterdir() @@ -271,6 +271,53 @@ def _corrupt(c, noise_level): return c.lower() +def read_json_object(json_corpus_section): + """Take a list of JSON-formatted documents (e.g. from an already loaded + training data file) and yield tuples in the GoldParse format. + + json_corpus_section (list): The data. + YIELDS (tuple): The reformatted data. + """ + for json_doc in json_corpus_section: + tuple_doc = json_to_tuple(json_doc) + for tuple_paragraph in tuple_doc: + yield tuple_paragraph + + +def json_to_tuple(doc): + """Convert an item in the JSON-formatted training data to the tuple format + used by GoldParse. + + doc (dict): One entry in the training data. + YIELDS (tuple): The reformatted data. + """ + paragraphs = [] + for paragraph in doc['paragraphs']: + sents = [] + for sent in paragraph['sentences']: + words = [] + ids = [] + tags = [] + heads = [] + labels = [] + ner = [] + for i, token in enumerate(sent['tokens']): + words.append(token['orth']) + ids.append(i) + tags.append(token.get('tag', '-')) + heads.append(token.get('head', 0) + i) + labels.append(token.get('dep', '')) + # Ensure ROOT label is case-insensitive + if labels[-1].lower() == 'root': + labels[-1] = 'ROOT' + ner.append(token.get('ner', '-')) + sents.append([ + [ids, words, tags, heads, labels, ner], + sent.get('brackets', [])]) + if sents: + yield [paragraph.get('raw', None), sents] + + def read_json_file(loc, docs_filter=None, limit=None): loc = util.ensure_path(loc) if loc.is_dir(): @@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None): for doc in _json_iterate(loc): if docs_filter is not None and not docs_filter(doc): continue - paragraphs = [] - for paragraph in doc['paragraphs']: - sents = [] - for sent in paragraph['sentences']: - words = [] - ids = [] - tags = [] - heads = [] - labels = [] - ner = [] - for i, token in enumerate(sent['tokens']): - words.append(token['orth']) - ids.append(i) - tags.append(token.get('tag', '-')) - heads.append(token.get('head', 0) + i) - labels.append(token.get('dep', '')) - # Ensure ROOT label is case-insensitive - if labels[-1].lower() == 'root': - labels[-1] = 'ROOT' - ner.append(token.get('ner', '-')) - sents.append([ - [ids, words, tags, heads, labels, ner], - sent.get('brackets', [])]) - if sents: - yield [paragraph.get('raw', None), sents] + for json_tuple in json_to_tuple(doc): + yield json_tuple def _json_iterate(loc): @@ -573,32 +597,19 @@ cdef class GoldParse: self.c.sent_start[i] = 0 -def docs_to_json(id, docs): - '''Convert a list of Doc objects into the JSON-serializable format used by - the spacy train command. 
Each Doc in the list will be interpreted as a - paragraph. - ''' +def docs_to_json(docs, underscore=None): + """Convert a list of Doc objects into the JSON-serializable format used by + the spacy train command. + + docs (iterable / Doc): The Doc object(s) to convert. + underscore (list): Optional list of string names of custom doc._. + attributes. Attribute values need to be JSON-serializable. Values will + be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. + RETURNS (list): The data in spaCy's JSON format. + """ if isinstance(docs, Doc): docs = [docs] - json_doc = {'id': id, 'paragraphs': []} - for i, doc in enumerate(docs): - json_para = {'raw': doc.text, 'sentences': []} - ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] - biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) - for j, sent in enumerate(doc.sents): - json_sent = {'tokens': [], 'brackets': []} - for token in sent: - json_token = {"id": token.i, "orth": token.text} - if doc.is_tagged: - json_token['tag'] = token.tag_ - if doc.is_parsed: - json_token['head'] = token.head.i-token.i - json_token['dep'] = token.dep_ - json_token['ner'] = biluo_tags[token.i] - json_sent['tokens'].append(json_token) - json_para['sentences'].append(json_sent) - json_doc['paragraphs'].append(json_para) - return json_doc + return [doc.to_json(underscore=underscore) for doc in docs] def biluo_tags_from_offsets(doc, entities, missing='O'): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce4083e8a..25c1df77b 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -341,21 +341,3 @@ def test_lowest_common_ancestor(en_tokenizer): assert lca[1, 1] == 1 assert lca[0, 1] == 2 assert lca[1, 2] == 2 - - -def test_parse_tree(en_tokenizer): - """Tests doc.print_tree() method.""" - text = "I like New York in Autumn." - heads = [1, 0, 1, -2, -3, -1, -5] - tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags) - # full method parse_tree(text) is a trivial composition - trees = doc.print_tree() - assert len(trees) > 0 - tree = trees[0] - assert all( - k in list(tree.keys()) - for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"] - ) - assert tree["word"] == "like" # check root is correct diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py new file mode 100644 index 000000000..1869d0918 --- /dev/null +++ b/spacy/tests/doc/test_to_json.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest +from spacy.cli.schemas import get_schema, validate_json +from spacy.tokens import Doc +from ..util import get_doc + + +@pytest.fixture() +def doc(en_vocab): + words = ["c", "d", "e"] + pos = ["VERB", "NOUN", "NOUN"] + tags = ["VBP", "NN", "NN"] + heads = [0, -1, -2] + deps = ["ROOT", "dobj", "dobj"] + ents = [(1, 2, "ORG")] + return get_doc( + en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents + ) + + +def test_doc_to_json(doc): + json_doc = doc.to_json() + assert json_doc["text"] == "c d e " + assert len(json_doc["tokens"]) == 3 + assert json_doc["tokens"][0]["pos"] == "VERB" + assert json_doc["tokens"][0]["tag"] == "VBP" + assert json_doc["tokens"][0]["dep"] == "ROOT" + assert len(json_doc["ents"]) == 1 + assert json_doc["ents"][0]["start"] == 2 # character offset! + assert json_doc["ents"][0]["end"] == 3 # character offset! 
+ assert json_doc["ents"][0]["label"] == "ORG" + + +def test_doc_to_json_underscore(doc): + Doc.set_extension("json_test1", default=False) + Doc.set_extension("json_test2", default=False) + doc._.json_test1 = "hello world" + doc._.json_test2 = [1, 2, 3] + json_doc = doc.to_json(underscore=["json_test1", "json_test2"]) + assert "_" in json_doc + assert json_doc["_"]["json_test1"] == "hello world" + assert json_doc["_"]["json_test2"] == [1, 2, 3] + + +def test_doc_to_json_underscore_error_attr(doc): + """Test that Doc.to_json() raises an error if a custom attribute doesn't + exist in the ._ space.""" + with pytest.raises(ValueError): + doc.to_json(underscore=["json_test3"]) + + +def test_doc_to_json_underscore_error_serialize(doc): + """Test that Doc.to_json() raises an error if a custom attribute value + isn't JSON-serializable.""" + Doc.set_extension("json_test4", method=lambda doc: doc.text) + with pytest.raises(ValueError): + doc.to_json(underscore=["json_test4"]) + + +def test_doc_to_json_valid_training(doc): + json_doc = doc.to_json() + errors = validate_json([json_doc], get_schema("training")) + assert not errors diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 1d01990bd..9ecd61465 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from spacy.matcher import PhraseMatcher from spacy.tokens import Doc - from ..util import get_doc diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d2fa7682e..7c230f469 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -2,9 +2,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import docs_to_json from spacy.tokens import Doc -from .util import get_doc def test_gold_biluo_U(en_vocab): @@ -52,34 +50,3 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer): assert biluo_tags_converted == biluo_tags offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) assert offsets_converted == offsets - - -def test_docs_to_json(en_vocab): - """Test we can convert a list of Doc objects into the JSON-serializable - format we use for training. 
- """ - docs = [ - get_doc( - en_vocab, - words=["a", "b"], - pos=["VBP", "NN"], - heads=[0, -1], - deps=["ROOT", "dobj"], - ents=[], - ), - get_doc( - en_vocab, - words=["c", "d", "e"], - pos=["VBP", "NN", "NN"], - heads=[0, -1, -2], - deps=["ROOT", "dobj", "dobj"], - ents=[(1, 2, "ORG")], - ), - ] - json_doc = docs_to_json(0, docs) - assert json_doc["id"] == 0 - assert len(json_doc["paragraphs"]) == 2 - assert len(json_doc["paragraphs"][0]["sentences"]) == 1 - assert len(json_doc["paragraphs"][1]["sentences"]) == 1 - assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2 - assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3 diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py new file mode 100644 index 000000000..2ddb39f20 --- /dev/null +++ b/spacy/tests/test_json_schemas.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from spacy.cli.schemas import validate_json, get_schema +import pytest + + +@pytest.fixture(scope="session") +def training_schema(): + return get_schema("training") + + +def test_json_schema_get(): + schema = get_schema("training") + assert schema + with pytest.raises(ValueError): + schema = get_schema("xxx") + + +@pytest.mark.parametrize( + "data", + [ + {"text": "Hello world"}, + {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}, + ], +) +def test_json_schema_training_valid(data, training_schema): + errors = validate_json([data], training_schema) + assert not errors + + +@pytest.mark.parametrize( + "data,n_errors", + [ + ({"spans": []}, 1), + ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2), + ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1), + ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1), + ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2), + ], +) +def test_json_schema_training_invalid(data, n_errors, training_schema): + errors = validate_json([data], training_schema) + assert len(errors) == n_errors diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 6aaf22fb8..f7f49cc0b 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - import pytest from pathlib import Path from spacy import util diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0b8267e09..b845b4eb7 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -20,7 +20,6 @@ from .span cimport Span from .token cimport Token from .span cimport Span from .token cimport Token -from .printers import parse_tree from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs import intify_attrs, IDS @@ -29,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport ENT_TYPE, SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t -from ..util import normalize_slice +from ..util import normalize_slice, is_json_serializable from ..compat import is_config, copy_reg, pickle, basestring_ from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings @@ -959,31 +958,48 @@ cdef class Doc: return self[start] def print_tree(self, light=False, flat=False): - """Returns the parse trees in JSON (dict) format. + raise ValueError(Errors.E105) - light (bool): Don't include lemmas or entities. 
- flat (bool): Don't include arcs or modifiers. - RETURNS (dict): Parse tree as dict. + def to_json(self, underscore=None): + """Convert a Doc to JSON. Produces the same format used by the spacy + train command. - EXAMPLE: - >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') - >>> trees = doc.print_tree() - >>> trees[1] - {'modifiers': [ - {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', - 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', - 'lemma': 'Alice'}, - {'modifiers': [ - {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', - 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], - 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', - 'POS_fine': 'NN', 'lemma': 'pizza'}, - {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', - 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], - 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', - 'POS_fine': 'VBD', 'lemma': 'eat'} + underscore (list): Optional list of string names of custom doc._. + attributes. Attribute values need to be JSON-serializable. Values will + be added to an "_" key in the data, e.g. "_": {"foo": "bar"}. + RETURNS (dict): The data in spaCy's JSON format. """ - return parse_tree(self, light=light, flat=flat) + data = {'text': self.text} + data['ents'] = [{'start': ent.start_char, 'end': ent.end_char, + 'label': ent.label_} for ent in self.ents] + sents = list(self.sents) + if sents: + data['sents'] = [{'start': sent.start_char, 'end': sent.end_char} + for sent in sents] + if self.cats: + data['cats'] = self.cats + data['tokens'] = [] + for token in self: + token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)} + if token.pos_: + token_data['pos'] = token.pos_ + if token.tag_: + token_data['tag'] = token.tag_ + if token.dep_: + token_data['dep'] = token.dep_ + if token.head: + token_data['head'] = token.head.i + data['tokens'].append(token_data) + if underscore: + data['_'] = {} + for attr in underscore: + if not self.has_extension(attr): + raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) + value = self._.get(attr) + if not is_json_serializable(value): + raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) + data['_'][attr] = value + return data cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: diff --git a/spacy/tokens/printers.py b/spacy/tokens/printers.py deleted file mode 100644 index c7ae88f3d..000000000 --- a/spacy/tokens/printers.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .doc import Doc -from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE - - -def merge_ents(doc): - """Helper: merge adjacent entities into single tokens; modifies the doc.""" - for ent in doc.ents: - ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_) - return doc - - -def format_POS(token, light, flat): - """Helper: form the POS output for a token.""" - subtree = dict([ - ("word", token.text), - ("lemma", token.lemma_), # trigger - ("NE", token.ent_type_), # trigger - ("POS_fine", token.tag_), - ("POS_coarse", token.pos_), - ("arc", token.dep_), - ("modifiers", []) - ]) - if light: - subtree.pop("lemma") - subtree.pop("NE") - if flat: - subtree.pop("arc") - subtree.pop("modifiers") - return subtree - - -def POS_tree(root, light=False, flat=False): - """Helper: generate a POS tree for a root token. The doc must have - `merge_ents(doc)` ran on it. 
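With print_tree() now raising E105, the replacement path is Doc.to_json(). A short sketch, assuming an installed English model; the model name and extension attribute below are only examples:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")  # assumption: any English model works here
doc = nlp("Apple is looking at buying a U.K. startup.")

# Basic conversion: text, ents, sents, cats and per-token annotations.
data = doc.to_json()
print(data["ents"])       # e.g. [{'start': 0, 'end': 5, 'label': 'ORG'}, ...]
print(data["tokens"][0])  # e.g. {'id': 0, 'start': 0, 'end': 5, 'pos': 'PROPN', ...}

# Custom doc._. attributes can be included via `underscore`; values must be
# JSON-serializable (otherwise E107), and unknown attribute names raise E106.
Doc.set_extension("my_attr", default=None)
doc._.my_attr = {"source": "example"}
data = doc.to_json(underscore=["my_attr"])
print(data["_"]["my_attr"])  # {'source': 'example'}
```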
- """ - subtree = format_POS(root, light=light, flat=flat) - for c in root.children: - subtree["modifiers"].append(POS_tree(c)) - return subtree - - -def parse_tree(doc, light=False, flat=False): - """Make a copy of the doc and construct a syntactic parse tree similar to - displaCy. Generates the POS tree for all sentences in a doc. - - doc (Doc): The doc for parsing. - RETURNS (dict): The parse tree. - - EXAMPLE: - >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') - >>> trees = doc.print_tree() - >>> trees[1] - {'modifiers': [ - {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', - 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, - {'modifiers': [ - {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', - 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], - 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', - 'POS_fine': 'NN', 'lemma': 'pizza'}, - {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', - 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], - 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', - 'POS_fine': 'VBD', 'lemma': 'eat'} - """ - doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) - doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], - doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE])) - merge_ents(doc_clone) # merge the entities into single tokens first - return [POS_tree(sent.root, light=light, flat=flat) - for sent in doc_clone.sents] diff --git a/spacy/util.py b/spacy/util.py index d333d8712..a070e3045 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,8 +7,6 @@ import pkg_resources import importlib import regex as re from pathlib import Path -import sys -import textwrap import random from collections import OrderedDict from thinc.neural._classes.model import Model @@ -18,9 +16,10 @@ import cytoolz import itertools import numpy.random + from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ -from .compat import import_file +from .compat import import_file, json_dumps from .errors import Errors # Import these directly from Thinc, so that we're sure we always have the @@ -541,6 +540,16 @@ def read_json(location): return ujson.load(f) +def write_json(file_path, contents): + """Create a .json file and dump contents. + + file_path (unicode / Path): The path to the output file. + contents: The JSON-serializable contents to output. + """ + with Path(file_path).open("w", encoding="utf8") as f: + f.write(json_dumps(contents)) + + def read_jsonl(file_path): """Read a .jsonl file and yield its contents line by line. @@ -555,6 +564,29 @@ def read_jsonl(file_path): continue +def write_jsonl(file_path, lines): + """Create a .jsonl file and dump contents. + + file_path (unicode / Path): The path to the output file. + lines (list): The JSON-serializable contents of each line. + """ + data = [json_dumps(line) for line in lines] + with Path(file_path).open("w", encoding="utf-8") as f: + f.write("\n".join(data)) + + +def is_json_serializable(obj): + """Check if a Python object is JSON-serializable.""" + if hasattr(obj, "__call__"): + # Check this separately here to prevent infinite recursions + return False + try: + ujson.dumps(obj) + return True + except TypeError: + return False + + def get_raw_input(description, default=False): """Get user input from the command line via raw_input / input. @@ -602,21 +634,6 @@ def from_disk(path, readers, exclude): return path -def print_table(data, title=None): - """Print data in table format. 
- - data (dict or list of tuples): Label/value pairs. - title (unicode or None): Title, will be printed above. - """ - if isinstance(data, dict): - data = list(data.items()) - tpl_row = " {:<15}" * len(data[0]) - table = "\n".join([tpl_row.format(l, unicode_(v)) for l, v in data]) - if title: - print("\n \033[93m{}\033[0m".format(title)) - print("\n{}\n".format(table)) - - def print_markdown(data, title=None): """Print data in GitHub-flavoured Markdown format for issues etc. @@ -638,44 +655,6 @@ def print_markdown(data, title=None): print("\n{}\n".format("\n".join(markdown))) -def prints(*texts, **kwargs): - """Print formatted message (manual ANSI escape sequences to avoid - dependency) - - *texts (unicode): Texts to print. Each argument is rendered as paragraph. - **kwargs: 'title' becomes coloured headline. exits=True performs sys exit. - """ - exits = kwargs.get("exits", None) - title = kwargs.get("title", None) - title = "\033[93m{}\033[0m\n".format(_wrap(title)) if title else "" - message = "\n\n".join([_wrap(text) for text in texts]) - print("\n{}{}\n".format(title, message)) - if exits is not None: - sys.exit(exits) - - -def _wrap(text, wrap_max=80, indent=4): - """Wrap text at given width using textwrap module. - - text (unicode): Text to wrap. If it's a Path, it's converted to string. - wrap_max (int): Maximum line length (indent is deducted). - indent (int): Number of spaces for indentation. - RETURNS (unicode): Wrapped text. - """ - indent = indent * " " - wrap_width = wrap_max - len(indent) - if isinstance(text, Path): - text = path2str(text) - return textwrap.fill( - text, - width=wrap_width, - initial_indent=indent, - subsequent_indent=indent, - break_long_words=False, - break_on_hyphens=False, - ) - - def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. Disclaimer: NOT a general-purpose solution, only removes indentation and diff --git a/website/api/_top-level/_util.jade b/website/api/_top-level/_util.jade index 70999692d..e2204ae6f 100644 --- a/website/api/_top-level/_util.jade +++ b/website/api/_top-level/_util.jade @@ -320,37 +320,6 @@ p +cell dict +cell Combined tokenizer exceptions. - -+h(3, "util.prints") util.prints - +tag function - +tag-new(2) - -p - | Print a formatted, text-wrapped message with optional title. If a text - | argument is a #[code Path], it's converted to a string. Should only - | be used for interactive components like the command-line interface. - -+aside-code("Example"). - data_path = Path('/some/path') - if not path.exists(): - util.prints("Can't find the path.", data_path, - title="Error", exits=1) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code *texts] - +cell unicode - +cell Texts to print. Each argument is rendered as paragraph. - - +row - +cell #[code **kwargs] - +cell - - +cell - | #[code title] is rendered as coloured headline. #[code exits] - | performs system exit after printing, using the value of the - | argument as the exit code, e.g. #[code exits=1]. - - +h(3, "util.minibatch") util.minibatch +tag function +tag-new(2) diff --git a/website/api/cli.jade b/website/api/cli.jade index 28c76c09c..8083ae06a 100644 --- a/website/api/cli.jade +++ b/website/api/cli.jade @@ -257,10 +257,19 @@ p | to allow packaging the model using the | #[+api("cli#package") #[code package]] command. 
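With util.prints() and util.print_table() removed, CLI output now goes through wasabi instead. A rough sketch of the equivalent calls, assuming wasabi's 0.x Printer API:

```python
from wasabi import Printer

msg = Printer()

# Roughly what util.prints("Some text", title="Some title") used to do:
msg.info("Some title", "Some text")

# Roughly what util.prints("Can't find the path.", title="Error", exits=1) did
# (commented out so the sketch doesn't exit when run):
# msg.fail("Can't find the path.", exits=1)

# Roughly what util.print_table(data) did:
msg.table({"lang": "en", "pipeline": "tagger, parser, ner"})
```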
++infobox("Changed in v2.1", "⚠️") + | As of spaCy 2.1, the #[code --no-tagger], #[code --no-parser] and + | #[code --no-parser] flags have been replaced by a #[code --pipeline] + | option, which lets you define comma-separated names of pipeline + | components to train. For example, #[code --pipeline tagger,parser] will + | only train the tagger and parser. + +code(false, "bash", "$", false, false, true). - python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] - [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] - [--no-entities] [--gold-preproc] [--verbose] + python -m spacy train [lang] [output_path] [train_path] [dev_path] + [--base-model] [--pipeline] [--vectors] [--n-iter] [--n-examples] [--use-gpu] + [--version] [--meta-path] [--init-tok2vec] [--parser-multitasks] + [--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens] + [--verbose] +table(["Argument", "Type", "Description"]) +row @@ -269,34 +278,34 @@ p +cell Model language. +row - +cell #[code output_dir] + +cell #[code output_path] +cell positional - +cell Directory to store model in. + +cell Directory to store model in. Will be created if it doesn't exist. +row - +cell #[code train_data] + +cell #[code train_path] +cell positional +cell Location of JSON-formatted training data. +row - +cell #[code dev_data] + +cell #[code dev_path] +cell positional +cell Location of JSON-formatted development data for evaluation. +row - +cell #[code --n-iter], #[code -n] + +cell #[code --base-model], #[code -b] +cell option - +cell Number of iterations (default: #[code 30]). + +cell + | Optional name of base model to update. Can be any loadable + | spaCy model. +row - +cell #[code --n-sents], #[code -ns] + +cell #[code --pipeline], #[code -p] + +tag-new("2.1.0") +cell option - +cell Number of sentences (default: #[code 0]). - - +row - +cell #[code --use-gpu], #[code -g] - +cell option - +cell Use GPU. + +cell + | Comma-separated names of pipeline components to train. Defaults + | to #[code 'tagger,parser,ner']. +row +cell #[code --vectors], #[code -v] @@ -304,13 +313,21 @@ p +cell Model to load vectors from. +row - +cell #[code --meta-path], #[code -m] + +cell #[code --n-iter], #[code -n] + +cell option + +cell Number of iterations (default: #[code 30]). + + +row + +cell #[code --n-examples], #[code -ns] + +cell option + +cell Number of examples to use (defaults to #[code 0] for all examples). + + +row + +cell #[code --use-gpu], #[code -g] +cell option +cell - | #[+tag-new(2)] Optional path to model - | #[+a("/usage/training#models-generating") #[code meta.json]]. - | All relevant properties like #[code lang], #[code pipeline] and - | #[code spacy_version] will be overwritten. + | Whether to use GPU. Can be either #[code 0], #[code 1] or + | #[code -1]. +row +cell #[code --version], #[code -V] @@ -320,40 +337,69 @@ p | #[code meta.json] after training. +row - +cell #[code --no-tagger], #[code -T] - +cell flag - +cell Don't train tagger. + +cell #[code --meta-path], #[code -m] + +tag-new(2) + +cell option + +cell + | Optional path to model + | #[+a("/usage/training#models-generating") #[code meta.json]]. + | All relevant properties like #[code lang], #[code pipeline] and + | #[code spacy_version] will be overwritten. +row - +cell #[code --no-parser], #[code -P] - +cell flag - +cell Don't train parser. + +cell #[code --init-tok2vec], #[code -t2v] + +tag-new("2.1.0") + +cell option + +cell + | Path to pretrained weights for the token-to-vector parts of the + | models. 
See #[code spacy pretrain]. Experimental.
 
   +row
-    +cell #[code --no-entities], #[code -N]
-    +cell flag
-    +cell Don't train NER.
+    +cell #[code --parser-multitasks], #[code -pt]
+    +cell option
+    +cell
+      | Side objectives for parser CNN, e.g. #[code 'dep'] or
+      | #[code 'dep,tag']
+
+  +row
+    +cell #[code --entity-multitasks], #[code -et]
+    +cell option
+    +cell
+      | Side objectives for NER CNN, e.g. #[code 'dep'] or
+      | #[code 'dep,tag']
+
+  +row
+    +cell #[code --noise-level], #[code -nl]
+    +cell option
+    +cell Float indicating the amount of corruption for data augmentation.
 
   +row
     +cell #[code --gold-preproc], #[code -G]
     +cell flag
     +cell Use gold preprocessing.
 
+  +row
+    +cell #[code --learn-tokens], #[code -T]
+    +cell flag
+    +cell
+      | Make parser learn gold-standard tokenization by merging
+      | subtokens. Typically used for languages like Chinese.
+
+  +row
+    +cell #[code --verbose], #[code -VV]
+    +tag-new("2.0.13")
+    +cell flag
+    +cell Show more detailed messages during training.
+
   +row
     +cell #[code --help], #[code -h]
     +cell flag
     +cell Show help message and available arguments.
 
-  +row
-    +cell #[code --verbose]
-    +tag-new("2.0.13")
-    +cell flag
-    +cell Show more detail message during training.
-
   +row("foot")
     +cell creates
     +cell model, pickle
-    +cell A spaCy model on each epoch, and a final #[code .pickle] file.
+    +cell A spaCy model on each epoch.
 
 +h(4, "train-hyperparams") Environment variables for hyperparameters
   +tag-new(2)
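The hyperparameters documented below are picked up from environment variables inside the train command via util.env_opt, roughly as in the sketch here; the variable names and defaults shown are illustrative, not a complete list:

```python
from spacy import util

# Hypothetical excerpt of how spacy train reads its hyperparameters: each
# value falls back to a default unless an environment variable with the same
# name is set, e.g. `dropout_from=0.5 python -m spacy train ...`.
dropout_rates = util.decaying(
    util.env_opt("dropout_from", 0.2),
    util.env_opt("dropout_to", 0.2),
    util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
    util.env_opt("batch_from", 100),
    util.env_opt("batch_to", 1000),
    util.env_opt("batch_compound", 1.001),
)
```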