From 5901c8f7f0f4100b54f730d57522e7893ff748bd Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 15:33:48 +0200 Subject: [PATCH 1/5] Update spacy train CLI documentation --- spacy/__main__.py | 22 ++++++++------ website/docs/usage/cli.jade | 60 +++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index fedf3166b..c41c85633 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -77,20 +77,22 @@ class CLI(object): @plac.annotations( - lang=("language", "positional", None, str), - output_dir=("output directory", "positional", None, str), - train_data=("training data", "positional", None, str), - dev_data=("development data", "positional", None, str), + lang=("model language", "positional", None, str), + output_dir=("output directory to store model in", "positional", None, str), + train_data=("location of JSON-formatted training data", "positional", None, str), + dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), n_iter=("number of iterations", "option", "n", int), parser_L1=("L1 regularization penalty for parser", "option", "L", float), no_tagger=("Don't train tagger", "flag", "T", bool), no_parser=("Don't train parser", "flag", "P", bool), no_ner=("Don't train NER", "flag", "N", bool) ) - def train(self, lang, output_dir, train_data, dev_data, n_iter=15, - parser_L1=0.0, - no_tagger=False, no_parser=False, no_ner=False): - """Train a model.""" + def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15, + parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False): + """ + Train a model. Expects data in spaCy's JSON format. + """ + cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, not no_parser, not no_ner, parser_L1) @@ -100,7 +102,9 @@ class CLI(object): config=("config", "positional", None, str), ) def train_config(self, config): - """Train a model from config file.""" + """ + Train a model from config file. + """ cli_train_config(config) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index cc07c18ea..ca33e4e40 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -138,3 +138,63 @@ p +cell #[code --help], #[code -h] +cell flag +cell Show help message and available arguments. + ++h(2, "train") Train + +tag experimental + +p + | Train a model. Expects data in spaCy's JSON format. + ++code(false, "bash"). + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code lang] + +cell positional + +cell Model language. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to store model in. + + +row + +cell #[code train_data] + +cell positional + +cell Location of JSON-formatted training data. + + +row + +cell #[code dev_data] + +cell positional + +cell Location of JSON-formatted dev data (optional). + + +row + +cell #[code --n_iter], #[code -n] + +cell option + +cell Number of iterations (default: #[code 15]). + + +row + +cell #[code --parser_L1], #[code -L] + +cell option + +cell L1 regularization penalty for parser (default: #[code 0.0]). + + +row + +cell #[code --no_tagger], #[code -T] + +cell flag + +cell Don't train tagger. + + +row + +cell #[code --no_parser], #[code -P] + +cell flag + +cell Don't train parser. + + +row + +cell #[code --no_ner], #[code -N] + +cell flag + +cell Don't train NER. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. From 7f95023fc034c60e0cab117b61f34fa0d3afbaa4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 15:37:37 +0200 Subject: [PATCH 2/5] Fix formatting --- spacy/__main__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index c41c85633..34f5c96ab 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -93,9 +93,8 @@ class CLI(object): Train a model. Expects data in spaCy's JSON format. """ - cli_train(lang, output_dir, train_data, dev_data, n_iter, - not no_tagger, not no_parser, not no_ner, - parser_L1) + cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, + not no_parser, not no_ner, parser_L1) @plac.annotations( From b297fab062c9655012891f8b1c3da95bec04fc76 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 15:40:02 +0200 Subject: [PATCH 3/5] Update error message for missing commands --- spacy/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 34f5c96ab..598635a9b 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -109,7 +109,8 @@ class CLI(object): def __missing__(self, name): - print("\n Command %r does not exist\n" % name) + print("\n Command %r does not exist." + "\n Use the --help flag for a list of available commands.\n" % name) if __name__ == '__main__': From 007a2492bdd74e11e802ff2d3e9457287381e889 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 15:40:50 +0200 Subject: [PATCH 4/5] Remove train_config command for now --- spacy/__main__.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 598635a9b..6e96893ea 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -9,7 +9,6 @@ from spacy.cli import link as cli_link from spacy.cli import info as cli_info from spacy.cli import package as cli_package from spacy.cli import train as cli_train -from spacy.cli import train_config as cli_train_config class CLI(object): @@ -97,17 +96,6 @@ class CLI(object): not no_parser, not no_ner, parser_L1) - @plac.annotations( - config=("config", "positional", None, str), - ) - def train_config(self, config): - """ - Train a model from config file. - """ - - cli_train_config(config) - - def __missing__(self, name): print("\n Command %r does not exist." "\n Use the --help flag for a list of available commands.\n" % name) From 13df2d6a60645d0aaf5dea14d8ae4e9e7b3e9ed2 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 15:56:15 +0200 Subject: [PATCH 5/5] Add documentation for spaCy's JSON format --- website/docs/api/annotation.jade | 30 ++++++++++++++++++++++++++++++ website/docs/usage/cli.jade | 3 ++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 93511899b..adc6b28f7 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -79,3 +79,33 @@ p +h(2, "named-entities") Named Entity Recognition include _annotation/_named-entities + ++h(2, "json-input") JSON input format for training + +p + | spaCy takes training data in the following format: + ++code("Example structure"). + doc: { + id: string, + paragraphs: [{ + raw: string, + sents: [int], + tokens: [{ + start: int, + tag: string, + head: int, + dep: string + }], + ner: [{ + start: int, + end: int, + label: string + }], + brackets: [{ + start: int, + end: int, + label: string + }] + }] + } diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index ca33e4e40..74d6554b0 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -143,7 +143,8 @@ p +tag experimental p - | Train a model. Expects data in spaCy's JSON format. + | Train a model. Expects data in spaCy's + | #[+a("/docs/api/annotation#json-input") JSON format]. +code(false, "bash"). python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]