diff --git a/README.rst b/README.rst index 1ef3b136d..a00208036 100644 --- a/README.rst +++ b/README.rst @@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports English and German, as well as tokenization for Chinese, Spanish, Italian, French, -Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial open-source -software, released under the MIT license. +Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial +open-source software, released under the MIT license. 💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases>`_ diff --git a/spacy/__main__.py b/spacy/__main__.py index fedf3166b..6e96893ea 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -9,7 +9,6 @@ from spacy.cli import link as cli_link from spacy.cli import info as cli_info from spacy.cli import package as cli_package from spacy.cli import train as cli_train -from spacy.cli import train_config as cli_train_config class CLI(object): @@ -77,36 +76,29 @@ class CLI(object): @plac.annotations( - lang=("language", "positional", None, str), - output_dir=("output directory", "positional", None, str), - train_data=("training data", "positional", None, str), - dev_data=("development data", "positional", None, str), + lang=("model language", "positional", None, str), + output_dir=("output directory to store model in", "positional", None, str), + train_data=("location of JSON-formatted training data", "positional", None, str), + dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), n_iter=("number of iterations", "option", "n", int), parser_L1=("L1 regularization penalty for parser", "option", "L", float), no_tagger=("Don't train tagger", "flag", "T", bool), no_parser=("Don't train parser", "flag", "P", bool), no_ner=("Don't 
train NER", "flag", "N", bool) ) - def train(self, lang, output_dir, train_data, dev_data, n_iter=15, - parser_L1=0.0, - no_tagger=False, no_parser=False, no_ner=False): - """Train a model.""" - cli_train(lang, output_dir, train_data, dev_data, n_iter, - not no_tagger, not no_parser, not no_ner, - parser_L1) + def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15, + parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False): + """ + Train a model. Expects data in spaCy's JSON format. + """ - - @plac.annotations( - config=("config", "positional", None, str), - ) - def train_config(self, config): - """Train a model from config file.""" - - cli_train_config(config) + cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, + not no_parser, not no_ner, parser_L1) def __missing__(self, name): - print("\n Command %r does not exist\n" % name) + print("\n Command %r does not exist." + "\n Use the --help flag for a list of available commands.\n" % name) if __name__ == '__main__': diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 93511899b..adc6b28f7 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -79,3 +79,33 @@ p +h(2, "named-entities") Named Entity Recognition include _annotation/_named-entities + ++h(2, "json-input") JSON input format for training + +p + | spaCy takes training data in the following format: + ++code("Example structure"). 
+ doc: { + id: string, + paragraphs: [{ + raw: string, + sents: [int], + tokens: [{ + start: int, + tag: string, + head: int, + dep: string + }], + ner: [{ + start: int, + end: int, + label: string + }], + brackets: [{ + start: int, + end: int, + label: string + }] + }] + } diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index cc07c18ea..74d6554b0 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -138,3 +138,64 @@ p +cell #[code --help], #[code -h] +cell flag +cell Show help message and available arguments. + ++h(2, "train") Train + +tag experimental + +p + | Train a model. Expects data in spaCy's + | #[+a("/docs/api/annotation#json-input") JSON format]. + ++code(false, "bash"). + python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code lang] + +cell positional + +cell Model language. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to store model in. + + +row + +cell #[code train_data] + +cell positional + +cell Location of JSON-formatted training data. + + +row + +cell #[code dev_data] + +cell positional + +cell Location of JSON-formatted dev data (optional). + + +row + +cell #[code --n-iter], #[code -n] + +cell option + +cell Number of iterations (default: #[code 15]). + + +row + +cell #[code --parser-L1], #[code -L] + +cell option + +cell L1 regularization penalty for parser (default: #[code 0.0]). + + +row + +cell #[code --no-tagger], #[code -T] + +cell flag + +cell Don't train tagger. + + +row + +cell #[code --no-parser], #[code -P] + +cell flag + +cell Don't train parser. + + +row + +cell #[code --no-ner], #[code -N] + +cell flag + +cell Don't train NER. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments.