Merge branch 'develop'

2026-01-09 18:21:14 +03:00 · 2017-03-26 15:57:00 +02:00 · 2017-03-26 15:57:00 +02:00 · c00d997924
commit c00d997924
parent 4731254ca5 13df2d6a60
3 changed files with 104 additions and 21 deletions
--- a/spacy/main.py
+++ b/spacy/main.py
@@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
-from spacy.cli import train_config as cli_train_config


 class CLI(object):
@@ -77,36 +76,29 @@ class CLI(object):


    @plac.annotations(
-        lang=("language", "positional", None, str),
-        output_dir=("output directory", "positional", None, str),
-        train_data=("training data", "positional", None, str),
-        dev_data=("development data", "positional", None, str),
+        lang=("model language", "positional", None, str),
+        output_dir=("output directory to store model in", "positional", None, str),
+        train_data=("location of JSON-formatted training data", "positional", None, str),
+        dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
        n_iter=("number of iterations", "option", "n", int),
        parser_L1=("L1 regularization penalty for parser", "option", "L", float),
        no_tagger=("Don't train tagger", "flag", "T", bool),
        no_parser=("Don't train parser", "flag", "P", bool),
        no_ner=("Don't train NER", "flag", "N", bool)
    )
-    def train(self, lang, output_dir, train_data, dev_data, n_iter=15,
-              parser_L1=0.0,
-              no_tagger=False, no_parser=False, no_ner=False):
-        """Train a model."""
-        cli_train(lang, output_dir, train_data, dev_data, n_iter,
-                  not no_tagger, not no_parser, not no_ner,
-                  parser_L1)
+    def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
+              parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
+        """
+        Train a model. Expects data in spaCy's JSON format.
+        """

-
-    @plac.annotations(
-        config=("config", "positional", None, str),
-    )
-    def train_config(self, config):
-        """Train a model from config file."""
-
-        cli_train_config(config)
+        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
+                  not no_parser, not no_ner, parser_L1)


    def __missing__(self, name):
-        print("\n   Command %r does not exist\n" % name)
+        print("\n   Command %r does not exist."
+              "\n   Use the --help flag for a list of available commands.\n" % name)


 if __name__ == '__main__':
--- a/website/docs/api/annotation.jade
+++ b/website/docs/api/annotation.jade
@@ -79,3 +79,33 @@ p
 +h(2, "named-entities") Named Entity Recognition

 include _annotation/_named-entities
+
++h(2, "json-input") JSON input format for training
+
+p
+    |  spaCy takes training data in the following format:
+
++code("Example structure").
+    doc: {
+        id: string,
+        paragraphs: [{
+            raw: string,
+            sents: [int],
+            tokens: [{
+                start: int,
+                tag: string,
+                head: int,
+                dep: string
+            }],
+            ner: [{
+                start: int,
+                end: int,
+                label: string
+            }],
+            brackets: [{
+                start: int,
+                end: int,
+                label: string
+            }]
+        }]
+    }
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@@ -138,3 +138,64 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
+
++h(2, "train") Train
+    +tag experimental
+
+p
+    |  Train a model. Expects data in spaCy's
+    |  #[+a("/docs/api/annotation#json-input") JSON format].
+
++code(false, "bash").
+    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code lang]
+        +cell positional
+        +cell Model language.
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Directory to store model in.
+
+    +row
+        +cell #[code train_data]
+        +cell positional
+        +cell Location of JSON-formatted training data.
+
+    +row
+        +cell #[code dev_data]
+        +cell positional
+        +cell Location of JSON-formatted dev data (optional).
+
+    +row
+        +cell #[code --n_iter], #[code -n]
+        +cell option
+        +cell Number of iterations (default: #[code 15]).
+
+    +row
+        +cell #[code --parser_L1], #[code -L]
+        +cell option
+        +cell L1 regularization penalty for parser (default: #[code 0.0]).
+
+    +row
+        +cell #[code --no_tagger], #[code -T]
+        +cell flag
+        +cell Don't train tagger.
+
+    +row
+        +cell #[code --no_parser], #[code -P]
+        +cell flag
+        +cell Don't train parser.
+
+    +row
+        +cell #[code --no_ner], #[code -N]
+        +cell flag
+        +cell Don't train NER.
+
+    +row
+        +cell #[code --help], #[code -h]
+        +cell flag
+        +cell Show help message and available arguments.