Merge branch 'master' of https://github.com/explosion/spaCy

2025-08-04 04:10:20 +03:00 · 2017-03-26 09:26:59 -05:00 · 2017-03-26 09:26:59 -05:00 · 92ac3af21d
commit 92ac3af21d
parent a9b1f23c7d c9a95d55fd
4 changed files with 106 additions and 23 deletions
--- a/README.rst
+++ b/README.rst
@@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on  the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
 English and German,  as well as tokenization for Chinese, Spanish, Italian, French,
-Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial  open-source
-software, released under the MIT license.
+Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial 
+open-source software, released under the MIT license.

 💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

--- a/spacy/main.py
+++ b/spacy/main.py
@@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
-from spacy.cli import train_config as cli_train_config


 class CLI(object):
@@ -77,36 +76,29 @@ class CLI(object):


    @plac.annotations(
-        lang=("language", "positional", None, str),
-        output_dir=("output directory", "positional", None, str),
-        train_data=("training data", "positional", None, str),
-        dev_data=("development data", "positional", None, str),
+        lang=("model language", "positional", None, str),
+        output_dir=("output directory to store model in", "positional", None, str),
+        train_data=("location of JSON-formatted training data", "positional", None, str),
+        dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
        n_iter=("number of iterations", "option", "n", int),
        parser_L1=("L1 regularization penalty for parser", "option", "L", float),
        no_tagger=("Don't train tagger", "flag", "T", bool),
        no_parser=("Don't train parser", "flag", "P", bool),
        no_ner=("Don't train NER", "flag", "N", bool)
    )
-    def train(self, lang, output_dir, train_data, dev_data, n_iter=15,
-              parser_L1=0.0,
-              no_tagger=False, no_parser=False, no_ner=False):
-        """Train a model."""
-        cli_train(lang, output_dir, train_data, dev_data, n_iter,
-                  not no_tagger, not no_parser, not no_ner,
-                  parser_L1)
+    def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
+              parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
+        """
+        Train a model. Expects data in spaCy's JSON format.
+        """

-
-    @plac.annotations(
-        config=("config", "positional", None, str),
-    )
-    def train_config(self, config):
-        """Train a model from config file."""
-
-        cli_train_config(config)
+        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
+                  not no_parser, not no_ner, parser_L1)


    def __missing__(self, name):
-        print("\n   Command %r does not exist\n" % name)
+        print("\n   Command %r does not exist."
+              "\n   Use the --help flag for a list of available commands.\n" % name)


 if __name__ == '__main__':
--- a/website/docs/api/annotation.jade
+++ b/website/docs/api/annotation.jade
@@ -79,3 +79,33 @@ p
 +h(2, "named-entities") Named Entity Recognition

 include _annotation/_named-entities
+
++h(2, "json-input") JSON input format for training
+
+p
+    |  spaCy takes training data in the following format:
+
++code("Example structure").
+    doc: {
+        id: string,
+        paragraphs: [{
+            raw: string,
+            sents: [int],
+            tokens: [{
+                start: int,
+                tag: string,
+                head: int,
+                dep: string
+            }],
+            ner: [{
+                start: int,
+                end: int,
+                label: string
+            }],
+            brackets: [{
+                start: int,
+                end: int,
+                label: string
+            }]
+        }]
+    }
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@@ -138,3 +138,64 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
+
++h(2, "train") Train
+    +tag experimental
+
+p
+    |  Train a model. Expects data in spaCy's
+    |  #[+a("/docs/api/annotation#json-input") JSON format].
+
++code(false, "bash").
+    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code lang]
+        +cell positional
+        +cell Model language.
+
+    +row
+        +cell #[code output_dir]
+        +cell positional
+        +cell Directory to store model in.
+
+    +row
+        +cell #[code train_data]
+        +cell positional
+        +cell Location of JSON-formatted training data.
+
+    +row
+        +cell #[code dev_data]
+        +cell positional
+        +cell Location of JSON-formatted dev data (optional).
+
+    +row
+        +cell #[code --n_iter], #[code -n]
+        +cell option
+        +cell Number of iterations (default: #[code 15]).
+
+    +row
+        +cell #[code --parser_L1], #[code -L]
+        +cell option
+        +cell L1 regularization penalty for parser (default: #[code 0.0]).
+
+    +row
+        +cell #[code --no_tagger], #[code -T]
+        +cell flag
+        +cell Don't train tagger.
+
+    +row
+        +cell #[code --no_parser], #[code -P]
+        +cell flag
+        +cell Don't train parser.
+
+    +row
+        +cell #[code --no_ner], #[code -N]
+        +cell flag
+        +cell Don't train NER.
+
+    +row
+        +cell #[code --help], #[code -h]
+        +cell flag
+        +cell Show help message and available arguments.