Merge branch 'master' of https://github.com/explosion/spaCy

2025-11-07 19:37:38 +03:00 · 2017-03-26 09:26:59 -05:00 · 2017-03-26 09:26:59 -05:00 · 92ac3af21d
commit 92ac3af21d
parent a9b1f23c7d c9a95d55fd
4 changed files with 106 additions and 23 deletions
--- a/README.rst
+++ b/README.rst
@@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and
 Cython. spaCy is built on  the very latest research, but it isn't researchware.
 It was designed from day one to be used in real products. spaCy currently supports
 English and German,  as well as tokenization for Chinese, Spanish, Italian, French,
-Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial  open-source
+Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial 
-software, released under the MIT license.
+open-source software, released under the MIT license.
 💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
--- a/spacy/main.py
+++ b/spacy/main.py
@@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
 from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
 from spacy.cli import train_config as cli_train_config
 class CLI(object):
@@ -77,36 +76,29 @@ class CLI(object):
    @plac.annotations(
-        lang=("language", "positional", None, str),
+        lang=("model language", "positional", None, str),
-        output_dir=("output directory", "positional", None, str),
+        output_dir=("output directory to store model in", "positional", None, str),
-        train_data=("training data", "positional", None, str),
+        train_data=("location of JSON-formatted training data", "positional", None, str),
-        dev_data=("development data", "positional", None, str),
+        dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
        n_iter=("number of iterations", "option", "n", int),
        parser_L1=("L1 regularization penalty for parser", "option", "L", float),
        no_tagger=("Don't train tagger", "flag", "T", bool),
        no_parser=("Don't train parser", "flag", "P", bool),
        no_ner=("Don't train NER", "flag", "N", bool)
    )
-    def train(self, lang, output_dir, train_data, dev_data, n_iter=15,
+    def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
-              parser_L1=0.0,
+              parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
-              no_tagger=False, no_parser=False, no_ner=False):
+        """
-        """Train a model."""
+        Train a model. Expects data in spaCy's JSON format.
-        cli_train(lang, output_dir, train_data, dev_data, n_iter,
+        """
                  not no_tagger, not no_parser, not no_ner,
                  parser_L1)
-
+        cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
-    @plac.annotations(
+                  not no_parser, not no_ner, parser_L1)
        config=("config", "positional", None, str),
    )
    def train_config(self, config):
        """Train a model from config file."""
        cli_train_config(config)
    def __missing__(self, name):
-        print("\n   Command %r does not exist\n" % name)
+        print("\n   Command %r does not exist."
              "\n   Use the --help flag for a list of available commands.\n" % name)
 if __name__ == '__main__':
--- a/website/docs/api/annotation.jade
+++ b/website/docs/api/annotation.jade
@@ -79,3 +79,33 @@ p
 +h(2, "named-entities") Named Entity Recognition
 include _annotation/_named-entities
 +h(2, "json-input") JSON input format for training
 p
    |  spaCy takes training data in the following format:
 +code("Example structure").
    doc: {
        id: string,
        paragraphs: [{
            raw: string,
            sents: [int],
            tokens: [{
                start: int,
                tag: string,
                head: int,
                dep: string
            }],
            ner: [{
                start: int,
                end: int,
                label: string
            }],
            brackets: [{
                start: int,
                end: int,
                label: string
            }]
        }]
    }
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@@ -138,3 +138,64 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
 +h(2, "train") Train
    +tag experimental
 p
    |  Train a model. Expects data in spaCy's
    |  #[+a("/docs/api/annotation#json-input") JSON format].
 +code(false, "bash").
    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
 +table(["Argument", "Type", "Description"])
    +row
        +cell #[code lang]
        +cell positional
        +cell Model language.
    +row
        +cell #[code output_dir]
        +cell positional
        +cell Directory to store model in.
    +row
        +cell #[code train_data]
        +cell positional
        +cell Location of JSON-formatted training data.
    +row
        +cell #[code dev_data]
        +cell positional
        +cell Location of JSON-formatted dev data (optional).
    +row
        +cell #[code --n_iter], #[code -n]
        +cell option
        +cell Number of iterations (default: #[code 15]).
    +row
        +cell #[code --parser_L1], #[code -L]
        +cell option
        +cell L1 regularization penalty for parser (default: #[code 0.0]).
    +row
        +cell #[code --no_tagger], #[code -T]
        +cell flag
        +cell Don't train tagger.
    +row
        +cell #[code --no_parser], #[code -P]
        +cell flag
        +cell Don't train parser.
    +row
        +cell #[code --no_ner], #[code -N]
        +cell flag
        +cell Don't train NER.
    +row
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.