This commit is contained in:
Matthew Honnibal 2017-03-26 09:26:59 -05:00
commit 92ac3af21d
4 changed files with 106 additions and 23 deletions

View File

@@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and
Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day one to be used in real products. spaCy currently supports
English and German, as well as tokenization for Chinese, Spanish, Italian, French,
Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial open-source
software, released under the MIT license.
Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial
open-source software, released under the MIT license.
💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

View File

@@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import train_config as cli_train_config
class CLI(object):
@@ -77,36 +76,29 @@ class CLI(object):
@plac.annotations(
lang=("language", "positional", None, str),
output_dir=("output directory", "positional", None, str),
train_data=("training data", "positional", None, str),
dev_data=("development data", "positional", None, str),
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_ner=("Don't train NER", "flag", "N", bool)
)
def train(self, lang, output_dir, train_data, dev_data, n_iter=15,
parser_L1=0.0,
no_tagger=False, no_parser=False, no_ner=False):
"""Train a model."""
cli_train(lang, output_dir, train_data, dev_data, n_iter,
not no_tagger, not no_parser, not no_ner,
parser_L1)
def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
@plac.annotations(
config=("config", "positional", None, str),
)
def train_config(self, config):
"""Train a model from config file."""
cli_train_config(config)
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
not no_parser, not no_ner, parser_L1)
def __missing__(self, name):
print("\n Command %r does not exist\n" % name)
print("\n Command %r does not exist."
"\n Use the --help flag for a list of available commands.\n" % name)
if __name__ == '__main__':

View File

@@ -79,3 +79,33 @@ p
+h(2, "named-entities") Named Entity Recognition
include _annotation/_named-entities
+h(2, "json-input") JSON input format for training
p
| spaCy takes training data in the following format:
+code("Example structure").
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string
}],
ner: [{
start: int,
end: int,
label: string
}],
brackets: [{
start: int,
end: int,
label: string
}]
}]
}

View File

@@ -138,3 +138,64 @@ p
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+h(2, "train") Train
+tag experimental
p
| Train a model. Expects data in spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell Model language.
+row
+cell #[code output_dir]
+cell positional
+cell Directory to store model in.
+row
+cell #[code train_data]
+cell positional
+cell Location of JSON-formatted training data.
+row
+cell #[code dev_data]
+cell positional
+cell Location of JSON-formatted dev data (optional).
+row
+cell #[code --n_iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 15]).
+row
+cell #[code --parser_L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --no_tagger], #[code -T]
+cell flag
+cell Don't train tagger.
+row
+cell #[code --no_parser], #[code -P]
+cell flag
+cell Don't train parser.
+row
+cell #[code --no_ner], #[code -N]
+cell flag
+cell Don't train NER.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.