This commit is contained in:
Matthew Honnibal 2017-03-26 09:26:59 -05:00
commit 92ac3af21d
4 changed files with 106 additions and 23 deletions

View File

@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and
Cython. spaCy is built on the very latest research, but it isn't researchware. Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day one to be used in real products. spaCy currently supports It was designed from day one to be used in real products. spaCy currently supports
English and German, as well as tokenization for Chinese, Spanish, Italian, French, English and German, as well as tokenization for Chinese, Spanish, Italian, French,
Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial open-source Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial
software, released under the MIT license. open-source software, released under the MIT license.
💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_ 💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

View File

@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
from spacy.cli import info as cli_info from spacy.cli import info as cli_info
from spacy.cli import package as cli_package from spacy.cli import package as cli_package
from spacy.cli import train as cli_train from spacy.cli import train as cli_train
from spacy.cli import train_config as cli_train_config
class CLI(object): class CLI(object):
@ -77,36 +76,29 @@ class CLI(object):
@plac.annotations( @plac.annotations(
lang=("language", "positional", None, str), lang=("model language", "positional", None, str),
output_dir=("output directory", "positional", None, str), output_dir=("output directory to store model in", "positional", None, str),
train_data=("training data", "positional", None, str), train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("development data", "positional", None, str), dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int), n_iter=("number of iterations", "option", "n", int),
parser_L1=("L1 regularization penalty for parser", "option", "L", float), parser_L1=("L1 regularization penalty for parser", "option", "L", float),
no_tagger=("Don't train tagger", "flag", "T", bool), no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool), no_parser=("Don't train parser", "flag", "P", bool),
no_ner=("Don't train NER", "flag", "N", bool) no_ner=("Don't train NER", "flag", "N", bool)
) )
def train(self, lang, output_dir, train_data, dev_data, n_iter=15, def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
parser_L1=0.0, parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
no_tagger=False, no_parser=False, no_ner=False): """
"""Train a model.""" Train a model. Expects data in spaCy's JSON format.
cli_train(lang, output_dir, train_data, dev_data, n_iter, """
not no_tagger, not no_parser, not no_ner,
parser_L1)
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
@plac.annotations( not no_parser, not no_ner, parser_L1)
config=("config", "positional", None, str),
)
def train_config(self, config):
"""Train a model from config file."""
cli_train_config(config)
def __missing__(self, name): def __missing__(self, name):
print("\n Command %r does not exist\n" % name) print("\n Command %r does not exist."
"\n Use the --help flag for a list of available commands.\n" % name)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -79,3 +79,33 @@ p
+h(2, "named-entities") Named Entity Recognition +h(2, "named-entities") Named Entity Recognition
include _annotation/_named-entities include _annotation/_named-entities
+h(2, "json-input") JSON input format for training
p
| spaCy takes training data in the following format:
+code("Example structure").
doc: {
id: string,
paragraphs: [{
raw: string,
sents: [int],
tokens: [{
start: int,
tag: string,
head: int,
dep: string
}],
ner: [{
start: int,
end: int,
label: string
}],
brackets: [{
start: int,
end: int,
label: string
}]
}]
}

View File

@ -138,3 +138,64 @@ p
+cell #[code --help], #[code -h] +cell #[code --help], #[code -h]
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+h(2, "train") Train
+tag experimental
p
| Train a model. Expects data in spaCy's
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell Model language.
+row
+cell #[code output_dir]
+cell positional
+cell Directory to store model in.
+row
+cell #[code train_data]
+cell positional
+cell Location of JSON-formatted training data.
+row
+cell #[code dev_data]
+cell positional
+cell Location of JSON-formatted dev data (optional).
+row
+cell #[code --n_iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 15]).
+row
+cell #[code --parser_L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --no_tagger], #[code -T]
+cell flag
+cell Don't train tagger.
+row
+cell #[code --no_parser], #[code -P]
+cell flag
+cell Don't train parser.
+row
+cell #[code --no_ner], #[code -N]
+cell flag
+cell Don't train NER.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.