mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
92ac3af21d
|
@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and
|
||||||
Cython. spaCy is built on the very latest research, but it isn't researchware.
|
Cython. spaCy is built on the very latest research, but it isn't researchware.
|
||||||
It was designed from day one to be used in real products. spaCy currently supports
|
It was designed from day one to be used in real products. spaCy currently supports
|
||||||
English and German, as well as tokenization for Chinese, Spanish, Italian, French,
|
English and German, as well as tokenization for Chinese, Spanish, Italian, French,
|
||||||
Portuguese, Dutch, Swedish, Finnish, Hungarian and Bengali. It's commercial open-source
|
Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial
|
||||||
software, released under the MIT license.
|
open-source software, released under the MIT license.
|
||||||
|
|
||||||
💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
💫 **Version 1.7 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,6 @@ from spacy.cli import link as cli_link
|
||||||
from spacy.cli import info as cli_info
|
from spacy.cli import info as cli_info
|
||||||
from spacy.cli import package as cli_package
|
from spacy.cli import package as cli_package
|
||||||
from spacy.cli import train as cli_train
|
from spacy.cli import train as cli_train
|
||||||
from spacy.cli import train_config as cli_train_config
|
|
||||||
|
|
||||||
|
|
||||||
class CLI(object):
|
class CLI(object):
|
||||||
|
@ -77,36 +76,29 @@ class CLI(object):
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("language", "positional", None, str),
|
lang=("model language", "positional", None, str),
|
||||||
output_dir=("output directory", "positional", None, str),
|
output_dir=("output directory to store model in", "positional", None, str),
|
||||||
train_data=("training data", "positional", None, str),
|
train_data=("location of JSON-formatted training data", "positional", None, str),
|
||||||
dev_data=("development data", "positional", None, str),
|
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
n_iter=("number of iterations", "option", "n", int),
|
||||||
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
|
parser_L1=("L1 regularization penalty for parser", "option", "L", float),
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
no_parser=("Don't train parser", "flag", "P", bool),
|
||||||
no_ner=("Don't train NER", "flag", "N", bool)
|
no_ner=("Don't train NER", "flag", "N", bool)
|
||||||
)
|
)
|
||||||
def train(self, lang, output_dir, train_data, dev_data, n_iter=15,
|
def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
|
||||||
parser_L1=0.0,
|
parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
|
||||||
no_tagger=False, no_parser=False, no_ner=False):
|
"""
|
||||||
"""Train a model."""
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
cli_train(lang, output_dir, train_data, dev_data, n_iter,
|
"""
|
||||||
not no_tagger, not no_parser, not no_ner,
|
|
||||||
parser_L1)
|
|
||||||
|
|
||||||
|
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
|
||||||
@plac.annotations(
|
not no_parser, not no_ner, parser_L1)
|
||||||
config=("config", "positional", None, str),
|
|
||||||
)
|
|
||||||
def train_config(self, config):
|
|
||||||
"""Train a model from config file."""
|
|
||||||
|
|
||||||
cli_train_config(config)
|
|
||||||
|
|
||||||
|
|
||||||
def __missing__(self, name):
|
def __missing__(self, name):
|
||||||
print("\n Command %r does not exist\n" % name)
|
print("\n Command %r does not exist."
|
||||||
|
"\n Use the --help flag for a list of available commands.\n" % name)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -79,3 +79,33 @@ p
|
||||||
+h(2, "named-entities") Named Entity Recognition
|
+h(2, "named-entities") Named Entity Recognition
|
||||||
|
|
||||||
include _annotation/_named-entities
|
include _annotation/_named-entities
|
||||||
|
|
||||||
|
+h(2, "json-input") JSON input format for training
|
||||||
|
|
||||||
|
p
|
||||||
|
| spaCy takes training data in the following format:
|
||||||
|
|
||||||
|
+code("Example structure").
|
||||||
|
doc: {
|
||||||
|
id: string,
|
||||||
|
paragraphs: [{
|
||||||
|
raw: string,
|
||||||
|
sents: [int],
|
||||||
|
tokens: [{
|
||||||
|
start: int,
|
||||||
|
tag: string,
|
||||||
|
head: int,
|
||||||
|
dep: string
|
||||||
|
}],
|
||||||
|
ner: [{
|
||||||
|
start: int,
|
||||||
|
end: int,
|
||||||
|
label: string
|
||||||
|
}],
|
||||||
|
brackets: [{
|
||||||
|
start: int,
|
||||||
|
end: int,
|
||||||
|
label: string
|
||||||
|
}]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
|
@ -138,3 +138,64 @@ p
|
||||||
+cell #[code --help], #[code -h]
|
+cell #[code --help], #[code -h]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
|
+h(2, "train") Train
|
||||||
|
+tag experimental
|
||||||
|
|
||||||
|
p
|
||||||
|
| Train a model. Expects data in spaCy's
|
||||||
|
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||||
|
|
||||||
|
+code(false, "bash").
|
||||||
|
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||||
|
|
||||||
|
+table(["Argument", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code lang]
|
||||||
|
+cell positional
|
||||||
|
+cell Model language.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code output_dir]
|
||||||
|
+cell positional
|
||||||
|
+cell Directory to store model in.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code train_data]
|
||||||
|
+cell positional
|
||||||
|
+cell Location of JSON-formatted training data.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code dev_data]
|
||||||
|
+cell positional
|
||||||
|
+cell Location of JSON-formatted dev data (optional).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --n_iter], #[code -n]
|
||||||
|
+cell option
|
||||||
|
+cell Number of iterations (default: #[code 15]).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --parser_L1], #[code -L]
|
||||||
|
+cell option
|
||||||
|
+cell L1 regularization penalty for parser (default: #[code 0.0]).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --no_tagger], #[code -T]
|
||||||
|
+cell flag
|
||||||
|
+cell Don't train tagger.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --no_parser], #[code -P]
|
||||||
|
+cell flag
|
||||||
|
+cell Don't train parser.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --no_ner], #[code -N]
|
||||||
|
+cell flag
|
||||||
|
+cell Don't train NER.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --help], #[code -h]
|
||||||
|
+cell flag
|
||||||
|
+cell Show help message and available arguments.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user