This commit is contained in:
Matthew Honnibal 2018-08-07 10:49:39 +02:00
commit 664cfc29bc
5 changed files with 20 additions and 7 deletions

View File

@ -36,12 +36,13 @@ from ..compat import json_dumps
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str), version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be " meta_path=("Optional path to meta.json. All relevant properties will be "
"overwritten.", "option", "m", Path)) "overwritten.", "option", "m", Path),
verbose=("Display more information for debug", "option", None, bool))
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
parser_multitasks='', entity_multitasks='', parser_multitasks='', entity_multitasks='',
use_gpu=-1, vectors=None, no_tagger=False, use_gpu=-1, vectors=None, no_tagger=False,
no_parser=False, no_entities=False, gold_preproc=False, no_parser=False, no_entities=False, gold_preproc=False,
version="0.0.0", meta_path=None): version="0.0.0", meta_path=None, verbose=False):
""" """
Train a model. Expects data in spaCy's JSON format. Train a model. Expects data in spaCy's JSON format.
""" """
@ -143,7 +144,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
gold_preproc=gold_preproc)) gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer() start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs) scorer = nlp_loaded.evaluate(dev_docs, verbose)
end_time = timer() end_time = timer()
if use_gpu < 0: if use_gpu < 0:
gpu_wps = None gpu_wps = None

View File

@ -30,7 +30,7 @@ def tags_to_entities(tags):
continue continue
elif tag.startswith('I'): elif tag.startswith('I'):
if start is None: if start is None:
raise ValueError(Errors.E067.format(tags=tags[:i])) raise ValueError(Errors.E067.format(tags=tags[:i+1]))
continue continue
if tag.startswith('U'): if tag.startswith('U'):
entities.append((tag[2:], i, i)) entities.append((tag[2:], i, i))

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
_exc = {} _exc = {}
@ -78,5 +78,11 @@ for orth in [
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]: "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]
# When a sentence ends in "i." (as in "... peka i.") or "m." (as in "...än 2000 m."),
# the word and the final period should be tokenized as two separate tokens.
for orth in ["i", "m"]:
_exc[orth + "."] = [
{ORTH: orth, LEMMA: orth, NORM: orth},
{ORTH: ".", TAG: PUNCT}]
TOKENIZER_EXCEPTIONS = _exc TOKENIZER_EXCEPTIONS = _exc

View File

@ -6,7 +6,8 @@ import pytest
SV_TOKEN_EXCEPTION_TESTS = [ SV_TOKEN_EXCEPTION_TESTS = [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']) ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
] ]

View File

@ -260,7 +260,7 @@ p
+code(false, "bash", "$", false, false, true). +code(false, "bash", "$", false, false, true).
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
[--no-entities] [--gold-preproc] [--no-entities] [--gold-preproc] [--verbose]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -344,6 +344,11 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row
+cell #[code --verbose]
+cell flag
+cell Show more detailed messages during training.
+row("foot") +row("foot")
+cell creates +cell creates
+cell model, pickle +cell model, pickle