diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8dce873ad..5d2155d5e 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -36,12 +36,13 @@ from ..compat import json_dumps
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
     version=("Model version", "option", "V", str),
     meta_path=("Optional path to meta.json. All relevant properties will be "
-               "overwritten.", "option", "m", Path))
+               "overwritten.", "option", "m", Path),
+    verbose=("Display more information for debugging", "flag", None, bool))
 def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
           parser_multitasks='', entity_multitasks='', use_gpu=-1, vectors=None,
           no_tagger=False, no_parser=False, no_entities=False, gold_preproc=False,
-          version="0.0.0", meta_path=None):
+          version="0.0.0", meta_path=None, verbose=False):
     """
     Train a model. Expects data in spaCy's JSON format.
     """
@@ -143,7 +144,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                                 gold_preproc=gold_preproc))
         nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
         start_time = timer()
-        scorer = nlp_loaded.evaluate(dev_docs)
+        scorer = nlp_loaded.evaluate(dev_docs, verbose)
         end_time = timer()
         if use_gpu < 0:
             gpu_wps = None
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index e396aa7c6..4cd5377a4 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -30,7 +30,7 @@ def tags_to_entities(tags):
             continue
         elif tag.startswith('I'):
             if start is None:
-                raise ValueError(Errors.E067.format(tags=tags[:i]))
+                raise ValueError(Errors.E067.format(tags=tags[:i+1]))
             continue
         if tag.startswith('U'):
             entities.append((tag[2:], i, i))
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index 4f84ef9b5..aa03e61cb 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
 
 
 _exc = {}
@@ -78,5 +78,11 @@
     "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
     _exc[orth] = [{ORTH: orth}]
 
+# Sentences ending in "i." (as in "... peka i.") or "m." (as in
+# "... än 2000 m.") should be tokenized as two separate tokens.
+for orth in ["i", "m"]:
+    _exc[orth + "."] = [
+        {ORTH: orth, LEMMA: orth, NORM: orth},
+        {ORTH: ".", TAG: PUNCT}]
 
 TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py
index dbb3e1dd1..7f81e250e 100644
--- a/spacy/tests/lang/sv/test_tokenizer.py
+++ b/spacy/tests/lang/sv/test_tokenizer.py
@@ -6,7 +6,8 @@ import pytest
 
 SV_TOKEN_EXCEPTION_TESTS = [
     ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
-    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
+    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
+    ('Anders I. tycker om ord med i i.', ['Anders', 'I.', 'tycker', 'om', 'ord', 'med', 'i', 'i', '.'])
 ]
diff --git a/website/api/cli.jade b/website/api/cli.jade
index 760f34a4b..6b5bc528e 100644
--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@@ -260,7 +260,7 @@ p
 +code(false, "bash", "$", false, false, true).
     python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
     [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
-    [--no-entities] [--gold-preproc]
+    [--no-entities] [--gold-preproc] [--verbose]
 
 +table(["Argument", "Type", "Description"])
     +row
@@ -344,6 +344,11 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
+    +row
+        +cell #[code --verbose]
+        +cell flag
+        +cell Show more detailed messages during training.
+
     +row("foot")
         +cell creates
         +cell model, pickle
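
Why the gold.pyx slice moves from tags[:i] to tags[:i+1]: when an I- tag appears with no open entity, the old slice cut the offending tag itself out of the E067 message, so the reported prefix looked like a valid sequence. A standalone sketch of that error path follows; the error text is invented for illustration (the real message lives in spacy.errors as Errors.E067):

def check_biluo(tags):
    # Mirrors the tags_to_entities error path from spacy/gold.pyx.
    start = None
    for i, tag in enumerate(tags):
        if tag.startswith('B'):
            start = i
        elif tag.startswith('I') and start is None:
            # tags[:i + 1] keeps the offending tag in the report.
            raise ValueError("Invalid BILUO sequence: %s" % tags[:i + 1])

check_biluo(['O', 'I-PER'])
# ValueError: Invalid BILUO sequence: ['O', 'I-PER']
# (with the old tags[:i] slice, the message showed only ['O'])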
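
A quick way to sanity-check the new Swedish exceptions outside the test suite is to push the added test sentence through a blank Swedish pipeline. A minimal sketch, assuming spaCy 2.x, where spacy.blank('sv') builds the tokenizer from the language data patched above:

import spacy

# Blank pipeline: no models, just the Swedish tokenizer rules.
nlp = spacy.blank('sv')
doc = nlp('Anders I. tycker om ord med i i.')
print([t.text for t in doc])
# Expected, per the added test case:
# ['Anders', 'I.', 'tycker', 'om', 'ord', 'med', 'i', 'i', '.']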