Merge branch 'master' of https://github.com/explosion/spaCy

2026-01-01 06:23:27 +03:00 · 2018-08-07 10:49:39 +02:00 · 2018-08-07 10:49:39 +02:00 · 664cfc29bc
commit 664cfc29bc
parent 2278c9734e f0c9652ed1
5 changed files with 20 additions and 7 deletions
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -36,12 +36,13 @@ from ..compat import json_dumps
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json. All relevant properties will be "
-               "overwritten.", "option", "m", Path))
+               "overwritten.", "option", "m", Path),
+    verbose=("Display more information for debug", "option", None, bool))
 def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         parser_multitasks='', entity_multitasks='',
          use_gpu=-1, vectors=None, no_tagger=False,
          no_parser=False, no_entities=False, gold_preproc=False,
-          version="0.0.0", meta_path=None):
+          version="0.0.0", meta_path=None, verbose=False):
    """
    Train a model. Expects data in spaCy's JSON format.
    """
@ -143,7 +144,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                                gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
-                scorer = nlp_loaded.evaluate(dev_docs)
+                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -30,7 +30,7 @@ def tags_to_entities(tags):
            continue
        elif tag.startswith('I'):
            if start is None:
-                raise ValueError(Errors.E067.format(tags=tags[:i]))
+                raise ValueError(Errors.E067.format(tags=tags[:i+1]))
            continue
        if tag.startswith('U'):
            entities.append((tag[2:], i, i))
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT


 _exc = {}
@ -78,5 +78,11 @@ for orth in [
    "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
    _exc[orth] = [{ORTH: orth}]

+# Sentences ending in "i." (as in "... peka i.") or "m." (as in "... än 2000 m.")
+# should be tokenized as two separate tokens.
+for orth in ["i", "m"]:
+    _exc[orth + "."] = [
+        {ORTH: orth, LEMMA: orth, NORM: orth},
+        {ORTH: ".", TAG: PUNCT}]

 TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/tests/lang/sv/test_tokenizer.py
+++ b/spacy/tests/lang/sv/test_tokenizer.py
@ -6,7 +6,8 @@ import pytest

 SV_TOKEN_EXCEPTION_TESTS = [
    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
-    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
+    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
+    ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
 ]


--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@ -260,7 +260,7 @@ p
 +code(false, "bash", "$", false, false, true).
    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
    [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
-    [--no-entities] [--gold-preproc]
+    [--no-entities] [--gold-preproc] [--verbose]

 +table(["Argument", "Type", "Description"])
    +row
@ -344,6 +344,11 @@ p
        +cell flag
        +cell Show help message and available arguments.

+    +row
+        +cell #[code --verbose]
+        +cell flag
+        +cell Show more detailed messages during training.
+
    +row("foot")
        +cell creates
        +cell model, pickle