mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
664cfc29bc
|
@ -36,12 +36,13 @@ from ..compat import json_dumps
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
meta_path=("Optional path to meta.json. All relevant properties will be "
|
||||||
"overwritten.", "option", "m", Path))
|
"overwritten.", "option", "m", Path),
|
||||||
|
verbose=("Display more information for debug", "option", None, bool))
|
||||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
parser_multitasks='', entity_multitasks='',
|
parser_multitasks='', entity_multitasks='',
|
||||||
use_gpu=-1, vectors=None, no_tagger=False,
|
use_gpu=-1, vectors=None, no_tagger=False,
|
||||||
no_parser=False, no_entities=False, gold_preproc=False,
|
no_parser=False, no_entities=False, gold_preproc=False,
|
||||||
version="0.0.0", meta_path=None):
|
version="0.0.0", meta_path=None, verbose=False):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
"""
|
"""
|
||||||
|
@ -143,7 +144,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
||||||
gold_preproc=gold_preproc))
|
gold_preproc=gold_preproc))
|
||||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
if use_gpu < 0:
|
if use_gpu < 0:
|
||||||
gpu_wps = None
|
gpu_wps = None
|
||||||
|
|
|
@ -30,7 +30,7 @@ def tags_to_entities(tags):
|
||||||
continue
|
continue
|
||||||
elif tag.startswith('I'):
|
elif tag.startswith('I'):
|
||||||
if start is None:
|
if start is None:
|
||||||
raise ValueError(Errors.E067.format(tags=tags[:i]))
|
raise ValueError(Errors.E067.format(tags=tags[:i+1]))
|
||||||
continue
|
continue
|
||||||
if tag.startswith('U'):
|
if tag.startswith('U'):
|
||||||
entities.append((tag[2:], i, i))
|
entities.append((tag[2:], i, i))
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
|
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
@ -78,5 +78,11 @@ for orth in [
|
||||||
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
"s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]:
|
||||||
_exc[orth] = [{ORTH: orth}]
|
_exc[orth] = [{ORTH: orth}]
|
||||||
|
|
||||||
|
# Sentences ending in "i." (as in "... peka i.") or "m." (as in "...än 2000 m.")
|
||||||
|
# should be tokenized as two separate tokens.
|
||||||
|
for orth in ["i", "m"]:
|
||||||
|
_exc[orth + "."] = [
|
||||||
|
{ORTH: orth, LEMMA: orth, NORM: orth},
|
||||||
|
{ORTH: ".", TAG: PUNCT}]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = _exc
|
TOKENIZER_EXCEPTIONS = _exc
|
||||||
|
|
|
@ -6,7 +6,8 @@ import pytest
|
||||||
|
|
||||||
SV_TOKEN_EXCEPTION_TESTS = [
|
SV_TOKEN_EXCEPTION_TESTS = [
|
||||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
||||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
|
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
|
||||||
|
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -260,7 +260,7 @@ p
|
||||||
+code(false, "bash", "$", false, false, true).
|
+code(false, "bash", "$", false, false, true).
|
||||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
|
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
|
||||||
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
|
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
|
||||||
[--no-entities] [--gold-preproc]
|
[--no-entities] [--gold-preproc] [--verbose]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -344,6 +344,11 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --verbose]
|
||||||
|
+cell flag
|
||||||
|
+cell Show more detailed messages during training.
|
||||||
|
|
||||||
+row("foot")
|
+row("foot")
|
||||||
+cell creates
|
+cell creates
|
||||||
+cell model, pickle
|
+cell model, pickle
|
||||||
|
|
Loading…
Reference in New Issue
Block a user