This commit is contained in:
Matthew Honnibal 2017-04-23 17:07:46 +02:00
commit c9ec24b257
2 changed files with 13 additions and 2 deletions

View File

@ -62,10 +62,10 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
for doc, gold in epoch: for doc, gold in epoch:
trainer.update(doc, gold) trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data) if dev_data else [] dev_scores = trainer.evaluate(dev_data).scores if dev_data else {}
print_progress(itn, trainer.nlp.parser.model.nr_weight, print_progress(itn, trainer.nlp.parser.model.nr_weight,
trainer.nlp.parser.model.nr_active_feat, trainer.nlp.parser.model.nr_active_feat,
**dev_scores.scores) **dev_scores)
def evaluate(Language, gold_tuples, output_path): def evaluate(Language, gold_tuples, output_path):

View File

@ -98,6 +98,17 @@ p
| so that Python functions can be used to help you generalise and combine | so that Python functions can be used to help you generalise and combine
| the data as you require. | the data as you require.
+infobox("For languages with non-Latin characters")
| In order for the tokenizer to split suffixes, prefixes and infixes, spaCy
| needs to know the language's character set. If the language you're adding
| uses non-Latin characters, you might need to add the required character
| classes to the global
| #[+src(gh("spacy", "spacy/language_data/punctuation.py")) punctuation.py].
| spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library]
| to keep this simple and readable. If the language requires very specific
| punctuation rules, you should consider overriding the default regular
| expressions with your own in the language's #[code Defaults].
+h(3, "stop-words") Stop words +h(3, "stop-words") Stop words
p p