From 3a9710f35698a55822d80ded5d66239e6dcdba8a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 15:57:53 +0200 Subject: [PATCH 1/2] Pass dev_scores to print_progress correctly (resolves #1008) Only read scores attribute if command is used with dev_data, otherwise default dev_scores to empty dict. --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3900c7f39..8557019c6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -62,10 +62,10 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) - dev_scores = trainer.evaluate(dev_data) if dev_data else [] + dev_scores = trainer.evaluate(dev_data).scores if dev_data else {} print_progress(itn, trainer.nlp.parser.model.nr_weight, trainer.nlp.parser.model.nr_active_feat, - **dev_scores.scores) + **dev_scores) def evaluate(Language, gold_tuples, output_path): From 2bfec1a4f8b0c3218b731084a1a1619c72d69967 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 15:58:38 +0200 Subject: [PATCH 2/2] Add note on languages with non-latin characters (see #996) --- website/docs/usage/adding-languages.jade | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 0c98cc5ca..50b626b99 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -98,6 +98,17 @@ p | so that Python functions can be used to help you generalise and combine | the data as you require. ++infobox("For languages with non-latin characters") + | In order for the tokenizer to split suffixes, prefixes and infixes, spaCy + | needs to know the language's character set. 
If the language you're adding + | uses non-Latin characters, you might need to add the required character + | classes to the global + | #[+src(gh("spacy", "spacy/language_data/punctuation.py")) punctuation.py]. + | spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library] + | to keep this simple and readable. If the language requires very specific + | punctuation rules, you should consider overwriting the default regular + | expressions with your own in the language's #[code Defaults]. + +h(3, "stop-words") Stop words p