diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 87c3033ad..bad63209e 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -90,12 +90,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
             lexeme.cluster = 0
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    for word in vector_keys:
-        if word not in nlp.vocab:
-            lexeme = nlp.vocab[word]
-            lexeme.is_oov = False
-            lex_added += 1
-    if len(vectors_data):
+    if vector_keys is not None:
+        for word in vector_keys:
+            if word not in nlp.vocab:
+                lexeme = nlp.vocab[word]
+                lexeme.is_oov = False
+                lex_added += 1
+    if vectors_data:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 4cad2cae1..a9c332fe3 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import json
+import shutil
 
 from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
@@ -186,14 +187,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / 'model-final'
             nlp.to_disk(final_model_path)
-            components = []
-            if not no_parser:
-                components.append('parser')
-            if not no_tagger:
-                components.append('tagger')
-            if not no_entities:
-                components.append('ner')
-            _collate_best_model(meta, output_path, components)
+        components = []
+        if not no_parser:
+            components.append('parser')
+        if not no_tagger:
+            components.append('tagger')
+        if not no_entities:
+            components.append('ner')
+        _collate_best_model(meta, output_path, components)
+
 
 def _collate_best_model(meta, output_path, components):
     bests = {}
@@ -202,8 +204,8 @@ def _collate_best_model(meta, output_path, components):
     best_dest = output_path / 'model-best'
     shutil.copytree(output_path / 'model-final', best_dest)
     for component, best_component_src in bests.items():
-        shutil.rmtree(best_dir / component)
-        shutil.copytree(best_component_src, best_dest / component)
+        shutil.rmtree(best_dest / component)
+        shutil.copytree(best_component_src / component, best_dest / component)
         with (best_component_src / 'accuracy.json').open() as file_:
             accs = json.load(file_)
         for metric in _get_metrics(component):
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 477c9d6e2..e913b2647 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -501,15 +501,20 @@
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
         for gold in golds:
             for tag in gold.tags:
                 if tag is None:
                     correct[idx] = guesses[idx]
-                else:
+                elif tag in tag_index:
                     correct[idx] = tag_index[tag]
+                else:
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        d_scores *= known_labels
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores