Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2018-06-29 12:13:30 +02:00
commit f9142b4bfc
3 changed files with 25 additions and 17 deletions

View File

@@ -90,12 +90,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
lexeme.cluster = 0
lex_added += 1
nlp.vocab.cfg.update({'oov_prob': oov_prob})
for word in vector_keys:
if word not in nlp.vocab:
lexeme = nlp.vocab[word]
lexeme.is_oov = False
lex_added += 1
if len(vectors_data):
if vector_keys is not None:
for word in vector_keys:
if word not in nlp.vocab:
lexeme = nlp.vocab[word]
lexeme.is_oov = False
lex_added += 1
if vectors_data:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)

View File

@@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import json
import shutil
from ._messages import Messages
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
@@ -186,14 +187,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with nlp.use_params(optimizer.averages):
final_model_path = output_path / 'model-final'
nlp.to_disk(final_model_path)
components = []
if not no_parser:
components.append('parser')
if not no_tagger:
components.append('tagger')
if not no_entities:
components.append('ner')
_collate_best_model(meta, output_path, components)
components = []
if not no_parser:
components.append('parser')
if not no_tagger:
components.append('tagger')
if not no_entities:
components.append('ner')
_collate_best_model(meta, output_path, components)
def _collate_best_model(meta, output_path, components):
bests = {}
@@ -202,8 +204,8 @@ def _collate_best_model(meta, output_path, components):
best_dest = output_path / 'model-best'
shutil.copytree(output_path / 'model-final', best_dest)
for component, best_component_src in bests.items():
shutil.rmtree(best_dir / component)
shutil.copytree(best_component_src, best_dest / component)
shutil.rmtree(best_dest / component)
shutil.copytree(best_component_src / component, best_dest / component)
with (best_component_src / 'accuracy.json').open() as file_:
accs = json.load(file_)
for metric in _get_metrics(component):

View File

@@ -501,15 +501,20 @@ class Tagger(Pipe):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
for gold in golds:
for tag in gold.tags:
if tag is None:
correct[idx] = guesses[idx]
else:
elif tag in tag_index:
correct[idx] = tag_index[tag]
else:
correct[idx] = 0
known_labels[idx] = 0.
idx += 1
correct = self.model.ops.xp.array(correct, dtype='i')
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
d_scores *= known_labels
loss = (d_scores**2).sum()
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores