Merge branch 'develop' of https://github.com/explosion/spaCy into develop
Commit f9142b4bfc

@@ -90,12 +90,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
             lexeme.cluster = 0
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    for word in vector_keys:
-        if word not in nlp.vocab:
-            lexeme = nlp.vocab[word]
-            lexeme.is_oov = False
-            lex_added += 1
-    if len(vectors_data):
+    if vector_keys is not None:
+        for word in vector_keys:
+            if word not in nlp.vocab:
+                lexeme = nlp.vocab[word]
+                lexeme.is_oov = False
+                lex_added += 1
+    if vectors_data:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
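
Note on this hunk: the loop over vector_keys is now guarded against vector_keys being None, the vectors are only attached when rows were actually loaded, and the table is optionally pruned afterwards. Below is a minimal, hypothetical sketch of how those calls fit together, assuming the spaCy 2.x APIs that the changed code itself uses (Vectors(data=..., keys=...) and Vocab.prune_vectors()); the toy words, vector rows and prune count are invented for illustration.

    import numpy
    from spacy.lang.en import English
    from spacy.vectors import Vectors

    nlp = English()
    vector_keys = ['apple', 'banana', 'cherry']                    # invented example data
    vectors_data = numpy.random.uniform(-1, 1, (3, 4)).astype('f')
    prune_vectors = 2

    if vector_keys is not None:                  # new guard: keys may be absent
        for word in vector_keys:
            if word not in nlp.vocab:
                lexeme = nlp.vocab[word]         # looking a word up creates the lexeme
                lexeme.is_oov = False

    if len(vectors_data):                        # len() here because this sketch uses a numpy array
        nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)

    if prune_vectors >= 1:
        # Keep only the N most frequent vectors; pruned entries are remapped
        # to their closest surviving vector.
        nlp.vocab.prune_vectors(prune_vectors)
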
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import json
+import shutil
 
 from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG

@@ -186,14 +187,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / 'model-final'
             nlp.to_disk(final_model_path)
-            components = []
-            if not no_parser:
-                components.append('parser')
-            if not no_tagger:
-                components.append('tagger')
-            if not no_entities:
-                components.append('ner')
-            _collate_best_model(meta, output_path, components)
+        components = []
+        if not no_parser:
+            components.append('parser')
+        if not no_tagger:
+            components.append('tagger')
+        if not no_entities:
+            components.append('ner')
+        _collate_best_model(meta, output_path, components)
+
 
 def _collate_best_model(meta, output_path, components):
     bests = {}
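
This train() hunk dedents the collation block, so _collate_best_model() runs only after the with nlp.use_params(optimizer.averages): block has exited and the final model is already on disk. A small, self-contained sketch of that pattern, assuming spaCy 2.x; the blank English pipeline and the temporary output directory are invented for illustration, not taken from the diff.

    import tempfile
    from pathlib import Path

    from spacy.lang.en import English

    nlp = English()
    optimizer = nlp.begin_training()             # optimizer carrying .averages

    output_path = Path(tempfile.mkdtemp())
    with nlp.use_params(optimizer.averages):     # averaged weights only inside the block
        nlp.to_disk(output_path / 'model-final')
    # Anything that only rearranges files already written to disk (like
    # _collate_best_model in the diff) can run here, outside the block.
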
@@ -202,8 +204,8 @@ def _collate_best_model(meta, output_path, components):
     best_dest = output_path / 'model-best'
     shutil.copytree(output_path / 'model-final', best_dest)
     for component, best_component_src in bests.items():
-        shutil.rmtree(best_dir / component)
-        shutil.copytree(best_component_src, best_dest / component)
+        shutil.rmtree(best_dest / component)
+        shutil.copytree(best_component_src / component, best_dest / component)
         with (best_component_src / 'accuracy.json').open() as file_:
             accs = json.load(file_)
             for metric in _get_metrics(component):
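
The _collate_best_model() fix above stops referencing the undefined best_dir and copies the right level of the tree: the whole component directory under model-best is replaced by the matching component subdirectory from the best-scoring epoch. A hypothetical sketch of that directory shuffle (the helper name and the return of the accuracy dict are invented, not the spaCy implementation itself):

    import json
    import shutil
    from pathlib import Path

    def replace_component(best_dest, best_component_src, component):
        # Drop the component that was copied over from model-final ...
        shutil.rmtree(best_dest / component)
        # ... and put the same component from the best epoch in its place.
        shutil.copytree(best_component_src / component, best_dest / component)
        # The epoch's accuracy.json travels with it, so its scores can be
        # merged into the collated model's meta.
        with (best_component_src / 'accuracy.json').open() as file_:
            return json.load(file_)
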
@@ -501,15 +501,20 @@ class Tagger(Pipe):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
         for gold in golds:
             for tag in gold.tags:
                 if tag is None:
                     correct[idx] = guesses[idx]
-                else:
+                elif tag in tag_index:
                     correct[idx] = tag_index[tag]
+                else:
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        d_scores *= known_labels
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
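
The Tagger.get_loss() change handles gold tags that are missing from the tag map: instead of failing on the tag_index lookup, the token's row is flagged in known_labels and its gradient is zeroed, so unknown tags neither add to the loss nor push the weights anywhere. A standalone numpy sketch of that masking idea (not the spaCy code itself; the tiny score matrix, tag map and gold tags are invented):

    import numpy

    scores = numpy.asarray([[0.7, 0.2, 0.1],
                            [0.1, 0.8, 0.1],
                            [0.3, 0.3, 0.4]], dtype='f')
    tag_index = {'NOUN': 0, 'VERB': 1, 'ADJ': 2}
    gold_tags = ['NOUN', 'X', None]              # 'X' is not in the tag map

    correct = numpy.zeros((scores.shape[0],), dtype='i')
    known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
    guesses = scores.argmax(axis=1)
    for idx, tag in enumerate(gold_tags):
        if tag is None:                          # unannotated token: accept the guess
            correct[idx] = guesses[idx]
        elif tag in tag_index:
            correct[idx] = tag_index[tag]
        else:                                    # unknown tag: mask this row out
            correct[idx] = 0
            known_labels[idx] = 0.

    one_hot = numpy.eye(scores.shape[1], dtype='f')[correct]
    d_scores = (scores - one_hot) * known_labels  # masked row becomes all zeros
    loss = (d_scores ** 2).sum()
    print(d_scores, loss)
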