mirror of https://github.com/explosion/spaCy.git
synced 2025-07-10 08:12:24 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
commit f9142b4bfc
@@ -90,12 +90,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
         lexeme.cluster = 0
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    for word in vector_keys:
-        if word not in nlp.vocab:
-            lexeme = nlp.vocab[word]
-            lexeme.is_oov = False
-            lex_added += 1
-    if len(vectors_data):
+    if vector_keys is not None:
+        for word in vector_keys:
+            if word not in nlp.vocab:
+                lexeme = nlp.vocab[word]
+                lexeme.is_oov = False
+                lex_added += 1
+    if vectors_data:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)

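For context, the new guard means the word loop only runs when keyed vectors were actually supplied. A minimal, runnable sketch of the same pattern, where the add_vectors helper and the toy vocab set are illustrative stand-ins rather than spaCy's API:

import numpy

def add_vectors(vocab_words, vectors_data, vector_keys=None):
    """Add unseen words from vector_keys to a toy vocab set.

    vector_keys may be None when no keyed vectors are supplied, so the
    loop is guarded instead of iterating over None and crashing.
    """
    added = 0
    if vector_keys is not None:
        for word in vector_keys:
            if word not in vocab_words:
                vocab_words.add(word)
                added += 1
    # Report whether any vector rows were supplied at all.
    has_vectors = len(vectors_data) > 0
    return added, has_vectors

words = {"apple", "banana"}
print(add_vectors(words, numpy.zeros((2, 3), dtype="f"), ["apple", "cherry"]))  # (1, True)
print(add_vectors(words, numpy.zeros((0, 3), dtype="f")))                       # (0, False)
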
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import json
+import shutil
 
 from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG

@@ -186,14 +187,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / 'model-final'
             nlp.to_disk(final_model_path)
         components = []
         if not no_parser:
             components.append('parser')
         if not no_tagger:
             components.append('tagger')
         if not no_entities:
             components.append('ner')
         _collate_best_model(meta, output_path, components)
 
+
 def _collate_best_model(meta, output_path, components):
     bests = {}

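The components list here just records which pipes were actually trained, based on the no_parser/no_tagger/no_entities CLI flags, before handing them to _collate_best_model. A small sketch of that selection logic in isolation (select_components is a made-up name for illustration):

def select_components(no_parser=False, no_tagger=False, no_entities=False):
    """Return the names of the pipes that were trained, given the disable flags."""
    components = []
    if not no_parser:
        components.append('parser')
    if not no_tagger:
        components.append('tagger')
    if not no_entities:
        components.append('ner')
    return components

print(select_components())                  # ['parser', 'tagger', 'ner']
print(select_components(no_entities=True))  # ['parser', 'tagger']
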
@@ -202,8 +204,8 @@ def _collate_best_model(meta, output_path, components):
     best_dest = output_path / 'model-best'
     shutil.copytree(output_path / 'model-final', best_dest)
     for component, best_component_src in bests.items():
-        shutil.rmtree(best_dir / component)
-        shutil.copytree(best_component_src, best_dest / component)
+        shutil.rmtree(best_dest / component)
+        shutil.copytree(best_component_src / component, best_dest / component)
         with (best_component_src / 'accuracy.json').open() as file_:
             accs = json.load(file_)
             for metric in _get_metrics(component):

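The two fixed lines remove the component directory inside model-best (rather than referencing best_dir) and copy the component subdirectory of the best epoch's output, not the whole epoch directory. A runnable sketch of that collation step on a toy directory layout; the directory names, collate_best helper and accuracy.json contents are invented for the example and are not spaCy's actual code:

import json
import shutil
import tempfile
from pathlib import Path

def collate_best(output_path, bests):
    """Copy model-final to model-best, then overwrite each component
    directory with the version from that component's best epoch."""
    best_dest = output_path / 'model-best'
    shutil.copytree(output_path / 'model-final', best_dest)
    for component, best_component_src in bests.items():
        shutil.rmtree(best_dest / component)
        shutil.copytree(best_component_src / component, best_dest / component)
        with (best_component_src / 'accuracy.json').open() as file_:
            accs = json.load(file_)
        print(component, accs)

# Toy layout: a model-final directory plus one per-epoch directory.
output_path = Path(tempfile.mkdtemp())
for model_dir in ('model-final', 'model0'):
    for component in ('tagger', 'parser'):
        (output_path / model_dir / component).mkdir(parents=True)
(output_path / 'model0' / 'accuracy.json').write_text('{"tags_acc": 92.1}')

collate_best(output_path, {'tagger': output_path / 'model0'})
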
@@ -501,15 +501,20 @@ class Tagger(Pipe):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
         for gold in golds:
             for tag in gold.tags:
                 if tag is None:
                     correct[idx] = guesses[idx]
-                else:
+                elif tag in tag_index:
                     correct[idx] = tag_index[tag]
+                else:
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        d_scores *= known_labels
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

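The Tagger change masks out tokens whose gold tag is not in the model's tag map: such rows get an arbitrary class-0 target, and the known_labels column zeroes their gradient rows so they contribute nothing to the loss. A small numpy-only sketch of that masking; the function name, the inline one-hot encoding (standing in for thinc's to_categorical) and the example scores and tag map are all invented:

import numpy

def masked_tagger_loss(scores, gold_tags, tag_index):
    """Squared-error loss over tag probabilities, ignoring unknown gold tags.

    scores: (n_tokens, n_tags) float array of predicted probabilities.
    gold_tags: list of gold tag strings, or None for untagged tokens.
    tag_index: mapping from tag string to column index.
    """
    guesses = scores.argmax(axis=1)
    correct = numpy.zeros((scores.shape[0],), dtype='i')
    known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
    for idx, tag in enumerate(gold_tags):
        if tag is None:
            # Missing annotation: treat the model's own guess as correct.
            correct[idx] = guesses[idx]
        elif tag in tag_index:
            correct[idx] = tag_index[tag]
        else:
            # Unknown tag: pick an arbitrary target and mask the row out.
            correct[idx] = 0
            known_labels[idx] = 0.
    # One-hot encode the targets (stand-in for thinc's to_categorical).
    one_hot = numpy.eye(scores.shape[1], dtype='f')[correct]
    d_scores = (scores - one_hot) * known_labels
    loss = (d_scores ** 2).sum()
    return float(loss), d_scores

tag_index = {'NOUN': 0, 'VERB': 1}
scores = numpy.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]], dtype='f')
loss, d_scores = masked_tagger_loss(scores, ['NOUN', 'X', None], tag_index)
print(loss, d_scores[1])  # the unknown 'X' row is all zeros, so it adds nothing
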