Fix overwriting of lexical attributes when loading vectors during training

Matthew Honnibal 2018-02-17 18:11:11 +01:00
parent c0caf7cf27
commit 262d0a3148


@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
+from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
 from .. import util
@@ -90,6 +91,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     nlp.meta.update(meta)
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+        for lex in nlp.vocab:
+            values = {}
+            for attr, func in nlp.vocab.lex_attr_getters.items():
+                # These attrs are expected to be set by data. Others should
+                # be set by calling the language functions.
+                if attr not in (CLUSTER, PROB, IS_OOV, LANG):
+                    values[lex.vocab.strings[attr]] = func(lex.orth_)
+            lex.set_attrs(**values)
+            lex.is_oov = False
     for name in pipeline:
         nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
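
For context, here is a minimal standalone sketch of the pattern the added block implements: load a vectors package into the shared vocab, then re-derive every lexical attribute from the language's lex_attr_getters, except the ones (CLUSTER, PROB, IS_OOV, LANG) that are expected to come from the data. This is not the commit's code itself; it assumes the spaCy 2.x API of the time, and the package name "en_vectors_web_lg" is only an illustrative choice.

import spacy
from spacy.attrs import PROB, IS_OOV, CLUSTER, LANG

nlp = spacy.blank("en")

# Load the vectors package into the existing vocab, as the CLI does with
# util.load_model(vectors, vocab=nlp.vocab). Before this fix, the lexical
# attributes set by the language data could be left overwritten afterwards.
spacy.util.load_model("en_vectors_web_lg", vocab=nlp.vocab)

for lex in nlp.vocab:
    values = {}
    for attr, func in nlp.vocab.lex_attr_getters.items():
        # CLUSTER, PROB, IS_OOV and LANG are expected to come from the data;
        # everything else is re-derived from the lexeme's string form.
        if attr not in (CLUSTER, PROB, IS_OOV, LANG):
            values[nlp.vocab.strings[attr]] = func(lex.orth_)
    lex.set_attrs(**values)
    lex.is_oov = False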