From d6b87a2f558b52d66549b6a66c0af00e283ad628 Mon Sep 17 00:00:00 2001 From: Stanislav Schmidt Date: Mon, 29 Mar 2021 15:24:39 +0200 Subject: [PATCH] Make vocab update in get_docs deterministic The attribute `DocBin.strings` is a set. In `DocBin.get_docs` a given vocab is updated by iterating over this set. Iteration over a Python set produces an arbitrary ordering; therefore the vocab is updated non-deterministically. When training (fine-tuning) a spaCy model, the base model's vocabulary will be updated with the new vocabulary in the training data in exactly the way described above. After serialization, the entries in the file `model/vocab/strings.json` will be ordered arbitrarily. This prevents reproducible model training. --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index d5b4e4ff7..ce67be2d7 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -124,7 +124,7 @@ class DocBin: DOCS: https://spacy.io/api/docbin#get_docs """ - for string in self.strings: + for string in sorted(self.strings): vocab[string] orth_col = self.attrs.index(ORTH) for i in range(len(self.tokens)):