Fix init_model if there's no vocab (closes #4048) (#4049)

This commit is contained in:
Ines Montani 2019-08-01 17:26:09 +02:00 committed by GitHub
parent 925a852bb6
commit 8718ca8b1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -24,6 +24,7 @@ except ImportError:
ftfy = None ftfy = None
DEFAULT_OOV_PROB = -20
msg = Printer() msg = Printer()
@ -108,23 +109,30 @@ def open_file(loc):
def read_attrs_from_deprecated(freqs_loc, clusters_loc): def read_attrs_from_deprecated(freqs_loc, clusters_loc):
with msg.loading("Counting frequencies..."): if freqs_loc is not None:
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) with msg.loading("Counting frequencies..."):
msg.good("Counted frequencies") probs, _ = read_freqs(freqs_loc)
with msg.loading("Reading clusters..."): msg.good("Counted frequencies")
clusters = read_clusters(clusters_loc) if clusters_loc else {} else:
msg.good("Read clusters") probs, _ = ({}, DEFAULT_OOV_PROB)
if clusters_loc:
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc)
msg.good("Read clusters")
else:
clusters = {}
lex_attrs = [] lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
for i, (word, prob) in tqdm(enumerate(sorted_probs)): if len(sorted_probs):
attrs = {"orth": word, "id": i, "prob": prob} for i, (word, prob) in tqdm(enumerate(sorted_probs)):
# Decode as a little-endian string, so that we can do & 15 to get attrs = {"orth": word, "id": i, "prob": prob}
# the first 4 bits. See _parse_features.pyx # Decode as a little-endian string, so that we can do & 15 to get
if word in clusters: # the first 4 bits. See _parse_features.pyx
attrs["cluster"] = int(clusters[word][::-1], 2) if word in clusters:
else: attrs["cluster"] = int(clusters[word][::-1], 2)
attrs["cluster"] = 0 else:
lex_attrs.append(attrs) attrs["cluster"] = 0
lex_attrs.append(attrs)
return lex_attrs return lex_attrs
@ -142,8 +150,11 @@ def create_model(lang, lex_attrs):
lexeme.is_oov = False lexeme.is_oov = False
lex_added += 1 lex_added += 1
lex_added += 1 lex_added += 1
oov_prob = min(lex.prob for lex in nlp.vocab) if len(nlp.vocab):
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1}) oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
return nlp return nlp