* Ensure words in Brown clusters make it into the vocab, even if they're not in our probs list

This commit is contained in:
Matthew Honnibal 2015-05-31 05:46:16 +02:00
parent e77940565d
commit 5ab0f233a1

View File

@@ -74,6 +74,9 @@ def setup_vocab(src_dir, dst_dir):
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
for word in clusters:
if word not in probs:
probs[word] = -17.0
lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
entry = get_lex_props(word)