mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 13:40:34 +03:00
feature(populate_vocab): Enable pruning out rare words from clusters data
This commit is contained in:
parent
6e1fad92a1
commit
cdaefae60a
|
@ -98,10 +98,6 @@ def read_clusters(clusters_path):
|
||||||
|
|
||||||
|
|
||||||
def populate_vocab(vocab, clusters, probs, oov_prob):
|
def populate_vocab(vocab, clusters, probs, oov_prob):
|
||||||
# Ensure probs has entries for all words seen during clustering.
|
|
||||||
for word in clusters:
|
|
||||||
if word not in probs:
|
|
||||||
probs[word] = oov_prob
|
|
||||||
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
|
||||||
lexeme = vocab[word]
|
lexeme = vocab[word]
|
||||||
lexeme.prob = prob
|
lexeme.prob = prob
|
||||||
|
|
Loading…
Reference in New Issue
Block a user