mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
* Ensure words in Brown clusters make it into the vocab, even if they're not in our probs list
This commit is contained in:
parent
e77940565d
commit
5ab0f233a1
|
@@ -74,6 +74,9 @@ def setup_vocab(src_dir, dst_dir):
|
|||
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
||||
clusters = _read_clusters(src_dir / 'clusters.txt')
|
||||
probs = _read_probs(src_dir / 'words.sgt.prob')
|
||||
for word in clusters:
|
||||
if word not in probs:
|
||||
probs[word] = -17.0
|
||||
lexicon = []
|
||||
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
|
||||
entry = get_lex_props(word)
|
||||
|
|
Loading…
Reference in New Issue
Block a user