* Exclude cluster assignments for words seen only 1 or 2 times, as their clusters are unreliable

This commit is contained in:
Matthew Honnibal 2015-04-17 04:44:52 +02:00
parent cc4e395927
commit 693c5a1558

View File

@ -46,7 +46,10 @@ def _read_clusters(loc):
cluster, word, freq = line.split() cluster, word, freq = line.split()
except ValueError: except ValueError:
continue continue
clusters[word] = cluster # If the clusterer has only seen the word a few times, its cluster is
# unreliable.
if int(freq) >= 3:
clusters[word] = cluster
return clusters return clusters