mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable
This commit is contained in:
parent
cc4e395927
commit
693c5a1558
|
@@ -46,7 +46,10 @@ def _read_clusters(loc):
|
|||
cluster, word, freq = line.split()
|
||||
except ValueError:
|
||||
continue
|
||||
clusters[word] = cluster
|
||||
# If the clusterer has only seen the word a few times, its cluster is
|
||||
# unreliable.
|
||||
if int(freq) >= 3:
|
||||
clusters[word] = cluster
|
||||
return clusters
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user