mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
* Exclude clusterings for words only seen 1 or 2 times, as their clusters are unreliable
This commit is contained in:
parent
cc4e395927
commit
693c5a1558
|
@@ -46,7 +46,10 @@ def _read_clusters(loc):
|
||||||
cluster, word, freq = line.split()
|
cluster, word, freq = line.split()
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
clusters[word] = cluster
|
# If the clusterer has only seen the word a few times, its cluster is
|
||||||
|
# unreliable.
|
||||||
|
if int(freq) >= 3:
|
||||||
|
clusters[word] = cluster
|
||||||
return clusters
|
return clusters
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user