From ab93898ac6369f623ce419ed768eb493e4d35c1b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 26 Jul 2015 00:22:19 +0200
Subject: [PATCH] * Make heuristics more explicit in init_model

---
 bin/init_model.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index 4083f4a73..91b02f607 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -89,7 +89,7 @@ def _read_probs(loc):
     return probs, probs['-OOV-']
 
 
-def _read_freqs(loc):
+def _read_freqs(loc, max_length=100, min_doc_freq=5, min_freq=10):
     if not loc.exists():
         print("Warning: Frequencies file not found")
         return None
@@ -105,7 +105,9 @@ def _read_freqs(loc):
     probs = {}
     for line in loc.open():
         freq, doc_freq, key = line.split('\t', 2)
-        if int(doc_freq) >= 2 and int(freq) >= 5 and len(key) < 200:
+        doc_freq = int(doc_freq)
+        freq = int(freq)
+        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
             word = literal_eval(key)
             smooth_count = counts.smoother(int(freq))
             log_smooth_count = math.log(smooth_count)