* Filter out phrases that consist of common, lower-case words.

This commit is contained in:
Matthew Honnibal 2015-10-09 12:47:43 +11:00
parent 4bbc8f45c6
commit 5af4b62fe7

View File

@@ -45,6 +45,8 @@ def read_gazetteer(tokenizer, loc, n=-1):
if i >= n:
break
phrase = tokenizer(phrase)
if all((t.is_lower and t.prob >= -10) for t in phrase):
continue
if len(phrase) >= 2:
yield phrase