* Add LIKE_URL and LIKE_NUMBER flag features

This commit is contained in:
Matthew Honnibal 2014-11-02 13:19:05 +11:00
parent c414d0eebe
commit 8335706321
3 changed files with 9 additions and 3 deletions

View File

@ -16,6 +16,9 @@ cpdef enum:
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUMBER
OFT_LOWER
OFT_TITLE
OFT_UPPER

View File

@ -23,6 +23,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
flags |= orth.is_space(string) << IS_SPACE
flags |= orth.is_title(string) << IS_TITLE
flags |= orth.is_upper(string) << IS_UPPER
flags |= orth.like_url(string) << LIKE_URL
flags |= orth.like_number(string) << LIKE_NUMBER
return flags

View File

@ -65,7 +65,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
"wf|ws|ye|yt|za|zm|zw".split('|'))
def is_urlish(string):
def like_url(string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything that says http:// is going to be good.
if string.startswith('http://'):
@ -89,14 +89,14 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
'eighteen nineteen twenty thirty forty fifty sixty seventy'
'eighty ninety hundred thousand million billion trillion'
'quadrillion gajillion bazillion'.split())
def is_number(string):
def like_number(string):
string = string.replace(',', '')
string = string.replace('.', '')
if string.isdigit():
return True
if string.count('/') == 1:
num, denom = string.split('/')
if is_number(num) and is_number(denom):
if like_number(num) and like_number(denom):
return True
if string in NUM_WORDS:
return True