* Implement is_number

This commit is contained in:
Matthew Honnibal 2014-11-01 19:13:24 +11:00
parent f685218e21
commit 5484fbea69

View File

@ -49,6 +49,7 @@ def is_lower(string):
def is_upper(string):
    """Return True when ``string`` is entirely upper-case.

    Delegates to the string's own ``isupper`` method, so a string with no
    cased characters (including the empty string) yields False.
    """
    all_caps = string.isupper()
    return all_caps
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
"name|pro|tel|travel|xxx|"
"ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
@ -63,6 +64,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
"tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
"wf|ws|ye|yt|za|zm|zw".split('|'))
def is_urlish(string):
# We're looking for things that function in text like URLs. So, valid URL
# or not, anything that says http:// is going to be good.
@ -82,13 +84,23 @@ def is_urlish(string):
return False
# Spelled-out number words that is_number() accepts as numeric tokens.
# NOTE: adjacent string literals are joined with no separator, so every
# fragment except the last must end with a space. Without it, the last word
# of one line fuses with the first word of the next (e.g. "teneleven") and
# neither real word ends up in the set.
NUM_WORDS = set('zero one two three four five six seven eight nine ten '
                'eleven twelve thirteen fourteen fifteen sixteen seventeen '
                'eighteen nineteen twenty thirty forty fifty sixty seventy '
                'eighty ninety hundred thousand million billion trillion '
                'quadrillion gajillion bazillion'.split())
def is_number(string):
    """Return True if ``string`` looks like a number, else False.

    A token counts as a number when, after dropping every ',' and '.'
    (thousands separators and decimal points), it is one of:

    * a run of digits, e.g. "1,000" or "3.14";
    * a fraction whose two sides are themselves numbers, e.g. "3/4";
    * a spelled-out number word listed in NUM_WORDS, e.g. "twenty".

    The NUM_WORDS lookup is an exact match on the token as given.
    """
    # Separators carry no information for this test, so strip them up front.
    string = string.replace(',', '').replace('.', '')
    if string.isdigit():
        return True
    # Exactly one slash: treat as numerator/denominator and test each side.
    if string.count('/') == 1:
        num, denom = string.split('/')
        if is_number(num) and is_number(denom):
            return True
    return string in NUM_WORDS


def is_emoticon(string):
    # Placeholder kept so the name stays defined: no logic yet, returns None.
    pass
# Statistics features
def oft_case(name, thresh):