mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-11 08:42:28 +03:00
* Implement is_number
This commit is contained in:
parent
f685218e21
commit
5484fbea69
|
@ -49,6 +49,7 @@ def is_lower(string):
|
||||||
def is_upper(string):
    """Report whether `string` is upper-case, as judged by `str.isupper`."""
    all_upper = string.isupper()
    return all_upper
|
||||||
|
|
||||||
|
|
||||||
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
|
TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|"
|
||||||
"name|pro|tel|travel|xxx|"
|
"name|pro|tel|travel|xxx|"
|
||||||
"ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
|
"ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|"
|
||||||
|
@ -63,6 +64,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
|
||||||
"tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
|
"tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|"
|
||||||
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
||||||
|
|
||||||
|
|
||||||
def is_urlish(string):
|
def is_urlish(string):
|
||||||
# We're looking for things that function in text like URLs. So, valid URL
|
# We're looking for things that function in text like URLs. So, valid URL
|
||||||
# or not, anything they say http:// is going to be good.
|
# or not, anything they say http:// is going to be good.
|
||||||
|
@ -82,13 +84,23 @@ def is_urlish(string):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
||||||
|
'eleven twelve thirteen fourteen fifteen sixteen seventeen'
|
||||||
|
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
||||||
|
'eighty ninety hundred thousand million billion trillion'
|
||||||
|
'quadrillion gajillion bazillion'.split())
|
||||||
def is_number(string):
|
def is_number(string):
|
||||||
pass
|
string = string.replace(',', '')
|
||||||
|
string = string.replace('.', '')
|
||||||
|
if string.isdigit():
|
||||||
def is_emoticon(string):
|
return True
|
||||||
pass
|
if string.count('/') == 1:
|
||||||
|
num, denom = string.split('/')
|
||||||
|
if is_number(num) and is_number(denom):
|
||||||
|
return True
|
||||||
|
if string in NUM_WORDS:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
# Statistics features
|
# Statistics features
|
||||||
def oft_case(name, thresh):
|
def oft_case(name, thresh):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user