mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
* Add LIKE_URL and LIKE_NUMBER flag features
This commit is contained in:
parent
c414d0eebe
commit
8335706321
|
@ -16,6 +16,9 @@ cpdef enum:
|
||||||
IS_TITLE
|
IS_TITLE
|
||||||
IS_UPPER
|
IS_UPPER
|
||||||
|
|
||||||
|
LIKE_URL
|
||||||
|
LIKE_NUMBER
|
||||||
|
|
||||||
OFT_LOWER
|
OFT_LOWER
|
||||||
OFT_TITLE
|
OFT_TITLE
|
||||||
OFT_UPPER
|
OFT_UPPER
|
||||||
|
|
|
@ -23,6 +23,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
||||||
flags |= orth.is_space(string) << IS_SPACE
|
flags |= orth.is_space(string) << IS_SPACE
|
||||||
flags |= orth.is_title(string) << IS_TITLE
|
flags |= orth.is_title(string) << IS_TITLE
|
||||||
flags |= orth.is_upper(string) << IS_UPPER
|
flags |= orth.is_upper(string) << IS_UPPER
|
||||||
|
|
||||||
|
flags |= orth.like_url(string) << LIKE_URL
|
||||||
|
flags |= orth.like_number(string) << LIKE_NUMBER
|
||||||
return flags
|
return flags
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
|
||||||
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
||||||
|
|
||||||
|
|
||||||
def is_urlish(string):
|
def like_url(string):
|
||||||
# We're looking for things that function in text like URLs. So, valid URL
|
# We're looking for things that function in text like URLs. So, valid URL
|
||||||
# or not, anything that says http:// is going to be good.
|
# or not, anything that says http:// is going to be good.
|
||||||
if string.startswith('http://'):
|
if string.startswith('http://'):
|
||||||
|
@ -89,14 +89,14 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
||||||
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
||||||
'eighty ninety hundred thousand million billion trillion'
|
'eighty ninety hundred thousand million billion trillion'
|
||||||
'quadrillion gajillion bazillion'.split())
|
'quadrillion gajillion bazillion'.split())
|
||||||
def is_number(string):
|
def like_number(string):
|
||||||
string = string.replace(',', '')
|
string = string.replace(',', '')
|
||||||
string = string.replace('.', '')
|
string = string.replace('.', '')
|
||||||
if string.isdigit():
|
if string.isdigit():
|
||||||
return True
|
return True
|
||||||
if string.count('/') == 1:
|
if string.count('/') == 1:
|
||||||
num, denom = string.split('/')
|
num, denom = string.split('/')
|
||||||
if is_number(num) and is_number(denom):
|
if like_number(num) and like_number(denom):
|
||||||
return True
|
return True
|
||||||
if string in NUM_WORDS:
|
if string in NUM_WORDS:
|
||||||
return True
|
return True
|
||||||
|
|
Loading…
Reference in New Issue
Block a user