mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Add LIKE_URL and LIKE_NUMBER flag features
This commit is contained in:
parent
c414d0eebe
commit
8335706321
|
@ -16,6 +16,9 @@ cpdef enum:
|
|||
IS_TITLE
|
||||
IS_UPPER
|
||||
|
||||
LIKE_URL
|
||||
LIKE_NUMBER
|
||||
|
||||
OFT_LOWER
|
||||
OFT_TITLE
|
||||
OFT_UPPER
|
||||
|
|
|
@ -23,6 +23,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc):
|
|||
flags |= orth.is_space(string) << IS_SPACE
|
||||
flags |= orth.is_title(string) << IS_TITLE
|
||||
flags |= orth.is_upper(string) << IS_UPPER
|
||||
|
||||
flags |= orth.like_url(string) << LIKE_URL
|
||||
flags |= orth.like_number(string) << LIKE_NUMBER
|
||||
return flags
|
||||
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu
|
|||
"wf|ws|ye|yt|za|zm|zw".split('|'))
|
||||
|
||||
|
||||
def is_urlish(string):
|
||||
def like_url(string):
|
||||
# We're looking for things that function in text like URLs. So, valid URL
|
||||
# or not, anything that starts with http:// is going to be good.
|
||||
if string.startswith('http://'):
|
||||
|
@ -89,14 +89,14 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten'
|
|||
'eighteen nineteen twenty thirty forty fifty sixty seventy'
|
||||
'eighty ninety hundred thousand million billion trillion'
|
||||
'quadrillion gajillion bazillion'.split())
|
||||
def is_number(string):
|
||||
def like_number(string):
|
||||
string = string.replace(',', '')
|
||||
string = string.replace('.', '')
|
||||
if string.isdigit():
|
||||
return True
|
||||
if string.count('/') == 1:
|
||||
num, denom = string.split('/')
|
||||
if is_number(num) and is_number(denom):
|
||||
if like_number(num) and like_number(denom):
|
||||
return True
|
||||
if string in NUM_WORDS:
|
||||
return True
|
||||
|
|
Loading…
Reference in New Issue
Block a user