From 8335706321158cf47aec2d11f562186914f71b09 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 2 Nov 2014 13:19:05 +1100 Subject: [PATCH] * Add LIKE_URL and LIKE_NUMBER flag features --- spacy/lexeme.pxd | 3 +++ spacy/lexeme.pyx | 3 +++ spacy/orth.py | 6 +++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 76b236e5b..288fd6375 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -16,6 +16,9 @@ cpdef enum: IS_TITLE IS_UPPER + LIKE_URL + LIKE_NUMBER + OFT_LOWER OFT_TITLE OFT_UPPER diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6616cda47..c7c9e8334 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -23,6 +23,9 @@ def get_flags(unicode string, float upper_pc, float title_pc, float lower_pc): flags |= orth.is_space(string) << IS_SPACE flags |= orth.is_title(string) << IS_TITLE flags |= orth.is_upper(string) << IS_UPPER + + flags |= orth.like_url(string) << LIKE_URL + flags |= orth.like_number(string) << LIKE_NUMBER return flags diff --git a/spacy/orth.py b/spacy/orth.py index 10a5df66c..e27ef9119 100644 --- a/spacy/orth.py +++ b/spacy/orth.py @@ -65,7 +65,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu "wf|ws|ye|yt|za|zm|zw".split('|')) -def is_urlish(string): +def like_url(string): # We're looking for things that function in text like URLs. So, valid URL # or not, anything they say http:// is going to be good. if string.startswith('http://'): @@ -89,14 +89,14 @@ NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eighteen nineteen twenty thirty forty fifty sixty seventy' 'eighty ninety hundred thousand million billion trillion' 'quadrillion gajillion bazillion'.split()) -def is_number(string): +def like_number(string): string = string.replace(',', '') string = string.replace('.', '') if string.isdigit(): return True if string.count('/') == 1: num, denom = string.split('/') - if is_number(num) and is_number(denom): + if like_number(num) and like_number(denom): return True if string in NUM_WORDS: return True