Merge pull request #1968 from DuyguA/is_currency

New lexical feature is_currency
This commit is contained in:
Matthew Honnibal 2018-02-15 12:13:36 +01:00 committed by GitHub
commit 4cb861e080
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 38 additions and 9 deletions

View File

@ -18,9 +18,9 @@ cdef enum attr_id_t:
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG18 = 18
FLAG19
FLAG19 = 19
FLAG20
FLAG21
FLAG22

View File

@ -21,7 +21,7 @@ IDS = {
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"FLAG18": FLAG18,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,

View File

@ -69,6 +69,14 @@ def is_right_punct(text):
return text in right_punct
def is_currency(text):
    """Check whether a string consists solely of currency symbols.

    A character counts as a currency symbol when its Unicode general
    category is 'Sc' (Symbol, currency), e.g. '$', '£', '€', '¥', '¢'.

    text (unicode): The string to check.
    RETURNS (bool): True if `text` is non-empty and every character is a
        currency symbol, otherwise False.
    """
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    if not text:
        # An empty string contains no currency symbol; without this guard the
        # all-characters check below would be vacuously true.
        return False
    return all(unicodedata.category(char) == 'Sc' for char in text)
def like_email(text):
return bool(_like_email(text))
@ -164,5 +172,6 @@ LEX_ATTRS = {
attrs.IS_QUOTE: is_quote,
attrs.IS_LEFT_PUNCT: is_left_punct,
attrs.IS_RIGHT_PUNCT: is_right_punct,
attrs.IS_CURRENCY: is_currency,
attrs.LIKE_URL: like_url
}

View File

@ -12,7 +12,7 @@ import numpy
from .typedefs cimport attr_t, flags_t
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
from .attrs cimport PROB
from .attrs import intify_attrs
from . import about
@ -474,6 +474,14 @@ cdef class Lexeme:
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency:
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
# Getter: reports whether the IS_CURRENCY flag is set on this lexeme's
# underlying C data (self.c).
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
# Setter: writes the boolean `x` into the IS_CURRENCY flag on self.c.
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url:
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):

View File

@ -17,9 +17,9 @@ cdef enum symbol_t:
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
FLAG18 = 18
FLAG19
FLAG19 = 19
FLAG20
FLAG21
FLAG22

View File

@ -22,8 +22,8 @@ IDS = {
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG18": FLAG18,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ...attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
from ...lang.lex_attrs import is_punct, is_ascii, like_url, word_shape
from ...lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
import pytest
@ -37,6 +37,13 @@ def test_lex_attrs_is_ascii(text, match):
assert is_ascii(text) == match
# '♥' is a symbol that is NOT a currency sign (Unicode category 'So');
# '€' is a currency sign (Unicode category 'Sc').
@pytest.mark.parametrize('text,match', [
    ('$', True), ('£', True), ('♥', False),
    ('€', True), ('¥', True), ('¢', True),
    ('a', False), ('www.google.com', False), ('dog', False)])
def test_lex_attrs_is_currency(text, match):
    """is_currency accepts currency symbols and rejects other symbols and words."""
    assert is_currency(text) == match
@pytest.mark.parametrize('text,match', [
('www.google.com', True), ('google.com', True), ('sydney.com', True),
('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True),

View File

@ -15,7 +15,7 @@ from ..lexeme cimport Lexeme
from .. import parts_of_speech
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
@ -855,6 +855,11 @@ cdef class Token:
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
property is_currency:
"""RETURNS (bool): Whether the token is a currency symbol, e.g. $, €."""
# Getter: reports whether the IS_CURRENCY flag is set on the lexeme this
# token points to (self.c.lex). No setter is defined in this block.
def __get__(self):
return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)
property like_url:
"""RETURNS (bool): Whether the token resembles a URL."""
def __get__(self):