def is_currency(text):
    """Check whether a string consists solely of currency symbols.

    Every character must belong to the Unicode general category 'Sc'
    (Symbol, currency), e.g. "$", "€" or "£". An empty string contains no
    currency symbol at all and is therefore rejected.

    text (unicode): The string to check.
    RETURNS (bool): True if the string is non-empty and every character in
        it is a currency symbol.
    """
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    if not text:
        # all() over an empty string is vacuously True, so reject it
        # explicitly instead of reporting "" as a currency symbol.
        return False
    return all(unicodedata.category(char) == 'Sc' for char in text)
@pytest.mark.parametrize('text,match', [
    ('$', True),
    ('£', True),
    ('♥', False),
    ('€', True),
    ('¥', True),
    ('¢', True),
    ('a', False),
    ('www.google.com', False),
    ('dog', False),
])
def test_lex_attrs_is_currency(text, match):
    """Currency symbols are flagged; other symbols, letters and words are not."""
    result = is_currency(text)
    assert result == match
property is_currency:
    """Read-only flag exposing the IS_CURRENCY attribute of the token's
    underlying lexeme.

    RETURNS (bool): Whether the token is a currency symbol.
    """
    def __get__(self):
        # The flag is stored on the lexeme, so look it up there.
        has_flag = Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)
        return has_flag