mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Merge pull request #1968 from DuyguA/is_currency
New lexical feature is_currency
This commit is contained in:
commit
4cb861e080
|
@ -18,9 +18,9 @@ cdef enum attr_id_t:
|
|||
IS_QUOTE
|
||||
IS_LEFT_PUNCT
|
||||
IS_RIGHT_PUNCT
|
||||
IS_CURRENCY
|
||||
|
||||
FLAG18 = 18
|
||||
FLAG19
|
||||
FLAG19 = 19
|
||||
FLAG20
|
||||
FLAG21
|
||||
FLAG22
|
||||
|
|
|
@ -21,7 +21,7 @@ IDS = {
|
|||
"IS_QUOTE": IS_QUOTE,
|
||||
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
||||
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
|
||||
"FLAG18": FLAG18,
|
||||
"IS_CURRENCY": IS_CURRENCY,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
"FLAG21": FLAG21,
|
||||
|
|
|
@ -69,6 +69,14 @@ def is_right_punct(text):
|
|||
return text in right_punct
|
||||
|
||||
|
||||
def is_currency(text):
    """Check whether `text` consists solely of Unicode currency symbols.

    text (unicode): The string to check.
    RETURNS (bool): True if every character has the Unicode general category
        'Sc' (e.g. $ or €). An empty string also yields True, because no
        character fails the check.
    """
    # can be overwritten by lang with list of currency words, e.g. dollar, euro
    return all(unicodedata.category(symbol) == 'Sc' for symbol in text)
|
||||
|
||||
|
||||
def like_email(text):
    """Report, as a plain bool, whether `_like_email` accepts `text`.

    text (unicode): The string to check.
    RETURNS (bool): The truth value of `_like_email(text)`.
    """
    if _like_email(text):
        return True
    return False
|
||||
|
||||
|
@ -164,5 +172,6 @@ LEX_ATTRS = {
|
|||
attrs.IS_QUOTE: is_quote,
|
||||
attrs.IS_LEFT_PUNCT: is_left_punct,
|
||||
attrs.IS_RIGHT_PUNCT: is_right_punct,
|
||||
attrs.IS_CURRENCY: is_currency,
|
||||
attrs.LIKE_URL: like_url
|
||||
}
|
||||
|
|
|
@ -12,7 +12,7 @@ import numpy
|
|||
from .typedefs cimport attr_t, flags_t
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_OOV
|
||||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
|
||||
from .attrs cimport PROB
|
||||
from .attrs import intify_attrs
|
||||
from . import about
|
||||
|
@ -474,6 +474,14 @@ cdef class Lexeme:
|
|||
def __set__(self, bint x):
|
||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
|
||||
property is_currency:
    """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
    def __get__(self):
        # Read the IS_CURRENCY flag from self.c.
        return Lexeme.c_check_flag(self.c, IS_CURRENCY)

    def __set__(self, bint x):
        # Write the IS_CURRENCY flag on self.c; x is the new flag value.
        Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||||
|
||||
property like_url:
|
||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||
def __get__(self):
|
||||
|
|
|
@ -17,9 +17,9 @@ cdef enum symbol_t:
|
|||
IS_QUOTE
|
||||
IS_LEFT_PUNCT
|
||||
IS_RIGHT_PUNCT
|
||||
IS_CURRENCY
|
||||
|
||||
FLAG18 = 18
|
||||
FLAG19
|
||||
FLAG19 = 19
|
||||
FLAG20
|
||||
FLAG21
|
||||
FLAG22
|
||||
|
|
|
@ -22,8 +22,8 @@ IDS = {
|
|||
"IS_QUOTE": IS_QUOTE,
|
||||
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
|
||||
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
|
||||
"IS_CURRENCY": IS_CURRENCY,
|
||||
|
||||
"FLAG18": FLAG18,
|
||||
"FLAG19": FLAG19,
|
||||
"FLAG20": FLAG20,
|
||||
"FLAG21": FLAG21,
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
|
||||
from ...lang.lex_attrs import is_punct, is_ascii, like_url, word_shape
|
||||
from ...lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -37,6 +37,13 @@ def test_lex_attrs_is_ascii(text, match):
|
|||
assert is_ascii(text) == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
    ('$', True),
    ('£', True),
    ('♥', False),
    ('€', True),
    ('¥', True),
    ('¢', True),
    ('a', False),
    ('www.google.com', False),
    ('dog', False),
])
def test_lex_attrs_is_currency(text, match):
    """is_currency must return `match` for each parametrised `text`."""
    result = is_currency(text)
    assert result == match
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,match', [
|
||||
('www.google.com', True), ('google.com', True), ('sydney.com', True),
|
||||
('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True),
|
||||
|
|
|
@ -15,7 +15,7 @@ from ..lexeme cimport Lexeme
|
|||
from .. import parts_of_speech
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT
|
||||
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||
from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM, LIKE_EMAIL
|
||||
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
|
||||
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
|
||||
from ..compat import is_config
|
||||
|
@ -855,6 +855,11 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||
|
||||
property is_currency:
    """RETURNS (bool): Whether the token is a currency symbol."""
    def __get__(self):
        # Read-only view: checks the IS_CURRENCY flag on self.c.lex;
        # no __set__ is defined for this property in the visible hunk.
        return Lexeme.c_check_flag(self.c.lex, IS_CURRENCY)
|
||||
|
||||
property like_url:
|
||||
"""RETURNS (bool): Whether the token resembles a URL."""
|
||||
def __get__(self):
|
||||
|
|
Loading…
Reference in New Issue
Block a user