* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects

This commit is contained in:
Matthew Honnibal 2015-07-26 16:37:16 +02:00
parent eeaea25f0c
commit 6bb96c122d
9 changed files with 120 additions and 42 deletions

View File

@ -1,5 +1,17 @@
# Reserve 64 values for flag features
cpdef enum attr_id_t:
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
FLAG0
FLAG1
FLAG2

View File

@ -1,8 +1,19 @@
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
from ..attrs cimport FLAG8, FLAG9, FLAG10, FLAG11, FLAG12, FLAG13, FLAG14
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
from ..attrs cimport IS_ALPHA as _IS_ALPHA
from ..attrs cimport IS_DIGIT as _IS_DIGIT
from ..attrs cimport IS_ASCII as _IS_ASCII
from ..attrs cimport IS_LOWER as _IS_LOWER
from ..attrs cimport IS_PUNCT as _IS_PUNCT
from ..attrs cimport IS_SPACE as _IS_SPACE
from ..attrs cimport IS_TITLE as _IS_TITLE
from ..attrs cimport IS_UPPER as _IS_UPPER
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
from ..attrs cimport LIKE_URL as _LIKE_URL
from ..attrs cimport LIKE_NUM as _LIKE_NUM
from ..attrs cimport IS_STOP as _IS_STOP
from ..attrs cimport ORTH as _ORTH
from ..attrs cimport SHAPE as _SHAPE
from ..attrs cimport LOWER as _LOWER
@ -20,42 +31,18 @@ from ..attrs cimport ENT_TYPE as _ENT_TYPE
cpdef enum:
IS_ALPHA = FLAG0
IS_ASCII = FLAG1
IS_DIGIT = FLAG2
IS_LOWER = FLAG3
IS_PUNCT = FLAG4
IS_SPACE = FLAG5
IS_TITLE = FLAG6
IS_UPPER = FLAG7
LIKE_URL = FLAG8
LIKE_NUM = FLAG9
IS_STOP = FLAG10
EMO_POS = FLAG11
EMO_NEG = FLAG12
EMO_ANGER = FLAG13
EMO_APATE = FLAG14
EMO_DISGUST = FLAG15
EMO_FEAR = FLAG16
EMO_JOY = FLAG17
EMO_SAD = FLAG18
EMO_SURPRISE = FLAG19
EMO_TRUST = FLAG20
CLR_NONE = FLAG21
CLR_BLACK = FLAG22
CLR_BLUE = FLAG23
CLR_BROWN = FLAG24
CLR_GREEN = FLAG25
CLR_GREY = FLAG26
CLR_ORANGE = FLAG27
CLR_PURPLE = FLAG28
CLR_PINK = FLAG29
CLR_RED = FLAG30
CLR_WHITE = FLAG31
CLR_YELLOW = FLAG32
IS_ALPHA = _IS_ALPHA
IS_ASCII = _IS_ASCII
IS_DIGIT = _IS_DIGIT
IS_LOWER = _IS_LOWER
IS_PUNCT = _IS_PUNCT
IS_SPACE = _IS_SPACE
IS_TITLE = _IS_TITLE
IS_UPPER = _IS_UPPER
LIKE_URL = _LIKE_URL
LIKE_NUM = _LIKE_NUM
LIKE_EMAIL = _LIKE_EMAIL
IS_STOP = _IS_STOP
ORTH = _ORTH
SHAPE = _SHAPE

View File

@ -1,6 +1,6 @@
# cython: embedsignature=True
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
from ..orth cimport is_title, is_upper, like_url, like_number
from ..orth cimport is_title, is_upper, like_url, like_number, like_email
from ..typedefs cimport flags_t
@ -16,4 +16,5 @@ def get_flags(unicode string):
flags |= is_upper(string) << IS_UPPER
flags |= like_url(string) << LIKE_URL
flags |= like_number(string) << LIKE_NUM
flags |= like_email(string) << LIKE_EMAIL
return flags

View File

@ -72,7 +72,7 @@ cdef class Lexeme:
py.sentiment = ptr.sentiment
return py
cpdef bint check(self, attr_id_t flag_id) except -1
cpdef bint check_flag(self, attr_id_t flag_id) except -1
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:

View File

@ -9,6 +9,9 @@ from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@ -44,5 +47,36 @@ cdef class Lexeme:
# Report whether this object's l2_norm is nonzero.
# NOTE(review): presumably a zero norm means no vector was loaded for the
# entry -- confirm against the code that fills in l2_norm.
def has_repvec(self):
return self.l2_norm != 0
cpdef bint check(self, attr_id_t flag_id) except -1:
return self.flags & (1 << flag_id)
# Report whether the bit numbered `flag_id` is set in this object's `flags`.
# The `except -1` clause lets Python exceptions propagate through the
# C-level bint return type.
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
# Type the constant 1 as flags_t so the shift below is evaluated on a
# flags_t operand instead of an untyped integer literal.
# NOTE(review): the width of flags_t is declared in typedefs -- confirm it
# covers every attr_id_t flag value.
cdef flags_t one = 1
# Nonzero exactly when bit `flag_id` of self.flags is set.
return self.flags & (one << flag_id)
# Read-only boolean views of individual bits in this lexeme's flags.
# Every getter hands one flag constant cimported from attrs to
# self.check_flag, which tests that bit of self.flags.
property is_alpha:
    def __get__(self):
        return self.check_flag(IS_ALPHA)

property is_ascii:
    def __get__(self):
        return self.check_flag(IS_ASCII)

property is_digit:
    def __get__(self):
        return self.check_flag(IS_DIGIT)

property is_lower:
    def __get__(self):
        return self.check_flag(IS_LOWER)

property is_title:
    def __get__(self):
        return self.check_flag(IS_TITLE)

# IS_UPPER is cimported alongside the other flags but previously had no
# accessor; expose it so every IS_ flag is reachable from Python.
property is_upper:
    def __get__(self):
        return self.check_flag(IS_UPPER)

property is_punct:
    def __get__(self):
        return self.check_flag(IS_PUNCT)

property is_space:
    def __get__(self):
        return self.check_flag(IS_SPACE)

property like_url:
    def __get__(self):
        return self.check_flag(LIKE_URL)

property like_num:
    def __get__(self):
        return self.check_flag(LIKE_NUM)

property like_email:
    def __get__(self):
        return self.check_flag(LIKE_EMAIL)

# IS_STOP is cimported alongside the other flags but previously had no
# accessor.
# NOTE(review): confirm which code sets the IS_STOP bit; it is not visible
# from this section.
property is_stop:
    def __get__(self):
        return self.check_flag(IS_STOP)

View File

@ -6,6 +6,7 @@ cpdef bint is_ascii(unicode string)
cpdef bint is_title(unicode string)
cpdef bint is_lower(unicode string)
cpdef bint is_upper(unicode string)
cpdef bint like_email(unicode string)
cpdef bint like_url(unicode string)
cpdef bint like_number(unicode string)
cpdef unicode word_shape(unicode string)

View File

@ -111,6 +111,11 @@ cpdef bint like_number(unicode string):
return False
# Bound `match` of the compiled e-mail pattern. `match` anchors at the start
# of the string only, so text following a valid address is still accepted.
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match


cpdef bint like_email(unicode string):
    """Return True if `string` begins with something shaped like an e-mail
    address (local part, '@', domain containing at least one dot)."""
    # A match object is always truthy and a failed match is None, so the
    # explicit None test yields the same boolean as coercing the result.
    return _like_email(string) is not None
cpdef unicode word_shape(unicode string):
if len(string) >= 100:
return 'LONG'

View File

@ -16,6 +16,11 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
from ..attrs cimport POS, LEMMA, TAG, DEP
from ..parts_of_speech cimport CONJ, PUNCT
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
@ -282,4 +287,35 @@ cdef class Token:
return self.vocab.strings[self.c.dep]
# Read-only boolean views of the flag bits on this token's lexeme.
# Every getter hands self.c.lex and one flag constant cimported from attrs
# to the module-level check_flag helper.
property is_alpha:
    def __get__(self):
        return check_flag(self.c.lex, IS_ALPHA)

property is_ascii:
    def __get__(self):
        return check_flag(self.c.lex, IS_ASCII)

property is_digit:
    def __get__(self):
        return check_flag(self.c.lex, IS_DIGIT)

property is_lower:
    def __get__(self):
        return check_flag(self.c.lex, IS_LOWER)

property is_title:
    def __get__(self):
        return check_flag(self.c.lex, IS_TITLE)

# IS_UPPER is cimported alongside the other flags but previously had no
# accessor; expose it so every IS_ flag is reachable from Python.
property is_upper:
    def __get__(self):
        return check_flag(self.c.lex, IS_UPPER)

property is_punct:
    def __get__(self):
        return check_flag(self.c.lex, IS_PUNCT)

property is_space:
    def __get__(self):
        return check_flag(self.c.lex, IS_SPACE)

property like_url:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_URL)

property like_num:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_NUM)

property like_email:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_EMAIL)

# IS_STOP is cimported alongside the other flags but previously had no
# accessor.
# NOTE(review): confirm which code sets the IS_STOP bit; it is not visible
# from this section.
property is_stop:
    def __get__(self):
        return check_flag(self.c.lex, IS_STOP)
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@ -38,6 +38,8 @@ cdef class Vocab:
'''
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
pos_tags=None, oov_prob=-30):
if oov_prob is None:
oov_prob = -30
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()