mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects
This commit is contained in:
parent
eeaea25f0c
commit
6bb96c122d
|
@ -1,5 +1,17 @@
|
|||
# Reserve 64 values for flag features
|
||||
cpdef enum attr_id_t:
|
||||
IS_ALPHA
|
||||
IS_ASCII
|
||||
IS_DIGIT
|
||||
IS_LOWER
|
||||
IS_PUNCT
|
||||
IS_SPACE
|
||||
IS_TITLE
|
||||
IS_UPPER
|
||||
LIKE_URL
|
||||
LIKE_NUM
|
||||
LIKE_EMAIL
|
||||
IS_STOP
|
||||
FLAG0
|
||||
FLAG1
|
||||
FLAG2
|
||||
|
|
|
@ -1,8 +1,19 @@
|
|||
from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7
|
||||
from ..attrs cimport FLAG8, FLAG9, FLAG10, FLAG11, FLAG12, FLAG13, FLAG14
|
||||
from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14
|
||||
from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21
|
||||
from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28
|
||||
from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32
|
||||
from ..attrs cimport IS_ALPHA as _IS_ALPHA
|
||||
from ..attrs cimport IS_DIGIT as _IS_DIGIT
|
||||
from ..attrs cimport IS_ASCII as _IS_ASCII
|
||||
from ..attrs cimport IS_LOWER as _IS_LOWER
|
||||
from ..attrs cimport IS_PUNCT as _IS_PUNCT
|
||||
from ..attrs cimport IS_SPACE as _IS_SPACE
|
||||
from ..attrs cimport IS_TITLE as _IS_TITLE
|
||||
from ..attrs cimport IS_UPPER as _IS_UPPER
|
||||
from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL
|
||||
from ..attrs cimport LIKE_URL as _LIKE_URL
|
||||
from ..attrs cimport LIKE_NUM as _LIKE_NUM
|
||||
from ..attrs cimport IS_STOP as _IS_STOP
|
||||
from ..attrs cimport ORTH as _ORTH
|
||||
from ..attrs cimport SHAPE as _SHAPE
|
||||
from ..attrs cimport LOWER as _LOWER
|
||||
|
@ -20,43 +31,19 @@ from ..attrs cimport ENT_TYPE as _ENT_TYPE
|
|||
|
||||
|
||||
cpdef enum:
|
||||
IS_ALPHA = FLAG0
|
||||
IS_ASCII = FLAG1
|
||||
IS_DIGIT = FLAG2
|
||||
IS_LOWER = FLAG3
|
||||
IS_PUNCT = FLAG4
|
||||
IS_SPACE = FLAG5
|
||||
IS_TITLE = FLAG6
|
||||
IS_UPPER = FLAG7
|
||||
LIKE_URL = FLAG8
|
||||
LIKE_NUM = FLAG9
|
||||
IS_STOP = FLAG10
|
||||
IS_ALPHA = _IS_ALPHA
|
||||
IS_ASCII = _IS_ASCII
|
||||
IS_DIGIT = _IS_DIGIT
|
||||
IS_LOWER = _IS_LOWER
|
||||
IS_PUNCT = _IS_PUNCT
|
||||
IS_SPACE = _IS_SPACE
|
||||
IS_TITLE = _IS_TITLE
|
||||
IS_UPPER = _IS_UPPER
|
||||
LIKE_URL = _LIKE_URL
|
||||
LIKE_NUM = _LIKE_NUM
|
||||
LIKE_EMAIL = _LIKE_EMAIL
|
||||
IS_STOP = _IS_STOP
|
||||
|
||||
EMO_POS = FLAG11
|
||||
EMO_NEG = FLAG12
|
||||
|
||||
EMO_ANGER = FLAG13
|
||||
EMO_APATE = FLAG14
|
||||
EMO_DISGUST = FLAG15
|
||||
EMO_FEAR = FLAG16
|
||||
EMO_JOY = FLAG17
|
||||
EMO_SAD = FLAG18
|
||||
EMO_SURPRISE = FLAG19
|
||||
EMO_TRUST = FLAG20
|
||||
|
||||
CLR_NONE = FLAG21
|
||||
CLR_BLACK = FLAG22
|
||||
CLR_BLUE = FLAG23
|
||||
CLR_BROWN = FLAG24
|
||||
CLR_GREEN = FLAG25
|
||||
CLR_GREY = FLAG26
|
||||
CLR_ORANGE = FLAG27
|
||||
CLR_PURPLE = FLAG28
|
||||
CLR_PINK = FLAG29
|
||||
CLR_RED = FLAG30
|
||||
CLR_WHITE = FLAG31
|
||||
CLR_YELLOW = FLAG32
|
||||
|
||||
ORTH = _ORTH
|
||||
SHAPE = _SHAPE
|
||||
LOWER = _LOWER
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# cython: embedsignature=True
|
||||
from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space
|
||||
from ..orth cimport is_title, is_upper, like_url, like_number
|
||||
from ..orth cimport is_title, is_upper, like_url, like_number, like_email
|
||||
from ..typedefs cimport flags_t
|
||||
|
||||
|
||||
|
@ -16,4 +16,5 @@ def get_flags(unicode string):
|
|||
flags |= is_upper(string) << IS_UPPER
|
||||
flags |= like_url(string) << LIKE_URL
|
||||
flags |= like_number(string) << LIKE_NUM
|
||||
flags |= like_email(string) << LIKE_EMAIL
|
||||
return flags
|
||||
|
|
|
@ -72,7 +72,7 @@ cdef class Lexeme:
|
|||
py.sentiment = ptr.sentiment
|
||||
return py
|
||||
|
||||
cpdef bint check(self, attr_id_t flag_id) except -1
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1
|
||||
|
||||
|
||||
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
||||
|
|
|
@ -9,6 +9,9 @@ from .orth cimport word_shape
|
|||
from .typedefs cimport attr_t, flags_t
|
||||
import numpy
|
||||
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
||||
|
@ -44,5 +47,36 @@ cdef class Lexeme:
|
|||
def has_repvec(self):
    """Report whether ``self.l2_norm`` is non-zero."""
    norm_is_nonzero = self.l2_norm != 0
    return norm_is_nonzero
|
||||
|
||||
cpdef bint check(self, attr_id_t flag_id) except -1:
|
||||
return self.flags & (1 << flag_id)
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
    """Test the bit of ``self.flags`` selected by ``flag_id``.

    The mask starts from a ``flags_t``-typed 1, so the shift is applied to
    a ``flags_t`` value rather than to a bare integer literal.

    Returns the bitwise AND of ``self.flags`` and the mask, coerced to
    ``bint``.
    """
    cdef flags_t mask = 1
    mask <<= flag_id
    return self.flags & mask
|
||||
|
||||
# Read-only boolean views over the lexeme's flag bits. Each getter passes the
# matching attribute ID to ``self.check_flag``.
# NOTE(review): IS_UPPER and IS_STOP are cimported in this module, but no
# ``is_upper`` / ``is_stop`` property is visible here -- confirm whether they
# are defined elsewhere or were left out.
property is_alpha:
    def __get__(self):
        return self.check_flag(IS_ALPHA)

property is_ascii:
    def __get__(self):
        return self.check_flag(IS_ASCII)

property is_digit:
    def __get__(self):
        return self.check_flag(IS_DIGIT)

property is_lower:
    def __get__(self):
        return self.check_flag(IS_LOWER)

property is_title:
    def __get__(self):
        return self.check_flag(IS_TITLE)

property is_punct:
    def __get__(self):
        return self.check_flag(IS_PUNCT)

property is_space:
    def __get__(self):
        return self.check_flag(IS_SPACE)

property like_url:
    def __get__(self):
        return self.check_flag(LIKE_URL)

property like_num:
    def __get__(self):
        return self.check_flag(LIKE_NUM)

property like_email:
    def __get__(self):
        return self.check_flag(LIKE_EMAIL)
|
||||
|
|
|
@ -6,6 +6,7 @@ cpdef bint is_ascii(unicode string)
|
|||
cpdef bint is_title(unicode string)
|
||||
cpdef bint is_lower(unicode string)
|
||||
cpdef bint is_upper(unicode string)
|
||||
cpdef bint like_email(unicode string)
|
||||
cpdef bint like_url(unicode string)
|
||||
cpdef bint like_number(unicode string)
|
||||
cpdef unicode word_shape(unicode string)
|
||||
|
|
|
@ -111,6 +111,11 @@ cpdef bint like_number(unicode string):
|
|||
return False
|
||||
|
||||
|
||||
# Bound ``match`` of the compiled pattern: anchored at the start of the string
# only, so trailing characters after the matched address are not rejected.
_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match


cpdef bint like_email(unicode string):
    """Return True if ``string`` begins with something shaped like an e-mail
    address (``local@domain.tld``), and False otherwise."""
    found = _like_email(string)
    return found is not None
|
||||
|
||||
|
||||
cpdef unicode word_shape(unicode string):
|
||||
if len(string) >= 100:
|
||||
return 'LONG'
|
||||
|
|
|
@ -16,6 +16,11 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST
|
|||
from ..attrs cimport POS, LEMMA, TAG, DEP
|
||||
from ..parts_of_speech cimport CONJ, PUNCT
|
||||
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
|
||||
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||
|
@ -281,5 +286,36 @@ cdef class Token:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
|
||||
|
||||
# Read-only boolean views over the flag bits of the token's lexeme. Each getter
# hands ``self.c.lex`` and the matching attribute ID to ``check_flag``.
# NOTE(review): IS_UPPER and IS_STOP are cimported in this module, but no
# ``is_upper`` / ``is_stop`` property is visible here -- confirm whether they
# are defined elsewhere or were left out.
property is_alpha:
    def __get__(self):
        return check_flag(self.c.lex, IS_ALPHA)

property is_ascii:
    def __get__(self):
        return check_flag(self.c.lex, IS_ASCII)

property is_digit:
    def __get__(self):
        return check_flag(self.c.lex, IS_DIGIT)

property is_lower:
    def __get__(self):
        return check_flag(self.c.lex, IS_LOWER)

property is_title:
    def __get__(self):
        return check_flag(self.c.lex, IS_TITLE)

property is_punct:
    def __get__(self):
        return check_flag(self.c.lex, IS_PUNCT)

property is_space:
    def __get__(self):
        return check_flag(self.c.lex, IS_SPACE)

property like_url:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_URL)

property like_num:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_NUM)

property like_email:
    def __get__(self):
        return check_flag(self.c.lex, LIKE_EMAIL)
|
||||
|
||||
|
||||
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||
|
|
|
@ -38,6 +38,8 @@ cdef class Vocab:
|
|||
'''
|
||||
def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
|
||||
pos_tags=None, oov_prob=-30):
|
||||
if oov_prob is None:
|
||||
oov_prob = -30
|
||||
self.mem = Pool()
|
||||
self._by_hash = PreshMap()
|
||||
self._by_orth = PreshMap()
|
||||
|
|
Loading…
Reference in New Issue
Block a user