Mirror of https://github.com/explosion/spaCy.git (last synced 2025-10-30 23:47:31 +03:00)
			
		
		
		
	* Host the IS_* flags in attrs.pxd and add properties for them on Token and Lexeme objects
This commit is contained in:
		
		parent eeaea25f0c
		commit 6bb96c122d
					
				|  | @ -1,5 +1,17 @@ | |||
| # Reserve 64 values for flag features | ||||
| cpdef enum attr_id_t: | ||||
|     IS_ALPHA | ||||
|     IS_ASCII | ||||
|     IS_DIGIT | ||||
|     IS_LOWER | ||||
|     IS_PUNCT | ||||
|     IS_SPACE | ||||
|     IS_TITLE | ||||
|     IS_UPPER | ||||
|     LIKE_URL | ||||
|     LIKE_NUM | ||||
|     LIKE_EMAIL | ||||
|     IS_STOP | ||||
|     FLAG0 | ||||
|     FLAG1 | ||||
|     FLAG2 | ||||
|  |  | |||
|  | @ -1,8 +1,19 @@ | |||
| from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7 | ||||
| from ..attrs cimport FLAG8, FLAG9, FLAG10, FLAG11, FLAG12, FLAG13, FLAG14 | ||||
| from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14 | ||||
| from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21 | ||||
| from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28 | ||||
| from ..attrs cimport FLAG29,  FLAG30, FLAG31, FLAG32 | ||||
| from ..attrs cimport IS_ALPHA as _IS_ALPHA | ||||
| from ..attrs cimport IS_DIGIT as _IS_DIGIT | ||||
| from ..attrs cimport IS_ASCII as _IS_ASCII | ||||
| from ..attrs cimport IS_LOWER as _IS_LOWER | ||||
| from ..attrs cimport IS_PUNCT as _IS_PUNCT | ||||
| from ..attrs cimport IS_SPACE as _IS_SPACE | ||||
| from ..attrs cimport IS_TITLE as _IS_TITLE | ||||
| from ..attrs cimport IS_UPPER as _IS_UPPER | ||||
| from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL | ||||
| from ..attrs cimport LIKE_URL as _LIKE_URL | ||||
| from ..attrs cimport LIKE_NUM as _LIKE_NUM | ||||
| from ..attrs cimport IS_STOP as _IS_STOP | ||||
| from ..attrs cimport ORTH as _ORTH | ||||
| from ..attrs cimport SHAPE as _SHAPE | ||||
| from ..attrs cimport LOWER as _LOWER | ||||
|  | @ -20,43 +31,19 @@ from ..attrs cimport ENT_TYPE as _ENT_TYPE | |||
| 
 | ||||
| 
 | ||||
| cpdef enum: | ||||
|     IS_ALPHA = FLAG0 | ||||
|     IS_ASCII = FLAG1 | ||||
|     IS_DIGIT = FLAG2 | ||||
|     IS_LOWER = FLAG3 | ||||
|     IS_PUNCT = FLAG4 | ||||
|     IS_SPACE = FLAG5 | ||||
|     IS_TITLE = FLAG6 | ||||
|     IS_UPPER = FLAG7 | ||||
|     LIKE_URL = FLAG8 | ||||
|     LIKE_NUM = FLAG9 | ||||
|     IS_STOP = FLAG10 | ||||
|     IS_ALPHA = _IS_ALPHA | ||||
|     IS_ASCII = _IS_ASCII | ||||
|     IS_DIGIT = _IS_DIGIT | ||||
|     IS_LOWER = _IS_LOWER | ||||
|     IS_PUNCT = _IS_PUNCT | ||||
|     IS_SPACE = _IS_SPACE | ||||
|     IS_TITLE = _IS_TITLE | ||||
|     IS_UPPER = _IS_UPPER | ||||
|     LIKE_URL = _LIKE_URL | ||||
|     LIKE_NUM = _LIKE_NUM | ||||
|     LIKE_EMAIL = _LIKE_EMAIL | ||||
|     IS_STOP = _IS_STOP | ||||
|   | ||||
|     EMO_POS = FLAG11 | ||||
|     EMO_NEG = FLAG12 | ||||
| 
 | ||||
|     EMO_ANGER = FLAG13 | ||||
|     EMO_APATE = FLAG14 | ||||
|     EMO_DISGUST = FLAG15 | ||||
|     EMO_FEAR = FLAG16 | ||||
|     EMO_JOY = FLAG17 | ||||
|     EMO_SAD = FLAG18 | ||||
|     EMO_SURPRISE = FLAG19 | ||||
|     EMO_TRUST = FLAG20 | ||||
| 
 | ||||
|     CLR_NONE = FLAG21 | ||||
|     CLR_BLACK = FLAG22 | ||||
|     CLR_BLUE = FLAG23 | ||||
|     CLR_BROWN = FLAG24 | ||||
|     CLR_GREEN = FLAG25 | ||||
|     CLR_GREY = FLAG26 | ||||
|     CLR_ORANGE = FLAG27 | ||||
|     CLR_PURPLE = FLAG28 | ||||
|     CLR_PINK = FLAG29 | ||||
|     CLR_RED = FLAG30 | ||||
|     CLR_WHITE = FLAG31 | ||||
|     CLR_YELLOW = FLAG32 | ||||
| 
 | ||||
|     ORTH = _ORTH | ||||
|     SHAPE = _SHAPE | ||||
|     LOWER = _LOWER | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| # cython: embedsignature=True | ||||
| from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space | ||||
| from ..orth cimport is_title, is_upper, like_url, like_number | ||||
| from ..orth cimport is_title, is_upper, like_url, like_number, like_email | ||||
| from ..typedefs cimport flags_t | ||||
| 
 | ||||
| 
 | ||||
|  | @ -16,4 +16,5 @@ def get_flags(unicode string): | |||
|     flags |= is_upper(string) << IS_UPPER | ||||
|     flags |= like_url(string) << LIKE_URL | ||||
|     flags |= like_number(string) << LIKE_NUM | ||||
|     flags |= like_email(string) << LIKE_EMAIL | ||||
|     return flags | ||||
|  |  | |||
|  | @ -72,7 +72,7 @@ cdef class Lexeme: | |||
|         py.sentiment = ptr.sentiment | ||||
|         return py | ||||
| 
 | ||||
|     cpdef bint check(self, attr_id_t flag_id) except -1 | ||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1 | ||||
|      | ||||
| 
 | ||||
| cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: | ||||
|  |  | |||
|  | @ -9,6 +9,9 @@ from .orth cimport word_shape | |||
| from .typedefs cimport attr_t, flags_t | ||||
| import numpy | ||||
| 
 | ||||
| from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||
| 
 | ||||
| 
 | ||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||
| 
 | ||||
|  | @ -44,5 +47,36 @@ cdef class Lexeme: | |||
|     def has_repvec(self): | ||||
|         return self.l2_norm != 0 | ||||
| 
 | ||||
|     cpdef bint check(self, attr_id_t flag_id) except -1: | ||||
|         return self.flags & (1 << flag_id) | ||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1: | ||||
|         cdef flags_t one = 1 | ||||
|         return self.flags & (one << flag_id) | ||||
| 
 | ||||
|     property is_alpha: | ||||
|         def __get__(self): return self.check_flag(IS_ALPHA) | ||||
|      | ||||
|     property is_ascii: | ||||
|         def __get__(self): return self.check_flag(IS_ASCII) | ||||
| 
 | ||||
|     property is_digit: | ||||
|         def __get__(self): return self.check_flag(IS_DIGIT) | ||||
| 
 | ||||
|     property is_lower: | ||||
|         def __get__(self): return self.check_flag(IS_LOWER) | ||||
| 
 | ||||
|     property is_title: | ||||
|         def __get__(self): return self.check_flag(IS_TITLE) | ||||
| 
 | ||||
|     property is_punct: | ||||
|         def __get__(self): return self.check_flag(IS_PUNCT) | ||||
| 
 | ||||
|     property is_space:  | ||||
|         def __get__(self): return self.check_flag(IS_SPACE) | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return self.check_flag(LIKE_URL) | ||||
|      | ||||
|     property like_num: | ||||
|         def __get__(self): return self.check_flag(LIKE_NUM) | ||||
| 
 | ||||
|     property like_email: | ||||
|         def __get__(self): return self.check_flag(LIKE_EMAIL) | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ cpdef bint is_ascii(unicode string) | |||
| cpdef bint is_title(unicode string) | ||||
| cpdef bint is_lower(unicode string) | ||||
| cpdef bint is_upper(unicode string) | ||||
| cpdef bint like_email(unicode string) | ||||
| cpdef bint like_url(unicode string) | ||||
| cpdef bint like_number(unicode string) | ||||
| cpdef unicode word_shape(unicode string) | ||||
|  |  | |||
|  | @ -111,6 +111,11 @@ cpdef bint like_number(unicode string): | |||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match | ||||
| cpdef bint like_email(unicode string): | ||||
|     return _like_email(string) | ||||
| 
 | ||||
| 
 | ||||
| cpdef unicode word_shape(unicode string): | ||||
|     if len(string) >= 100: | ||||
|         return 'LONG' | ||||
|  |  | |||
|  | @ -16,6 +16,11 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST | |||
| from ..attrs cimport POS, LEMMA, TAG, DEP | ||||
| from ..parts_of_speech cimport CONJ, PUNCT | ||||
| 
 | ||||
| from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef class Token: | ||||
|     """An individual token --- i.e. a word, a punctuation symbol, etc.  Created | ||||
|  | @ -281,5 +286,36 @@ cdef class Token: | |||
|         def __get__(self): | ||||
|             return self.vocab.strings[self.c.dep] | ||||
| 
 | ||||
|      | ||||
|     property is_alpha: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_ALPHA) | ||||
|      | ||||
|     property is_ascii: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_ASCII) | ||||
| 
 | ||||
|     property is_digit: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_DIGIT) | ||||
| 
 | ||||
|     property is_lower: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_LOWER) | ||||
| 
 | ||||
|     property is_title: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_TITLE) | ||||
| 
 | ||||
|     property is_punct: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_PUNCT) | ||||
| 
 | ||||
|     property is_space:  | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_SPACE) | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_URL) | ||||
|      | ||||
|     property like_num: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_NUM) | ||||
| 
 | ||||
|     property like_email: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) | ||||
| 
 | ||||
| 
 | ||||
| _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} | ||||
|  |  | |||
|  | @ -38,6 +38,8 @@ cdef class Vocab: | |||
|     ''' | ||||
|     def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, | ||||
|                  pos_tags=None, oov_prob=-30): | ||||
|         if oov_prob is None: | ||||
|             oov_prob = -30 | ||||
|         self.mem = Pool() | ||||
|         self._by_hash = PreshMap() | ||||
|         self._by_orth = PreshMap() | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user