mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Merge branch 'master' of github.com:honnibal/spaCy
This commit is contained in:
		
						commit
						fc19a4a153
					
				|  | @ -14,7 +14,7 @@ cpdef enum attr_id_t: | |||
|     LIKE_EMAIL | ||||
|     IS_STOP | ||||
|     IS_OOV | ||||
|      | ||||
|     | ||||
|     FLAG14 = 14 | ||||
|     FLAG15 | ||||
|     FLAG16 | ||||
|  | @ -85,3 +85,11 @@ cpdef enum attr_id_t: | |||
|     HEAD | ||||
|     SPACY | ||||
|     PROB | ||||
|      | ||||
| # Move these up to FLAG14--FLAG17 once we finish the functionality and | ||||
| # are ready to regenerate the model | ||||
| #IS_BRACKET | ||||
| #IS_QUOTE | ||||
| #IS_LEFT_PUNCT | ||||
| #IS_RIGHT_PUNCT | ||||
|   | ||||
|  |  | |||
|  | @ -13,7 +13,6 @@ IDS = { | |||
|     "LIKE_EMAIL": LIKE_EMAIL, | ||||
|     "IS_STOP": IS_STOP, | ||||
|     "IS_OOV": IS_OOV, | ||||
| 
 | ||||
|     "FLAG14": FLAG14, | ||||
|     "FLAG15": FLAG15, | ||||
|     "FLAG16": FLAG16, | ||||
|  |  | |||
|  | @ -82,6 +82,22 @@ class Language(object): | |||
|     def is_title(string): | ||||
|         return orth.is_title(string) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def is_bracket(string): | ||||
|         return orth.is_bracket(string) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def is_quote(string): | ||||
|         return orth.is_quote(string) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def is_left_punct(string): | ||||
|         return orth.is_left_punct(string) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def is_right_punct(string): | ||||
|         return orth.is_right_punct(string) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def is_upper(string): | ||||
|         return orth.is_upper(string) | ||||
|  | @ -121,6 +137,10 @@ class Language(object): | |||
|             attrs.IS_SPACE: cls.is_space, | ||||
|             attrs.IS_TITLE: cls.is_title, | ||||
|             attrs.IS_UPPER: cls.is_upper, | ||||
|             attrs.FLAG14: cls.is_bracket, | ||||
|             attrs.FLAG15: cls.is_quote, | ||||
|             attrs.FLAG16: cls.is_left_punct, | ||||
|             attrs.FLAG17: cls.is_right_punct, | ||||
|             attrs.LIKE_URL: cls.like_url, | ||||
|             attrs.LIKE_NUM: cls.like_num, | ||||
|             attrs.LIKE_EMAIL: cls.like_email, | ||||
|  |  | |||
|  | @ -18,6 +18,10 @@ import numpy | |||
| 
 | ||||
| from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||
| from .attrs cimport FLAG14 as IS_BRACKET | ||||
| from .attrs cimport FLAG15 as IS_QUOTE | ||||
| from .attrs cimport FLAG16 as IS_LEFT_PUNCT | ||||
| from .attrs cimport FLAG17 as IS_RIGHT_PUNCT | ||||
| from .attrs cimport IS_OOV | ||||
| 
 | ||||
| 
 | ||||
|  | @ -183,6 +187,23 @@ cdef class Lexeme: | |||
|         def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) | ||||
| 
 | ||||
|     property is_bracket:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) | ||||
| 
 | ||||
|     property is_quote:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) | ||||
| 
 | ||||
|     property is_left_punct:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) | ||||
| 
 | ||||
|     property is_right_punct:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) | ||||
| 
 | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) | ||||
|         def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) | ||||
|  |  | |||
|  | @ -15,7 +15,6 @@ from libcpp.vector cimport vector | |||
| from murmurhash.mrmr cimport hash64 | ||||
| 
 | ||||
| from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE | ||||
| from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 | ||||
| from .tokens.doc cimport get_token_attr | ||||
| from .tokens.doc cimport Doc | ||||
| from .vocab cimport Vocab | ||||
|  |  | |||
|  | @ -1,4 +1,5 @@ | |||
| # -*- coding: utf8 -*- | ||||
| # cython: infer_types=True | ||||
| from __future__ import unicode_literals | ||||
| import unicodedata | ||||
| 
 | ||||
|  | @ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string): | |||
|     else: | ||||
|         return True | ||||
| 
 | ||||
| cpdef bint is_bracket(unicode string): | ||||
|     return False | ||||
| 
 | ||||
| cpdef bint is_quote(unicode string): | ||||
|     if string in ('"', "'"): | ||||
|         return True | ||||
|     else: | ||||
|         return False | ||||
| 
 | ||||
| cpdef bint is_left_punct(unicode string): | ||||
|     return False | ||||
| 
 | ||||
| cpdef bint is_right_punct(unicode string): | ||||
|     return False | ||||
| 
 | ||||
| 
 | ||||
| cpdef bint is_title(unicode string): | ||||
|     return string.istitle() | ||||
|  |  | |||
|  | @ -14,7 +14,7 @@ cpdef enum symbol_t: | |||
|     IS_STOP | ||||
|     IS_OOV | ||||
|      | ||||
|     FLAG14 | ||||
|     FLAG14 = 14 | ||||
|     FLAG15 | ||||
|     FLAG16 | ||||
|     FLAG17 | ||||
|  | @ -419,3 +419,10 @@ cpdef enum symbol_t: | |||
|     rcmod | ||||
|     root | ||||
|     xcomp | ||||
| 
 | ||||
| # Move these up to FLAG14--FLAG17 once we finish the functionality | ||||
| # and are ready to regenerate the model. | ||||
| #IS_BRACKET | ||||
| #IS_QUOTE | ||||
| #IS_LEFT_PUNCT | ||||
| #IS_RIGHT_PUNCT | ||||
|  |  | |||
|  | @ -13,7 +13,6 @@ IDS = { | |||
|     "LIKE_EMAIL": LIKE_EMAIL, | ||||
|     "IS_STOP": IS_STOP, | ||||
|     "IS_OOV": IS_OOV, | ||||
|      | ||||
|     "FLAG14": FLAG14, | ||||
|     "FLAG15": FLAG15, | ||||
|     "FLAG16": FLAG16, | ||||
|  |  | |||
|  | @ -41,3 +41,18 @@ def test_is_digit(words): | |||
|     assert not is_digit(words[7]) | ||||
|     assert not is_digit(words[8]) | ||||
|     assert not is_digit(words[9]) | ||||
| 
 | ||||
| 
 | ||||
| def test_is_quote(words): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| def test_is_bracket(words): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| def test_is_left_bracket(words): | ||||
|     pass | ||||
| 
 | ||||
| def test_is_right_bracket(words): | ||||
|     pass | ||||
|  |  | |||
|  | @ -45,6 +45,7 @@ def test_symbols(en_vocab): | |||
|     assert en_vocab.strings['PROB'] == PROB | ||||
|      | ||||
| 
 | ||||
| @pytest.mark.skip | ||||
| def test_pickle_vocab(en_vocab): | ||||
|     file_ = io.BytesIO() | ||||
|     cloudpickle.dump(en_vocab, file_) | ||||
|  |  | |||
|  | @ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP | |||
| from ..parts_of_speech cimport CONJ, PUNCT | ||||
| 
 | ||||
| from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE | ||||
| from ..attrs cimport FLAG14 as IS_BRACKET | ||||
| from ..attrs cimport FLAG15 as IS_QUOTE | ||||
| from ..attrs cimport FLAG16 as IS_LEFT_PUNCT | ||||
| from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT | ||||
| from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||
| from ..attrs cimport IS_OOV | ||||
| 
 | ||||
|  | @ -362,6 +366,18 @@ cdef class Token: | |||
| 
 | ||||
|     property is_space:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) | ||||
|      | ||||
|     property is_bracket:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) | ||||
| 
 | ||||
|     property is_quote:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) | ||||
| 
 | ||||
|     property is_left_punct:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) | ||||
| 
 | ||||
|     property is_right_punct:  | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user