Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-30 23:47:31 +03:00)
Commit cad0cca4e3 ("Tmp")
Parent: 0f2cb74433
@@ -80,7 +80,6 @@ class English(object):
      Packer=None,
      load_vectors=True
    ):

        self.data_dir = data_dir

        if path.exists(path.join(data_dir, 'vocab', 'oov_prob')):


spacy/lexeme.pxd | 126
@@ -8,97 +8,53 @@ from .strings cimport StringStore
from numpy cimport ndarray



cdef LexemeC EMPTY_LEXEME


cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
                              const float* empty_vec) except -1

cdef class Lexeme:
    cdef readonly ndarray repvec

    cdef readonly flags_t flags
    cdef readonly attr_t id
    cdef readonly attr_t length

    cdef LexemeC* c
    cdef readonly Vocab vocab
    cdef readonly attr_t orth
    cdef readonly attr_t lower
    cdef readonly attr_t norm
    cdef readonly attr_t shape
    cdef readonly attr_t prefix
    cdef readonly attr_t suffix

    cdef readonly unicode orth_
    cdef readonly unicode lower_
    cdef readonly unicode norm_
    cdef readonly unicode shape_
    cdef readonly unicode prefix_
    cdef readonly unicode suffix_
    cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1:
        lex.length = props['length']
        lex.orth = vocab.strings[props['orth']]
        lex.lower = vocab.strings[props['lower']]
        lex.norm = vocab.strings[props['norm']]
        lex.shape = vocab.strings[props['shape']]
        lex.prefix = vocab.strings[props['prefix']]
        lex.suffix = vocab.strings[props['suffix']]

    cdef readonly attr_t cluster
    cdef readonly float prob
    cdef readonly float sentiment
    cdef readonly float l2_norm
        lex.cluster = props['cluster']
        lex.prob = props['prob']
        lex.sentiment = props['sentiment']

        lex.flags = props['flags']
        lex.repvec = empty_vec

    # Workaround for an apparent bug in the way the decorator is handled ---
    # TODO: post bug report / patch to Cython.
    @staticmethod
    cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length):
        cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length)
        for i in range(repvec_length):
            py.repvec[i] = ptr.repvec[i]
        py.l2_norm = ptr.l2_norm
        py.flags = ptr.flags
        py.id = ptr.id
        py.length = ptr.length
    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
        if feat_name < (sizeof(flags_t) * 8):
            return Lexeme.check_flag(lex, feat_name)
        elif feat_name == ID:
            return lex.id
        elif feat_name == ORTH:
            return lex.orth
        elif feat_name == LOWER:
            return lex.lower
        elif feat_name == NORM:
            return lex.norm
        elif feat_name == SHAPE:
            return lex.shape
        elif feat_name == PREFIX:
            return lex.prefix
        elif feat_name == SUFFIX:
            return lex.suffix
        elif feat_name == LENGTH:
            return lex.length
        elif feat_name == CLUSTER:
            return lex.cluster
        else:
            return 0

        py.orth = ptr.orth
        py.lower = ptr.lower
        py.norm = ptr.norm
        py.shape = ptr.shape
        py.prefix = ptr.prefix
        py.suffix = ptr.suffix

        py.orth_ = strings[ptr.orth]
        py.lower_ = strings[ptr.lower]
        py.norm_ = strings[ptr.norm]
        py.shape_ = strings[ptr.shape]
        py.prefix_ = strings[ptr.prefix]
        py.suffix_ = strings[ptr.suffix]

        py.cluster = ptr.cluster
        py.prob = ptr.prob
        py.sentiment = ptr.sentiment
        return py

    cpdef bint check_flag(self, attr_id_t flag_id) except -1


cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    return lexeme.flags & (1 << flag_id)


cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
    if feat_name < (sizeof(flags_t) * 8):
        return check_flag(lex, feat_name)
    elif feat_name == ID:
        return lex.id
    elif feat_name == ORTH:
        return lex.orth
    elif feat_name == LOWER:
        return lex.lower
    elif feat_name == NORM:
        return lex.norm
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
        return lex.suffix
    elif feat_name == LENGTH:
        return lex.length
    elif feat_name == CLUSTER:
        return lex.cluster
    else:
        return 0
    cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
        return lexeme.flags & (1 << flag_id)
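Note on the new static helpers: all boolean lexeme features are packed into a single flags_t bitfield, and any attribute ID smaller than sizeof(flags_t) * 8 is treated as a bit index rather than a struct field. A minimal pure-Python sketch of that dispatch follows; the numeric IDs are made up for illustration and are not spaCy's real attribute constants.

FLAG_BITS = 64                        # assumes flags_t is a 64-bit integer
IS_ALPHA, IS_DIGIT = 1, 2             # flag indices (< FLAG_BITS), illustrative
ORTH, LENGTH = 65, 66                 # non-flag attribute IDs, illustrative

def check_flag(flags, flag_id):
    # Each boolean feature is one bit inside the packed integer.
    return bool(flags & (1 << flag_id))

def get_struct_attr(lex, feat_name):
    # IDs below the bit width of the flags field are flag lookups;
    # larger IDs map to dedicated struct fields.
    if feat_name < FLAG_BITS:
        return check_flag(lex["flags"], feat_name)
    elif feat_name == ORTH:
        return lex["orth"]
    elif feat_name == LENGTH:
        return lex["length"]
    else:
        return 0

lex = {"flags": 1 << IS_ALPHA, "orth": 1234, "length": 5}
assert get_struct_attr(lex, IS_ALPHA) is True
assert get_struct_attr(lex, IS_DIGIT) is False
assert get_struct_attr(lex, LENGTH) == 5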
							
								
								
									
spacy/lexeme.pyx | 109
@@ -17,70 +17,105 @@ from .attrs cimport IS_OOV
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))


cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store,
                              const float* empty_vec) except -1:
    lex.length = props['length']
    lex.orth = string_store[props['orth']]
    lex.lower = string_store[props['lower']]
    lex.norm = string_store[props['norm']]
    lex.shape = string_store[props['shape']]
    lex.prefix = string_store[props['prefix']]
    lex.suffix = string_store[props['suffix']]

    lex.cluster = props['cluster']
    lex.prob = props['prob']
    lex.sentiment = props['sentiment']

    lex.flags = props['flags']
    lex.repvec = empty_vec


cdef class Lexeme:
    """An entry in the vocabulary.  A Lexeme has no string context --- it's a
    word-type, as opposed to a word token.  It therefore has no part-of-speech
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
    def __cinit__(self, int vec_size):
        self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32)
    def __init__(self, Vocab vocab, int orth):
        self.vocab = vocab
        self.orth = orth
        self.c = <LexemeC*><void*>vocab.get_by_orth(orth)

    @property
    def has_repvec(self):
        return self.l2_norm != 0
    property orth:
        def __get__(self):
            return self.c.orth

    property lower:
        def __get__(self): return self.c.lower
        def __set__(self, int x): self.c.lower = x

    property norm:
        def __get__(self): return self.c.norm
        def __set__(self, int x): self.c.norm = x

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
        cdef flags_t one = 1
        return self.flags & (one << flag_id)
    property shape:
        def __get__(self): return self.c.shape
        def __set__(self, int x): self.c.shape = x

    property prefix:
        def __get__(self): return self.c.prefix
        def __set__(self, int x): self.c.prefix = x

    property suffix:
        def __get__(self): return self.c.suffix
        def __set__(self, int x): self.c.suffix = x

    property orth_:
        def __get__(self):
            return self.vocab.strings[self.c.orth]

    property lower_:
        def __get__(self): return self.vocab.strings[self.c.lower]
        def __set__(self, unicode x): self.c.lower = self.vocab.strings[x]

    property norm_:
        def __get__(self): return self.c.norm
        def __set__(self, unicode x): self.c.norm = self.vocab.strings[x]

    property shape_:
        def __get__(self): return self.vocab.strings[self.c.shape]
        def __set__(self, unicode x): self.c.shape = self.vocab.strings[x]

    property prefix_:
        def __get__(self): return self.c.prefix
        def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x]

    property suffix_:
        def __get__(self): return self.c.suffix
        def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x]

    property is_oov:
        def __get__(self): return self.check_flag(IS_OOV)
        def __get__(self): return Lexeme.check_flag(self.c, IS_OOV)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x)

    property is_alpha:
        def __get__(self): return self.check_flag(IS_ALPHA)
        def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x)

    property is_ascii:
        def __get__(self): return self.check_flag(IS_ASCII)
        def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x)

    property is_digit:
        def __get__(self): return self.check_flag(IS_DIGIT)
        def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x)

    property is_lower:
        def __get__(self): return self.check_flag(IS_LOWER)
        def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x)

    property is_title:
        def __get__(self): return self.check_flag(IS_TITLE)
        def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x)

    property is_punct:
        def __get__(self): return self.check_flag(IS_PUNCT)
        def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x)

    property is_space:
        def __get__(self): return self.check_flag(IS_SPACE)
        def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x)

    property like_url:
        def __get__(self): return self.check_flag(LIKE_URL)
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x)

    property like_num:
        def __get__(self): return self.check_flag(LIKE_NUM)
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x)

    property like_email:
        def __get__(self): return self.check_flag(LIKE_EMAIL)
        def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL)
        def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x)
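The rewritten Lexeme no longer copies struct fields into Python attributes via from_ptr; it holds a pointer to the LexemeC owned by the Vocab, and every property reads and writes through that pointer. A rough pure-Python sketch of this view-onto-shared-struct design (the class names mirror the diff, but nothing here is spaCy's actual API):

class LexemeC:                          # stand-in for the C struct
    def __init__(self, orth):
        self.orth, self.lower, self.flags = orth, orth, 0

class Vocab:
    def __init__(self):
        self._by_orth = {}
    def get_by_orth(self, orth):
        # One struct per word type, created on demand and shared by all views.
        if orth not in self._by_orth:
            self._by_orth[orth] = LexemeC(orth)
        return self._by_orth[orth]

class Lexeme:
    def __init__(self, vocab, orth):
        self.vocab = vocab
        self.c = vocab.get_by_orth(orth)    # a view, not a copy
    @property
    def lower(self):
        return self.c.lower
    @lower.setter
    def lower(self, value):
        self.c.lower = value

vocab = Vocab()
a = Lexeme(vocab, orth=101)
b = Lexeme(vocab, orth=101)
a.lower = 202
assert b.lower == 202                   # both views see the shared struct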
@@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

from libcpp.vector cimport vector

try:
    import ujson as json
except ImportError:

@@ -96,28 +98,26 @@ def map_attr_name(attr):

cdef class Matcher:
    cdef Pool mem
    cdef Pattern** patterns
    cdef vector[Pattern*] patterns
    cdef readonly int n_patterns

    def __init__(self, vocab, patterns):
        self.mem = Pool()
        n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()])
        self.patterns = <Pattern**>self.mem.alloc(n_patterns, sizeof(Pattern*))
        cdef int i = 0
        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
            if isinstance(entity_key, basestring):
                entity_key = vocab.strings[entity_key]
            if isinstance(etype, basestring):
                etype = vocab.strings[etype]
            elif etype is None:
                etype = -1
            # TODO: Do something more clever about multiple patterns for single
            # entity
            for spec in specs:
                spec = _convert_strings(spec, vocab.strings)
                self.patterns[i] = init_pattern(self.mem, spec, etype)
                i += 1
        self.n_patterns = len(patterns)
            self.add(entity_key, etype, attrs, specs)

    def add(self, entity_key, etype, attrs, specs):
        if isinstance(entity_key, basestring):
            entity_key = vocab.strings[entity_key]
        if isinstance(etype, basestring):
            etype = vocab.strings[etype]
        elif etype is None:
            etype = -1
        # TODO: Do something more clever about multiple patterns for single
        # entity
        for spec in specs:
            spec = _convert_strings(spec, vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, spec, etype))

    @classmethod
    def from_dir(cls, vocab, data_dir):
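Switching the pattern store from a pre-counted C array to a growable C++ vector is what lets __init__ simply loop and delegate to the new add() method, and lets callers register further entities after construction. A hedged usage sketch, assuming a loaded vocab (for example English().vocab), the (etype, attrs, specs) triple layout the constructor unpacks, and that add() can see that vocab (as written in this temporary commit it still refers to a bare vocab name); the entity names and token-attribute keys are illustrative:

# Illustrative only: pattern layout follows the (etype, attrs, specs) triples
# that Matcher.__init__ unpacks; the keys and values here are made up.
patterns = {
    u'GoogleNow': (u'PRODUCT', {}, [[{u'orth': u'Google'}, {u'orth': u'Now'}]]),
}
matcher = Matcher(vocab, patterns)      # __init__ now just calls add() per entity
# Because patterns live in a growable vector, more can be registered later:
matcher.add(u'Java', u'PRODUCT', {}, [[{u'orth': u'Java'}]])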
@@ -108,6 +108,11 @@ cdef class StringStore:
        else:
            raise TypeError(type(string_or_id))

    def __iter__(self):
        cdef int i
        for i in range(self.size):
            yield self[i]

    cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL:
        # 0 means missing, but we don't bother offsetting the index.
        key = hash64(chars, length * sizeof(char), 0)
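The new __iter__ makes the string table directly iterable, which is handy for dumping or inspecting every interned string. A small usage sketch, assuming strings is a populated StringStore such as nlp.vocab.strings:

# Assumes `strings` is a populated StringStore (e.g. nlp.vocab.strings).
interned = [s for s in strings]          # __iter__ yields self[i] for each index
print(len(interned), interned[:5])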
@@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
    def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True,
                 pos_tags=None, oov_prob=-30):
        if oov_prob is None:
            oov_prob = -30
    def __init__(self, data_dir=None, get_lex_attr=None):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
        self.pos_tags = pos_tags if pos_tags is not None else {}

        self.lexeme_props_getter = get_lex_props

        self.get_lex_attr = get_lex_attr
        self.repvec_length = 0
        self.length = 0
        self._add_lex_to_vocab(0, &EMPTY_LEXEME)
        if data_dir is not None:
            if not path.exists(data_dir):
                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
        if data_dir is not None:
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
            self.load_lexemes(path.join(data_dir, 'strings.txt'),

@@ -63,7 +59,6 @@ cdef class Vocab:

        self._serializer = None
        self.data_dir = data_dir
        self.oov_prob = oov_prob

    property serializer:
        def __get__(self):

@@ -91,18 +86,8 @@ cdef class Vocab:
        lex = <LexemeC*>self._by_hash.get(key)
        if lex != NULL:
            return lex
        cdef bint is_oov = mem is not self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        if is_oov:
            lex.id = 0
        else:
            self._add_lex_to_vocab(key, lex)
        assert lex != NULL, string
        return lex
            return self._new_lexeme(mem, string)

    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
        '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme

@@ -114,18 +99,21 @@ cdef class Vocab:
        lex = <LexemeC*>self._by_orth.get(orth)
        if lex != NULL:
            return lex
        cdef unicode string = self.strings[orth]
        else:
            return self._new_lexeme(mem, self.strings[orth])

    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
        cdef bint is_oov = mem is not self.mem
        if len(string) < 3:
            mem = self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
        props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        for attr, func in self.lex_attr_getters.items():
            Lexeme.set_struct_attr(lex, attr, func(string))
        if is_oov:
            lex.id = 0
        else:
            self._add_lex_to_vocab(hash_string(string), lex)
        assert lex != NULL, orth
            self._add_lex_to_vocab(key, lex)
        assert lex != NULL, string
        return lex

    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:

@@ -171,15 +159,6 @@ cdef class Vocab:
                "int --> Lexeme" % str(type(id_or_string)))
        return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length)

    def __setitem__(self, unicode string, dict props):
        cdef hash_t key = hash_string(string)
        cdef LexemeC* lex
        lex = <LexemeC*>self._by_hash.get(key)
        if lex == NULL:
            lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
        set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
        self._add_lex_to_vocab(key, lex)

    def dump(self, loc):
        if path.exists(loc):
            assert not path.isdir(loc)
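With get_lex_attr, the Vocab now fills in lexeme attributes through a table of per-attribute getter functions (the lex_attr_getters loop in _new_lexeme) instead of building one property dict per word via get_lex_props. A pure-Python sketch of that pattern; the attribute keys and getter functions are stand-ins, not spaCy's real attribute IDs:

# Stand-in for the per-attribute getter loop in Vocab._new_lexeme; keys and
# functions are illustrative only.
lex_attr_getters = {
    'lower': lambda string: string.lower(),
    'is_digit': lambda string: string.isdigit(),
    'length': lambda string: len(string),
}

def new_lexeme(string):
    # Each registered getter fills exactly one field of the new lexeme.
    lex = {}
    for attr, func in lex_attr_getters.items():
        lex[attr] = func(string)
    return lex

print(new_lexeme(u'2015'))   # {'lower': u'2015', 'is_digit': True, 'length': 4}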