mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Begin merge of Gazetteer and DE branches
This commit is contained in:
		
							parent
							
								
									dbf8dce109
								
							
						
					
					
						commit
						d2fc104a26
					
				|  | @ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE | |||
| 
 | ||||
| from .structs cimport LexemeC | ||||
| from .strings cimport StringStore | ||||
| from .vocab cimport Vocab | ||||
| 
 | ||||
| from numpy cimport ndarray | ||||
| 
 | ||||
|  | @ -15,21 +16,31 @@ cdef class Lexeme: | |||
|     cdef readonly Vocab vocab | ||||
|     cdef readonly attr_t orth | ||||
| 
 | ||||
|     cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: | ||||
|         lex.length = props['length'] | ||||
|         lex.orth = vocab.strings[props['orth']] | ||||
|         lex.lower = vocab.strings[props['lower']] | ||||
|         lex.norm = vocab.strings[props['norm']] | ||||
|         lex.shape = vocab.strings[props['shape']] | ||||
|         lex.prefix = vocab.strings[props['prefix']] | ||||
|         lex.suffix = vocab.strings[props['suffix']] | ||||
| 
 | ||||
|         lex.cluster = props['cluster'] | ||||
|         lex.prob = props['prob'] | ||||
|         lex.sentiment = props['sentiment'] | ||||
| 
 | ||||
|         lex.flags = props['flags'] | ||||
|         lex.repvec = empty_vec | ||||
|     @staticmethod | ||||
|     cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): | ||||
|         cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) | ||||
|         self.c = lex | ||||
|         self.vocab = vocab | ||||
|         self.orth = lex.orth | ||||
|      | ||||
|     @staticmethod | ||||
|     cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: | ||||
|         if name < (sizeof(flags_t) * 8): | ||||
|             Lexeme.set_flag(lex, name, value) | ||||
|         elif name == ID: | ||||
|             lex.id = value | ||||
|         elif name == LOWER: | ||||
|             lex.lower = value | ||||
|         elif name == NORM: | ||||
|             lex.norm = value | ||||
|         elif name == SHAPE: | ||||
|             lex.shape = value | ||||
|         elif name == PREFIX: | ||||
|             lex.prefix = value | ||||
|         elif name == SUFFIX: | ||||
|             lex.suffix = value | ||||
|         elif name == CLUSTER: | ||||
|             lex.cluster = value | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: | ||||
|  | @ -56,5 +67,14 @@ cdef class Lexeme: | |||
|         else: | ||||
|             return 0 | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: | ||||
|         return lexeme.flags & (1 << flag_id) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil: | ||||
|         cdef flags_t one = 1 | ||||
|         if value: | ||||
|             lexeme.flags |= one << flag_id | ||||
|         else: | ||||
|             lexeme.flags &= ~(one << flag_id) | ||||
|  |  | |||
|  | @ -26,12 +26,8 @@ cdef class Lexeme: | |||
|     def __init__(self, Vocab vocab, int orth): | ||||
|         self.vocab = vocab | ||||
|         self.orth = orth | ||||
|         self.c = <LexemeC*><void*>vocab.get_by_orth(orth) | ||||
|         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) | ||||
| 
 | ||||
|     property orth: | ||||
|         def __get__(self):  | ||||
|             return self.c.orth | ||||
|      | ||||
|     property lower: | ||||
|         def __get__(self): return self.c.lower | ||||
|         def __set__(self, int x): self.c.lower = x | ||||
|  | @ -78,44 +74,44 @@ cdef class Lexeme: | |||
| 
 | ||||
|     property is_oov: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x) | ||||
| 
 | ||||
|     property is_alpha: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x) | ||||
|      | ||||
|     property is_ascii: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x) | ||||
| 
 | ||||
|     property is_digit: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x) | ||||
| 
 | ||||
|     property is_lower: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x) | ||||
| 
 | ||||
|     property is_title: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_TITLE, x) | ||||
| 
 | ||||
|     property is_punct: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x) | ||||
| 
 | ||||
|     property is_space:  | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x) | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x) | ||||
|      | ||||
|     property like_num: | ||||
|         def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x) | ||||
| 
 | ||||
|     property like_email: | ||||
|         def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) | ||||
|         def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) | ||||
|         def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) | ||||
|  |  | |||
|  | @ -102,21 +102,22 @@ cdef class Matcher: | |||
|     cdef readonly int n_patterns | ||||
| 
 | ||||
|     def __init__(self, vocab, patterns): | ||||
|         self.vocab = vocab | ||||
|         self.mem = Pool() | ||||
|         for entity_key, (etype, attrs, specs) in sorted(patterns.items()): | ||||
|             self.add(entity_key, etype, attrs, specs) | ||||
| 
 | ||||
|     def add(self, entity_key, etype, attrs, specs): | ||||
|         if isinstance(entity_key, basestring): | ||||
|             entity_key = vocab.strings[entity_key] | ||||
|             entity_key = self.vocab.strings[entity_key] | ||||
|         if isinstance(etype, basestring): | ||||
|             etype = vocab.strings[etype] | ||||
|             etype = self.vocab.strings[etype] | ||||
|         elif etype is None: | ||||
|             etype = -1 | ||||
|         # TODO: Do something more clever about multiple patterns for single | ||||
|         # entity | ||||
|         for spec in specs: | ||||
|             spec = _convert_strings(spec, vocab.strings) | ||||
|             spec = _convert_strings(spec, self.vocab.strings) | ||||
|             self.patterns.push_back(init_pattern(self.mem, spec, etype)) | ||||
| 
 | ||||
|     @classmethod | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ from libc.stdint cimport uint32_t | |||
| import numpy | ||||
| import struct | ||||
| 
 | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..lexeme cimport EMPTY_LEXEME | ||||
| from ..typedefs cimport attr_t, flags_t | ||||
| from ..attrs cimport attr_id_t | ||||
|  | @ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE | |||
| from ..parts_of_speech import UNIV_POS_NAMES | ||||
| from ..parts_of_speech cimport CONJ, PUNCT, NOUN | ||||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from ..lexeme cimport check_flag | ||||
| from ..lexeme cimport get_attr as get_lex_attr | ||||
| from .spans cimport Span | ||||
| from .token cimport Token | ||||
| from ..serialize.bits cimport BitArray | ||||
|  | @ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | |||
|     elif feat_name == ENT_TYPE: | ||||
|         return token.ent_type | ||||
|     else: | ||||
|         return get_lex_attr(token.lex, feat_name) | ||||
|         return Lexeme.get_struct_attr(token.lex, feat_name) | ||||
| 
 | ||||
| 
 | ||||
| cdef class Doc: | ||||
|  |  | |||
|  | @ -1,6 +1,5 @@ | |||
| from libc.string cimport memcpy | ||||
| from cpython.mem cimport PyMem_Malloc, PyMem_Free | ||||
| from ..lexeme cimport check_flag | ||||
| # Compiler crashes on memory view coercion without this. Should report bug. | ||||
| from cython.view cimport array as cvarray | ||||
| cimport numpy as np | ||||
|  | @ -9,6 +8,7 @@ np.import_array() | |||
| import numpy | ||||
| 
 | ||||
| 
 | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..parts_of_speech import UNIV_POS_NAMES | ||||
| 
 | ||||
| from ..attrs cimport LEMMA | ||||
|  | @ -42,7 +42,7 @@ cdef class Token: | |||
|         return self.string | ||||
| 
 | ||||
|     cpdef bint check_flag(self, attr_id_t flag_id) except -1: | ||||
|         return check_flag(self.c.lex, flag_id) | ||||
|         return Lexeme.check_flag(self.c.lex, flag_id) | ||||
| 
 | ||||
|     def nbor(self, int i=1): | ||||
|         return self.doc[self.i+i] | ||||
|  | @ -286,37 +286,37 @@ cdef class Token: | |||
|             return self.vocab.strings[self.c.dep] | ||||
| 
 | ||||
|     property is_oov: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_OOV) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) | ||||
| 
 | ||||
|     property is_alpha: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_ALPHA) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) | ||||
| 
 | ||||
|     property is_ascii: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_ASCII) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) | ||||
| 
 | ||||
|     property is_digit: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_DIGIT) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) | ||||
| 
 | ||||
|     property is_lower: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_LOWER) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) | ||||
| 
 | ||||
|     property is_title: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_TITLE) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) | ||||
| 
 | ||||
|     property is_punct: | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_PUNCT) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) | ||||
| 
 | ||||
|     property is_space:  | ||||
|         def __get__(self): return check_flag(self.c.lex, IS_SPACE) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) | ||||
| 
 | ||||
|     property like_url: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_URL) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) | ||||
| 
 | ||||
|     property like_num: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_NUM) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) | ||||
| 
 | ||||
|     property like_email: | ||||
|         def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) | ||||
|         def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) | ||||
| 
 | ||||
| 
 | ||||
| _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} | ||||
|  |  | |||
|  | @ -37,6 +37,7 @@ cdef class Vocab: | |||
|     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL | ||||
|      | ||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 | ||||
|     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL | ||||
| 
 | ||||
|     cdef PreshMap _by_hash | ||||
|     cdef PreshMap _by_orth | ||||
|  |  | |||
|  | @ -12,7 +12,6 @@ import math | |||
| import json | ||||
| 
 | ||||
| from .lexeme cimport EMPTY_LEXEME | ||||
| from .lexeme cimport set_lex_struct_props | ||||
| from .lexeme cimport Lexeme | ||||
| from .strings cimport hash_string | ||||
| from .orth cimport word_shape | ||||
|  | @ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC | |||
| cdef class Vocab: | ||||
|     '''A map container for a language's LexemeC structs. | ||||
|     ''' | ||||
|     def __init__(self, data_dir=None, get_lex_attr=None): | ||||
|     def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False): | ||||
|         self.mem = Pool() | ||||
|         self._by_hash = PreshMap() | ||||
|         self._by_orth = PreshMap() | ||||
|         self.strings = StringStore() | ||||
|         self.pos_tags = pos_tags if pos_tags is not None else {} | ||||
|         #self.pos_tags = pos_tags if pos_tags is not None else {} | ||||
|         self.pos_tags = {} | ||||
|          | ||||
|         self.get_lex_attr = get_lex_attr | ||||
|         self.repvec_length = 0 | ||||
|  | @ -112,7 +112,7 @@ cdef class Vocab: | |||
|         if is_oov: | ||||
|             lex.id = 0 | ||||
|         else: | ||||
|             self._add_lex_to_vocab(key, lex) | ||||
|             self._add_lex_to_vocab(hash_string(string), lex) | ||||
|         assert lex != NULL, string | ||||
|         return lex | ||||
| 
 | ||||
|  | @ -125,7 +125,7 @@ cdef class Vocab: | |||
|         cdef attr_t orth | ||||
|         cdef size_t addr | ||||
|         for orth, addr in self._by_orth.items(): | ||||
|             yield Lexeme.from_ptr(<LexemeC*>addr, self.strings, self.repvec_length) | ||||
|             yield Lexeme.from_ptr(<LexemeC*>addr, self, self.repvec_length) | ||||
| 
 | ||||
|     def __getitem__(self,  id_or_string): | ||||
|         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously | ||||
|  | @ -157,7 +157,7 @@ cdef class Vocab: | |||
|             raise ValueError("Vocab unable to map type: " | ||||
|                 "%s. Maps unicode --> Lexeme or " | ||||
|                 "int --> Lexeme" % str(type(id_or_string))) | ||||
|         return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) | ||||
|         return Lexeme.from_ptr(<LexemeC*><void*>lexeme, self, self.repvec_length) | ||||
| 
 | ||||
|     def dump(self, loc): | ||||
|         if path.exists(loc): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user