mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	* Tmp. Working on refactor. Compiles, must hook up lexical feats.
This commit is contained in:
		
							parent
							
								
									46da3d74d2
								
							
						
					
					
						commit
						0930892fc1
					
				|  | @ -12,7 +12,10 @@ from .attrs import get_flags | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_lex_props(string): | def get_lex_props(string): | ||||||
|     return {'flags': get_flags(string), 'dense': 1} |     return {'flags': get_flags(string), 'length': len(string), | ||||||
|  |             'sic': string, 'norm1': string, 'norm2': string, 'shape': string, | ||||||
|  |             'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0, | ||||||
|  |             'sentiment': 0} | ||||||
| 
 | 
 | ||||||
| LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') | LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') | ||||||
| 
 | 
 | ||||||
|  | @ -45,7 +48,7 @@ class English(object): | ||||||
|     """ |     """ | ||||||
|     def __init__(self, data_dir=LOCAL_DATA_DIR): |     def __init__(self, data_dir=LOCAL_DATA_DIR): | ||||||
|         self._data_dir = data_dir |         self._data_dir = data_dir | ||||||
|         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'), |         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, | ||||||
|                            get_lex_props=get_lex_props) |                            get_lex_props=get_lex_props) | ||||||
|         tag_names = list(POS_TAGS.keys()) |         tag_names = list(POS_TAGS.keys()) | ||||||
|         tag_names.sort() |         tag_names.sort() | ||||||
|  |  | ||||||
|  | @ -283,12 +283,12 @@ cdef class EnPosTagger: | ||||||
|     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: |     cdef int lemmatize(self, const univ_tag_t pos, const LexemeC* lex) except -1: | ||||||
|         if self.lemmatizer is None: |         if self.lemmatizer is None: | ||||||
|             return lex.sic |             return lex.sic | ||||||
|         cdef bytes py_string = self.strings[lex.sic] |         cdef unicode py_string = self.strings[lex.sic] | ||||||
|         if pos != NOUN and pos != VERB and pos != ADJ: |         if pos != NOUN and pos != VERB and pos != ADJ: | ||||||
|             return lex.sic |             return lex.sic | ||||||
|         cdef set lemma_strings |         cdef set lemma_strings | ||||||
|         cdef unicode lemma_string |         cdef unicode lemma_string | ||||||
|         lemma_strings = self.lemmatizer(py_string.decode('utf8'), pos) |         lemma_strings = self.lemmatizer(py_string, pos) | ||||||
|         lemma_string = sorted(lemma_strings)[0] |         lemma_string = sorted(lemma_strings)[0] | ||||||
|         lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i |         lemma = self.strings.intern(lemma_string.encode('utf8'), len(lemma_string)).i | ||||||
|         return lemma |         return lemma | ||||||
|  |  | ||||||
|  | @ -7,9 +7,7 @@ from .strings cimport StringStore | ||||||
| cdef LexemeC EMPTY_LEXEME | cdef LexemeC EMPTY_LEXEME | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef LexemeC init(id_t i, unicode string, hash_t hashed, StringStore store, | cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings) except -1 | ||||||
|                   dict props) except * |  | ||||||
|   |  | ||||||
|   |   | ||||||
| cdef class Lexeme: | cdef class Lexeme: | ||||||
|     cdef const float* vec |     cdef const float* vec | ||||||
|  |  | ||||||
|  | @ -5,27 +5,27 @@ from murmurhash.mrmr cimport hash64 | ||||||
| from libc.string cimport memset | from libc.string cimport memset | ||||||
| 
 | 
 | ||||||
| from .orth cimport word_shape | from .orth cimport word_shape | ||||||
|  | from .typedefs cimport attr_t | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef LexemeC init(id_t i, unicode string, hash_t hashed, | cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store) except -1: | ||||||
|                   StringStore string_store, dict props) except *: |  | ||||||
|     cdef LexemeC lex |  | ||||||
|     lex.id = i |  | ||||||
|     lex.length = len(string) |  | ||||||
|     lex.sic = string_store[string] |  | ||||||
| 
 | 
 | ||||||
|     lex.cluster = props.get('cluster', 0) |     lex.length = props['length'] | ||||||
|     lex.prob = props.get('prob', 0) |     lex.sic = string_store[props['sic']] | ||||||
|  |     lex.norm1 = string_store[props['norm1']]  | ||||||
|  |     lex.norm2 = string_store[props['norm2']]  | ||||||
|  |     lex.shape = string_store[props['shape']]  | ||||||
|  |     lex.prefix = string_store[props['prefix']] | ||||||
|  |     lex.suffix = string_store[props['suffix']] | ||||||
|      |      | ||||||
|     lex.prefix = string_store[string[:1]] |     lex.cluster = props['cluster'] | ||||||
|     lex.suffix = string_store[string[-3:]] |     lex.prob = props['prob'] | ||||||
|     lex.shape = string_store[word_shape(string)] |     lex.sentiment = props['sentiment'] | ||||||
| 
 | 
 | ||||||
|     lex.flags = props.get('flags', 0) |     lex.flags = props['flags'] | ||||||
|     return lex |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Lexeme: | cdef class Lexeme: | ||||||
|  |  | ||||||
|  | @ -67,7 +67,7 @@ cdef class StringStore: | ||||||
|             if string_or_id < 1 or string_or_id >= self.size: |             if string_or_id < 1 or string_or_id >= self.size: | ||||||
|                 raise IndexError(string_or_id) |                 raise IndexError(string_or_id) | ||||||
|             utf8str = &self.strings[<int>string_or_id] |             utf8str = &self.strings[<int>string_or_id] | ||||||
|             return utf8str.chars[:utf8str.length] |             return utf8str.chars[:utf8str.length].decode('utf8') | ||||||
|         elif isinstance(string_or_id, bytes): |         elif isinstance(string_or_id, bytes): | ||||||
|             utf8str = self.intern(<char*>string_or_id, len(string_or_id)) |             utf8str = self.intern(<char*>string_or_id, len(string_or_id)) | ||||||
|             return utf8str.i |             return utf8str.i | ||||||
|  |  | ||||||
|  | @ -42,32 +42,5 @@ cdef class Tokens: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Token: | cdef class Token: | ||||||
|     cdef cvarray vec |     cdef readonly Tokens _seq | ||||||
| 
 |     cdef readonly int i | ||||||
|     cdef readonly flags_t flags |  | ||||||
|     |  | ||||||
|     cdef readonly attr_t id |  | ||||||
|     cdef readonly attr_t sic |  | ||||||
|     cdef readonly attr_t dense |  | ||||||
|     cdef readonly attr_t shape |  | ||||||
|     cdef readonly attr_t prefix |  | ||||||
|     cdef readonly attr_t suffix |  | ||||||
|   |  | ||||||
|     cdef readonly attr_t length |  | ||||||
|     cdef readonly attr_t cluster |  | ||||||
|     cdef readonly attr_t pos_type |  | ||||||
| 
 |  | ||||||
|     cdef readonly float prob |  | ||||||
|     cdef readonly float sentiment |  | ||||||
| 
 |  | ||||||
|     cdef readonly Morphology morph |  | ||||||
|     cdef readonly univ_tag_t pos |  | ||||||
|     cdef readonly int fine_pos |  | ||||||
|     cdef readonly int idx |  | ||||||
|     cdef readonly int lemma |  | ||||||
|     cdef readonly int sense |  | ||||||
|     cdef readonly int dep_tag |  | ||||||
|      |  | ||||||
|     cdef readonly int head_offset |  | ||||||
|     cdef readonly uint32_t l_kids |  | ||||||
|     cdef readonly uint32_t r_kids |  | ||||||
|  |  | ||||||
							
								
								
									
										188
									
								
								spacy/tokens.pyx
									
									
									
									
									
								
							
							
						
						
									
										188
									
								
								spacy/tokens.pyx
									
									
									
									
									
								
							|  | @ -85,7 +85,7 @@ cdef class Tokens: | ||||||
|             token (Token): |             token (Token): | ||||||
|         """ |         """ | ||||||
|         bounds_check(i, self.length, PADDING) |         bounds_check(i, self.length, PADDING) | ||||||
|         return cinit_token(&self.data[i]) |         return Token(self, i) | ||||||
| 
 | 
 | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|         """Iterate over the tokens. |         """Iterate over the tokens. | ||||||
|  | @ -174,38 +174,26 @@ cdef class Tokens: | ||||||
|             self.data[i].lex = &EMPTY_LEXEME |             self.data[i].lex = &EMPTY_LEXEME | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef Token cinit_token(const TokenC* c_tok): | @cython.freelist(64) | ||||||
|     cdef Token py_tok = Token.__new__(Token) |  | ||||||
|     py_tok.morph = c_tok.morph |  | ||||||
|     py_tok.pos = c_tok.pos |  | ||||||
|     py_tok.fine_pos = c_tok.fine_pos |  | ||||||
|     py_tok.idx = c_tok.idx |  | ||||||
|     py_tok.lemma = c_tok.lemma |  | ||||||
|     py_tok.sense = c_tok.sense |  | ||||||
|     py_tok.dep_tag = c_tok.dep_tag |  | ||||||
|     py_tok.head_offset = c_tok.head |  | ||||||
|     py_tok.l_kids = c_tok.l_kids |  | ||||||
|     py_tok.r_kids = c_tok.r_kids |  | ||||||
|     return py_tok |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class Token: | cdef class Token: | ||||||
|     """An individual token. |     """An individual token. | ||||||
|     """ |  | ||||||
|     def __init__(self): |  | ||||||
|         pass |  | ||||||
|         #self._seq = tokens |  | ||||||
|         #self.i = i |  | ||||||
| 
 | 
 | ||||||
|     #def __unicode__(self): |     Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens | ||||||
|     #    cdef const TokenC* t = &self._seq.data[self.i] |     object. | ||||||
|     #    cdef int end_idx = t.idx + t.lex.length |     """ | ||||||
|     #    if self.i + 1 == self._seq.length: |     def __init__(self, Tokens tokens, int i): | ||||||
|     #        return self.string |         self._seq = tokens | ||||||
|     #    if end_idx == t[1].idx: |         self.i = i | ||||||
|     #        return self.string | 
 | ||||||
|     #    else: |     def __unicode__(self): | ||||||
|     #        return self.string + ' ' |         cdef const TokenC* t = &self._seq.data[self.i] | ||||||
|  |         cdef int end_idx = t.idx + t.lex.length | ||||||
|  |         if self.i + 1 == self._seq.length: | ||||||
|  |             return self.string | ||||||
|  |         if end_idx == t[1].idx: | ||||||
|  |             return self.string | ||||||
|  |         else: | ||||||
|  |             return self.string + ' ' | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|         """The number of unicode code-points in the original string. |         """The number of unicode code-points in the original string. | ||||||
|  | @ -213,87 +201,87 @@ cdef class Token: | ||||||
|         Returns: |         Returns: | ||||||
|             length (int): |             length (int): | ||||||
|         """ |         """ | ||||||
|         return self.length |         return self._seq.data[self.i].lex.length | ||||||
| 
 | 
 | ||||||
|     #property idx: |     property idx: | ||||||
|     #    """The index into the original string at which the token starts. |         """The index into the original string at which the token starts. | ||||||
| 
 | 
 | ||||||
|     #    The following is supposed to always be true: |         The following is supposed to always be true: | ||||||
|     #     |  | ||||||
|     #    >>> original_string[token.idx:token.idx + len(token)] == token.string |  | ||||||
|     #    """ |  | ||||||
|     #    def __get__(self): |  | ||||||
|     #        return self._seq.data[self.i].idx |  | ||||||
|          |          | ||||||
|     #property cluster: |         >>> original_string[token.idx:token.idx + len(token)] == token.string | ||||||
|     #    """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering |         """ | ||||||
|     # |         def __get__(self): | ||||||
|     #    Similar words have better-than-chance likelihood of having similar cluster |             return self._seq.data[self.i].idx | ||||||
|     #    IDs, although the clustering is quite noisy.  Cluster IDs make good features, |  | ||||||
|     #    and help to make models slightly more robust to domain variation. |  | ||||||
| 
 | 
 | ||||||
|     #    A common trick is to use only the first N bits of a cluster ID in a feature, |     property cluster: | ||||||
|     #    as the more general part of the hierarchical clustering is often more accurate |         """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering | ||||||
|     #    than the lower categories. |  | ||||||
|      |      | ||||||
|     #    To assist in this, I encode the cluster IDs little-endian, to allow a simple |         Similar words have better-than-chance likelihood of having similar cluster | ||||||
|     #    bit-mask: |         IDs, although the clustering is quite noisy.  Cluster IDs make good features, | ||||||
|  |         and help to make models slightly more robust to domain variation. | ||||||
| 
 | 
 | ||||||
|     #    >>> six_bits = cluster & (2**6 - 1) |         A common trick is to use only the first N bits of a cluster ID in a feature, | ||||||
|     #    """ |         as the more general part of the hierarchical clustering is often more accurate | ||||||
|     #    def __get__(self): |         than the lower categories. | ||||||
|     #        return self._seq.data[self.i].lex.cluster |  | ||||||
| 
 | 
 | ||||||
|     #property string: |         To assist in this, I encode the cluster IDs little-endian, to allow a simple | ||||||
|     #    """The unicode string of the word, with no whitespace padding.""" |         bit-mask: | ||||||
|     #    def __get__(self): |  | ||||||
|     #        cdef const TokenC* t = &self._seq.data[self.i] |  | ||||||
|     #        if t.lex.sic == 0: |  | ||||||
|     #            return '' |  | ||||||
|     #        cdef bytes utf8string = self._seq.vocab.strings[t.lex.sic] |  | ||||||
|     #        return utf8string.decode('utf8') |  | ||||||
| 
 | 
 | ||||||
|     #property lemma: |         >>> six_bits = cluster & (2**6 - 1) | ||||||
|     #    """The unicode string of the word's lemma.  If no part-of-speech tag is |         """ | ||||||
|     #    assigned, the most common part-of-speech tag of the word is used. |         def __get__(self): | ||||||
|     #    """ |             return self._seq.data[self.i].lex.cluster | ||||||
|     #    def __get__(self): |  | ||||||
|     #        cdef const TokenC* t = &self._seq.data[self.i] |  | ||||||
|     #        if t.lemma == 0: |  | ||||||
|     #            return self.string |  | ||||||
|     #        cdef bytes utf8string = self._seq.vocab.strings[t.lemma] |  | ||||||
|     #        return utf8string.decode('utf8') |  | ||||||
| 
 | 
 | ||||||
|     #property dep_tag: |     property string: | ||||||
|     #    """The ID integer of the word's dependency label.  If no parse has been |         """The unicode string of the word, with no whitespace padding.""" | ||||||
|     #    assigned, defaults to 0. |         def __get__(self): | ||||||
|     #    """ |             cdef const TokenC* t = &self._seq.data[self.i] | ||||||
|     #    def __get__(self): |             if t.lex.sic == 0: | ||||||
|     #        return self._seq.data[self.i].dep_tag |                 return '' | ||||||
|  |             cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic] | ||||||
|  |             return py_ustr | ||||||
| 
 | 
 | ||||||
|     #property pos: |     property lemma: | ||||||
|     #    """The ID integer of the word's part-of-speech tag, from the 13-tag |         """The unicode string of the word's lemma.  If no part-of-speech tag is | ||||||
|     #    Google Universal Tag Set.  Constants for this tag set are available in |         assigned, the most common part-of-speech tag of the word is used. | ||||||
|     #    spacy.typedefs. |         """ | ||||||
|     #    """ |         def __get__(self): | ||||||
|     #    def __get__(self): |             cdef const TokenC* t = &self._seq.data[self.i] | ||||||
|     #        return self._seq.data[self.i].pos |             if t.lemma == 0: | ||||||
|  |                 return self.string | ||||||
|  |             cdef unicode py_ustr = self._seq.vocab.strings[t.lemma] | ||||||
|  |             return py_ustr | ||||||
| 
 | 
 | ||||||
|     #property fine_pos: |     property dep_tag: | ||||||
|     #    """The ID integer of the word's fine-grained part-of-speech tag, as assigned |         """The ID integer of the word's dependency label.  If no parse has been | ||||||
|     #    by the tagger model.  Fine-grained tags include morphological information, |         assigned, defaults to 0. | ||||||
|     #    and other distinctions, and allow a more accurate tagger to be trained. |         """ | ||||||
|     #    """ |         def __get__(self): | ||||||
|  |             return self._seq.data[self.i].dep_tag | ||||||
| 
 | 
 | ||||||
|     #    def __get__(self): |     property pos: | ||||||
|     #        return self._seq.data[self.i].fine_pos |         """The ID integer of the word's part-of-speech tag, from the 13-tag | ||||||
|  |         Google Universal Tag Set.  Constants for this tag set are available in | ||||||
|  |         spacy.typedefs. | ||||||
|  |         """ | ||||||
|  |         def __get__(self): | ||||||
|  |             return self._seq.data[self.i].pos | ||||||
| 
 | 
 | ||||||
|     #property sic: |     property fine_pos: | ||||||
|     #    def __get__(self): |         """The ID integer of the word's fine-grained part-of-speech tag, as assigned | ||||||
|     #        return self._seq.data[self.i].lex.sic |         by the tagger model.  Fine-grained tags include morphological information, | ||||||
|  |         and other distinctions, and allow a more accurate tagger to be trained. | ||||||
|  |         """ | ||||||
|   |   | ||||||
|     #property head: |         def __get__(self): | ||||||
|     #    """The token predicted by the parser to be the head of the current token.""" |             return self._seq.data[self.i].fine_pos | ||||||
|     #    def __get__(self): | 
 | ||||||
|     #        cdef const TokenC* t = &self._seq.data[self.i] |     property sic: | ||||||
|     #        return Token(self._seq, self.i + t.head) |         def __get__(self): | ||||||
|  |             return self._seq.vocab.strings[self._seq.data[self.i].lex.sic] | ||||||
|  | 
 | ||||||
|  |     property head: | ||||||
|  |         """The token predicted by the parser to be the head of the current token.""" | ||||||
|  |         def __get__(self): | ||||||
|  |             cdef const TokenC* t = &self._seq.data[self.i] | ||||||
|  |             return Token(self._seq, self.i + t.head) | ||||||
|  |  | ||||||
|  | @ -24,12 +24,13 @@ cdef struct _Cached: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Vocab: | cdef class Vocab: | ||||||
|     cpdef public get_lex_props |     cpdef public lexeme_props_getter | ||||||
|     cdef Pool mem |     cdef Pool mem | ||||||
|     cpdef readonly StringStore strings |     cpdef readonly StringStore strings | ||||||
|     cdef vector[LexemeC*] lexemes |     cdef vector[const LexemeC*] lexemes | ||||||
| 
 | 
 | ||||||
|     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL |     cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL | ||||||
|  |     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 | ||||||
|      |      | ||||||
|     cdef PreshMap _map |     cdef PreshMap _map | ||||||
|    |    | ||||||
|  |  | ||||||
|  | @ -5,7 +5,7 @@ from os import path | ||||||
| import codecs | import codecs | ||||||
| 
 | 
 | ||||||
| from .lexeme cimport EMPTY_LEXEME | from .lexeme cimport EMPTY_LEXEME | ||||||
| from .lexeme cimport init as lexeme_init | from .lexeme cimport set_lex_struct_props | ||||||
| from .lexeme cimport Lexeme_cinit | from .lexeme cimport Lexeme_cinit | ||||||
| from .strings cimport slice_unicode | from .strings cimport slice_unicode | ||||||
| from .strings cimport hash_string | from .strings cimport hash_string | ||||||
|  | @ -21,24 +21,6 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||||
| EMPTY_LEXEME.vec = EMPTY_VEC | EMPTY_LEXEME.vec = EMPTY_VEC | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef LexemeC init_lexeme(id_t i, unicode string, hash_t hashed, |  | ||||||
|                   StringStore string_store, dict props) except *: |  | ||||||
|     cdef LexemeC lex |  | ||||||
|     lex.id = i |  | ||||||
|     lex.length = len(string) |  | ||||||
|     lex.sic = string_store[string] |  | ||||||
|      |  | ||||||
|     lex.cluster = props.get('cluster', 0) |  | ||||||
|     lex.prob = props.get('prob', 0) |  | ||||||
| 
 |  | ||||||
|     lex.prefix = string_store[string[:1]] |  | ||||||
|     lex.suffix = string_store[string[-3:]] |  | ||||||
|     lex.shape = string_store[word_shape(string)] |  | ||||||
|     |  | ||||||
|     lex.flags = props.get('flags', 0) |  | ||||||
|     return lex |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class Vocab: | cdef class Vocab: | ||||||
|     '''A map container for a language's LexemeC structs. |     '''A map container for a language's LexemeC structs. | ||||||
|     ''' |     ''' | ||||||
|  | @ -47,7 +29,7 @@ cdef class Vocab: | ||||||
|         self._map = PreshMap(2 ** 20) |         self._map = PreshMap(2 ** 20) | ||||||
|         self.strings = StringStore() |         self.strings = StringStore() | ||||||
|         self.lexemes.push_back(&EMPTY_LEXEME) |         self.lexemes.push_back(&EMPTY_LEXEME) | ||||||
|         self.get_lex_props = get_lex_props |         self.lexeme_props_getter = get_lex_props | ||||||
| 
 | 
 | ||||||
|         if data_dir is not None: |         if data_dir is not None: | ||||||
|             if not path.exists(data_dir): |             if not path.exists(data_dir): | ||||||
|  | @ -63,32 +45,36 @@ cdef class Vocab: | ||||||
|         """The current number of lexemes stored.""" |         """The current number of lexemes stored.""" | ||||||
|         return self.lexemes.size() |         return self.lexemes.size() | ||||||
| 
 | 
 | ||||||
|     cdef const LexemeC* get(self, Pool mem, UniStr* string) except NULL: |     cdef const LexemeC* get(self, Pool mem, UniStr* c_str) except NULL: | ||||||
|         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme |         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme | ||||||
|         if necessary, using memory acquired from the given pool.  If the pool |         if necessary, using memory acquired from the given pool.  If the pool | ||||||
|         is the lexicon's own memory, the lexeme is saved in the lexicon.''' |         is the lexicon's own memory, the lexeme is saved in the lexicon.''' | ||||||
|         cdef LexemeC* lex |         cdef LexemeC* lex | ||||||
|         lex = <LexemeC*>self._map.get(string.key) |         lex = <LexemeC*>self._map.get(c_str.key) | ||||||
|         if lex != NULL: |         if lex != NULL: | ||||||
|             return lex |             return lex | ||||||
|         if string.n < 3: |         if c_str.n < 3: | ||||||
|             mem = self.mem |             mem = self.mem | ||||||
|         cdef unicode py_string = string.chars[:string.n] |         cdef unicode py_str = c_str.chars[:c_str.n] | ||||||
|         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) |         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1) | ||||||
|         lex[0] = init_lexeme(self.lexemes.size(), py_string, string.key, self.strings, |         props = self.lexeme_props_getter(py_str) | ||||||
|                              self.get_lex_props(py_string)) |         set_lex_struct_props(lex, props, self.strings) | ||||||
|         if mem is self.mem: |         if mem is self.mem: | ||||||
|             self._map.set(string.key, lex) |             lex.id = self.lexemes.size() | ||||||
|  |             self._add_lex_to_vocab(c_str.key, lex) | ||||||
|  |         else: | ||||||
|  |             lex.id = 1 | ||||||
|  |         return lex | ||||||
|  | 
 | ||||||
|  |     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||||
|  |         self._map.set(key, <void*>lex) | ||||||
|         while self.lexemes.size() < (lex.id + 1): |         while self.lexemes.size() < (lex.id + 1): | ||||||
|             self.lexemes.push_back(&EMPTY_LEXEME) |             self.lexemes.push_back(&EMPTY_LEXEME) | ||||||
|         self.lexemes[lex.id] = lex |         self.lexemes[lex.id] = lex | ||||||
|         else: |  | ||||||
|             lex[0].id = 1 |  | ||||||
|         return lex |  | ||||||
| 
 | 
 | ||||||
|     def __getitem__(self,  id_or_string): |     def __getitem__(self,  id_or_string): | ||||||
|         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously |         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously | ||||||
|         unseen unicode string is given, a new LexemeC is created and stored. |         unseen unicode string is given, a new lexeme is created and stored. | ||||||
| 
 | 
 | ||||||
|         Args: |         Args: | ||||||
|             id_or_string (int or unicode): The integer ID of a word, or its unicode |             id_or_string (int or unicode): The integer ID of a word, or its unicode | ||||||
|  | @ -100,24 +86,28 @@ cdef class Vocab: | ||||||
|             lexeme (Lexeme): An instance of the Lexeme Python class, with data |             lexeme (Lexeme): An instance of the Lexeme Python class, with data | ||||||
|                 copied on instantiation. |                 copied on instantiation. | ||||||
|         ''' |         ''' | ||||||
|         cdef UniStr string |         cdef UniStr c_str | ||||||
|         cdef const LexemeC* lexeme |         cdef const LexemeC* lexeme | ||||||
|         if type(id_or_string) == int: |         if type(id_or_string) == int: | ||||||
|             if id_or_string >= self.lexemes.size(): |             if id_or_string >= self.lexemes.size(): | ||||||
|                 raise IndexError |                 raise IndexError | ||||||
|             lexeme = self.lexemes.at(id_or_string) |             lexeme = self.lexemes.at(id_or_string) | ||||||
|         else: |         else: | ||||||
|             slice_unicode(&string, id_or_string, 0, len(id_or_string)) |             slice_unicode(&c_str, id_or_string, 0, len(id_or_string)) | ||||||
|             lexeme = self.get(self.mem, &string) |             lexeme = self.get(self.mem, &c_str) | ||||||
|         return Lexeme_cinit(lexeme, self.strings) |         return Lexeme_cinit(lexeme, self.strings) | ||||||
| 
 | 
 | ||||||
|     def __setitem__(self, unicode uni_string, dict props): |     def __setitem__(self, unicode py_str, dict props): | ||||||
|         cdef UniStr s |         cdef UniStr c_str | ||||||
|         slice_unicode(&s, uni_string, 0, len(uni_string)) |         slice_unicode(&c_str, py_str, 0, len(py_str)) | ||||||
|         # Cast through the const here, since we're allowed to change our own |         cdef LexemeC* lex | ||||||
|         # LexemeCs. |         lex = <LexemeC*>self._map.get(c_str.key) | ||||||
|         lex = <LexemeC*><void*>self.get(self.mem, &s) |         if lex == NULL: | ||||||
|         lex[0] = lexeme_init(lex.id, s.chars[:s.n], s.key, self.strings, props) |             lex = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) | ||||||
|  |             lex.id = self.lexemes.size() | ||||||
|  |             self._add_lex_to_vocab(c_str.key, lex) | ||||||
|  |         set_lex_struct_props(lex, props, self.strings) | ||||||
|  |         assert lex.sic < 1000000 | ||||||
| 
 | 
 | ||||||
|     def dump(self, loc): |     def dump(self, loc): | ||||||
|         if path.exists(loc): |         if path.exists(loc): | ||||||
|  | @ -154,6 +144,7 @@ cdef class Vocab: | ||||||
|             if st != 1: |             if st != 1: | ||||||
|                 break |                 break | ||||||
|             lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) |             lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1) | ||||||
|  |             lexeme.vec = EMPTY_VEC | ||||||
|             st = fread(lexeme, sizeof(LexemeC), 1, fp) |             st = fread(lexeme, sizeof(LexemeC), 1, fp) | ||||||
|             if st != 1: |             if st != 1: | ||||||
|                 break |                 break | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user