Mirror of https://github.com/explosion/spaCy.git
Work on Issue #285: intern strings into document-specific pools, to address memory growth on streaming data. StringStore.__getitem__ now raises KeyError when it can't find the string. Use StringStore.intern() to get the old behaviour. Still need to hunt down all uses of StringStore.__getitem__ in the library and do testing, but the logic looks good.
commit 8423e8627f
parent d3dc5718b2
spacy/strings.pxd
@@ -7,6 +7,8 @@ from libc.stdint cimport int64_t
 from .typedefs cimport hash_t
 
 
+DEF UINT64_MAX = 18446744073709551615
+
 cpdef hash_t hash_string(unicode string) except 0
 
 
@@ -22,6 +24,10 @@ cdef class StringStore:
 
     cdef public PreshMap _map
     cdef int64_t _resize_at
+    cdef PreshMap oov_maps
 
-    cdef const Utf8Str* intern(self, unicode py_string) except NULL
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL
+    cpdef int remove_oov_map(self, Pool mem) except -1
+
+    cdef hash_t intern(self, unicode py_string, Pool mem=*) except UINT64_MAX
+    cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
+                                     int length) except NULL
spacy/strings.pyx
@@ -1,3 +1,4 @@
+# cython: infer_types=True
 from __future__ import unicode_literals, absolute_import
 
 cimport cython
@@ -6,7 +7,8 @@ from libc.stdint cimport uint64_t
 
 from murmurhash.mrmr cimport hash64
 
-from preshed.maps cimport map_iter, key_t
+from preshed.maps cimport map_init, map_set, map_get, map_iter
+from preshed.maps cimport MapStruct
 
 from .typedefs cimport hash_t
 
@@ -16,13 +18,17 @@ except ImportError:
     import json
 
 
+DEF UINT64_MAX = 18446744073709551615
+
+
 cpdef hash_t hash_string(unicode string) except 0:
-    chars = string.encode('utf8')
-    return _hash_utf8(chars, len(chars))
+    byte_string = string.encode('utf8')
+    cdef unsigned char* chars = byte_string
+    return _hash_utf8(chars, len(byte_string))
 
 
-cdef hash_t _hash_utf8(char* utf8_string, int length):
-    return hash64(utf8_string, length, 1)
+cdef hash_t _hash_utf8(const unsigned char* utf8_string, int length) nogil:
+    return hash64(<void*>utf8_string, length, 1)
 
 
 cdef unicode _decode(const Utf8Str* string):
@@ -74,6 +80,7 @@ cdef class StringStore:
     def __init__(self, strings=None):
         self.mem = Pool()
         self._map = PreshMap()
+        self.oov_maps = PreshMap()
         self._resize_at = 10000
         self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
         self.size = 1
@@ -108,13 +115,20 @@ cdef class StringStore:
             byte_string = <bytes>string_or_id
             if len(byte_string) == 0:
                 return 0
-            utf8str = self._intern_utf8(byte_string, len(byte_string))
-            return utf8str - self.c
+            key = _hash_utf8(byte_string, len(byte_string))
+            utf8str = <Utf8Str*>self._map.get(key)
+            if utf8str is NULL:
+                raise KeyError(byte_string)
+            else:
+                return utf8str - self.c
         elif isinstance(string_or_id, unicode):
             if len(<unicode>string_or_id) == 0:
                 return 0
-            byte_string = (<unicode>string_or_id).encode('utf8')
-            utf8str = self._intern_utf8(byte_string, len(byte_string))
-            return utf8str - self.c
+            key = hash_string(string_or_id)
+            utf8str = <Utf8Str*>self._map.get(key)
+            if utf8str is NULL:
+                raise KeyError(string_or_id)
+            else:
+                return utf8str - self.c
         else:
             raise TypeError(type(string_or_id))
@@ -131,6 +145,8 @@ cdef class StringStore:
             yield _decode(&self.c[i]) if i > 0 else u''
 
     def __reduce__(self):
+        # TODO: Is it problematic that we don't save the OOV strings?
+        # Probably yes? We're not restoring all the state...
         strings = [""]
         for i in range(1, self.size):
             string = &self.c[i]
@@ -138,27 +154,77 @@ cdef class StringStore:
             strings.append(py_string)
         return (StringStore, (strings,), None, None, None)
 
-    cdef const Utf8Str* intern(self, unicode py_string) except NULL:
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef bytes byte_string = py_string.encode('utf8')
-        return self._intern_utf8(byte_string, len(byte_string))
+    cdef hash_t intern(self, unicode py_string, Pool mem=None) except UINT64_MAX:
+        if mem is None:
+            mem = self.mem
+        cdef hash_t map_key = id(mem)
+        cdef bytes byte_string = py_string.encode('utf8')
+        cdef hash_t key = _hash_utf8(byte_string, len(byte_string))
+        cdef const Utf8Str* utf8str = <Utf8Str*>self._map.get(key)
+        cdef hash_t map_id = id(mem)
+        cdef MapStruct* oov_map
+        if utf8str is not NULL:
+            return utf8str - self.c
+        elif mem is None or mem is self.mem:
+            utf8str = self._intern_utf8(byte_string, len(byte_string))
+            return utf8str - self.c
+        else:
+            new_utf8str = <Utf8Str*>mem.alloc(sizeof(Utf8Str), 1)
+            oov_map = <MapStruct*>self.oov_maps.get(map_key)
+            if oov_map is NULL:
+                oov_map = <MapStruct*>mem.alloc(sizeof(MapStruct), 1)
+                map_init(mem, oov_map, 16)
+                self.oov_maps.set(id(mem), oov_map)
+            new_utf8str[0] = _allocate(mem, byte_string, len(byte_string))
+            map_set(mem, oov_map, key, new_utf8str)
+            return key
+
+    def decode_int(self, hash_t int_, Pool mem=None):
+        cdef hash_t map_key
+        if int_ == 0:
+            return u''
+        elif int_ < <uint64_t>self.size:
+            return _decode(&self.c[int_])
+        elif mem is None or mem is self.mem:
+            raise IndexError(int_)
+        else:
+            map_key = id(mem)
+            oov_map = <MapStruct*>self.oov_maps.get(map_key)
+            if oov_map is NULL:
+                raise IndexError(
+                    "Trying to decode integer into string, but it's not in " +
+                    "the main store, and the memory pool hasn't been seen before.\n" +
+                    ("int_ == %d\n" % int_) +
+                    "id(mem) == %d" % map_key)
+            else:
+                utf8str = <const Utf8Str*>map_get(oov_map, int_)
+                if utf8str is NULL:
+                    raise IndexError(
+                        "Trying to decode integer into string, but it's not in " +
+                        "the main store. The integer was also not found in the " +
+                        "indicated auxiliary pool " +
+                        "(which is usually specific to a document)." +
+                        ("int_ == %d\n" % int_) +
+                        "id(mem) == %d" % map_key)
+                return _decode(utf8str)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) except NULL:
-        # 0 means missing, but we don't bother offsetting the index.
-        cdef hash_t key = _hash_utf8(utf8_string, length)
-        value = <Utf8Str*>self._map.get(key)
-        if value is not NULL:
-            return value
-
+    cdef const Utf8Str* _intern_utf8(self, const unsigned char* utf8_string,
+            int length) except NULL:
         if self.size == self._resize_at:
             self._realloc()
-        self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        key = _hash_utf8(utf8_string, length)
+        self.c[self.size] = _allocate(self.mem, utf8_string, length)
         self._map.set(key, <void*>&self.c[self.size])
         self.size += 1
         return &self.c[self.size-1]
 
+    cpdef int remove_oov_map(self, Pool mem) except -1:
+        cdef hash_t key = id(mem)
+        self._maps.pop(key)
+
     def dump(self, file_):
+        # TODO: Is it problematic that we don't save the OOV strings? No, right?
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
@@ -180,8 +246,8 @@ cdef class StringStore:
         # we resize our array. So, first we remap to indices, then we resize,
         # then we can acquire the new pointers.
         cdef Pool tmp_mem = Pool()
-        keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
-        cdef key_t key
+        keys = <hash_t*>tmp_mem.alloc(self.size, sizeof(hash_t))
+        cdef hash_t key
         cdef void* value
         cdef const Utf8Str ptr
         cdef int i = 0
spacy/tokens/token.pyx
@@ -116,11 +116,11 @@ cdef class Token:
 
     property text_with_ws:
         def __get__(self):
-            cdef unicode orth = self.vocab.strings[self.c.lex.orth]
+            orth_ = self.orth_
             if self.c.spacy:
-                return orth + u' '
+                return orth_ + u' '
             else:
-                return orth
+                return orth_
 
     property prob:
         def __get__(self):
@@ -403,7 +403,7 @@ cdef class Token:
 
     property ent_type_:
         def __get__(self):
-            return self.vocab.strings[self.c.ent_type]
+            return self.vocab.strings.decode_int(self.c.ent_type, mem=self.mem)
 
     property ent_iob_:
         def __get__(self):
@@ -424,7 +424,7 @@ cdef class Token:
     property ent_id_:
         '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
         def __get__(self):
-            return self.vocab.strings[self.c.ent_id]
+            return self.vocab.strings.decode_int(self.c.ent_id, mem=self.mem)
 
         def __set__(self, hash_t key):
             # TODO
@@ -438,35 +438,35 @@ cdef class Token:
 
     property orth_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.orth]
+            return self.vocab.strings.decode_int(self.c.lex.orth, mem=self.mem)
 
     property lower_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.lower]
+            return self.vocab.strings.decode_int(self.c.lex.lower, mem=self.mem)
 
     property norm_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.norm]
+            return self.vocab.strings.decode_int(self.c.lex.norm, mem=self.mem)
 
     property shape_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.shape]
+            return self.vocab.strings.decode_int(self.c.lex.shape, mem=self.mem)
 
     property prefix_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.prefix]
+            return self.vocab.strings.decode_int(self.c.lex.prefix, mem=self.mem)
 
     property suffix_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.suffix]
+            return self.vocab.strings.decode_int(self.c.lex.suffix, mem=self.mem)
 
     property lang_:
         def __get__(self):
-            return self.vocab.strings[self.c.lex.lang]
+            return self.vocab.strings.decode_int(self.c.lex.lang, mem=self.mem)
 
     property lemma_:
         def __get__(self):
-            return self.vocab.strings[self.c.lemma]
+            return self.vocab.strings.decode_int(self.c.lemma, mem=self.mem)
 
     property pos_:
         def __get__(self):
@@ -474,13 +474,13 @@ cdef class Token:
 
     property tag_:
         def __get__(self):
-            return self.vocab.strings[self.c.tag]
+            return self.vocab.strings.decode_int(self.c.tag, mem=self.mem)
 
     property dep_:
         def __get__(self):
-            return self.vocab.strings[self.c.dep]
+            return self.vocab.decode_int(self.c.dep, mem=self.mem)
         def __set__(self, unicode label):
-            self.c.dep = self.vocab.strings[label]
+            self.c.dep = self.vocab.strings.intern(label, mem=self.mem)
 
     property is_oov:
         def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
spacy/typedefs.pxd
@@ -1,6 +1,10 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t, int32_t
 from libc.stdint cimport uint8_t
 
+from libc.stdint cimport UINT64_MAX as err_hash_t
+from libc.stdint cimport UINT64_MAX as err_flags_t
+from libc.stdint cimport UINT64_MAX as err_len_t
+from libc.stdint cimport UINT64_MAX as err_tag_t
 
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
spacy/vocab.pxd
@@ -27,6 +27,7 @@ cdef struct _Cached:
 cdef class Vocab:
     cdef Pool mem
     cpdef readonly StringStore strings
+    cpdef readonly dict oov_stores
     cpdef readonly Morphology morphology
     cdef readonly int length
     cdef public object _serializer
spacy/vocab.pyx
@@ -93,6 +93,7 @@ cdef class Vocab:
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        self.oov_stores = {}
         # Load strings in a special order, so that we have an onset number for
         # the vocabulary. This way, when words are added in order, the orth ID
         # is the frequency rank of the word, plus a certain offset. The structural
@@ -140,7 +141,7 @@ cdef class Vocab:
         lex = <LexemeC*>self._by_hash.get(key)
         cdef size_t addr
         if lex != NULL:
-            if lex.orth != self.strings[string]:
+            if (string not in self.strings) or (lex.orth != self.strings[string]):
                 raise LookupError.mismatched_strings(
                     lex.orth, self.strings[string], self.strings[lex.orth], string)
             return lex
@@ -163,10 +164,10 @@ cdef class Vocab:
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
         cdef bint is_oov = mem is not self.mem
-        if len(string) < 3:
+        if len(string) < 3 or not is_oov:
             mem = self.mem
         lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        lex.orth = self.strings[string]
+        lex.orth = self.strings.intern(string, mem=mem)
         lex.length = len(string)
         lex.id = self.length
         lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
@@ -174,7 +175,7 @@ cdef class Vocab:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
                 if isinstance(value, unicode):
-                    value = self.strings[value]
+                    value = self.strings.intern(value)
                 if attr == PROB:
                     lex.prob = value
                 else:
@@ -205,7 +206,8 @@ cdef class Vocab:
 
     def __getitem__(self,  id_or_string):
         '''Retrieve a lexeme, given an int ID or a unicode string.  If a previously
-        unseen unicode string is given, a new lexeme is created and stored.
+        unseen unicode string is given, a new lexeme is created and stored, and
+        the string is interned in the vocabulary.
 
         Args:
             id_or_string (int or unicode):
@@ -220,7 +222,7 @@ cdef class Vocab:
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:
-            orth = self.strings[id_or_string]
+            orth = self.strings.intern(id_or_string)
         else:
             orth = id_or_string
         return Lexeme(self, orth)
@@ -236,7 +238,7 @@ cdef class Vocab:
             if 'pos' in props:
                 self.morphology.assign_tag(token, props['pos'])
             if 'L' in props:
-                tokens[i].lemma = self.strings[props['L']]
+                tokens[i].lemma = self.strings.intern(props['L'])
             for feature, value in props.get('morph', {}).items():
                 self.morphology.assign_feature(&token.morph, feature, value)
         return tokens