mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Draft Vocab to/from disk/bytes
This commit is contained in:
		
							parent
							
								
									e6dd01fc90
								
							
						
					
					
						commit
						2edd96ce47
					
				| 
						 | 
					@ -275,9 +275,9 @@ cdef class Vocab:
 | 
				
			||||||
        path = util.ensure_path(path)
 | 
					        path = util.ensure_path(path)
 | 
				
			||||||
        if not path.exists():
 | 
					        if not path.exists():
 | 
				
			||||||
            path.mkdir()
 | 
					            path.mkdir()
 | 
				
			||||||
        strings_loc = path / 'strings.json'
 | 
					        self.strings.to_disk(path / 'strings.json')
 | 
				
			||||||
        with strings_loc.open('w', encoding='utf8') as file_:
 | 
					        with (path / 'lexemes.bin').open('wb') as file_:
 | 
				
			||||||
            self.strings.dump(file_)
 | 
					            file_.write(self.lexemes_to_bytes())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_disk(self, path):
 | 
					    def from_disk(self, path):
 | 
				
			||||||
        """Loads state from a directory. Modifies the object in place and
 | 
					        """Loads state from a directory. Modifies the object in place and
 | 
				
			||||||
| 
						 | 
					@ -288,11 +288,10 @@ cdef class Vocab:
 | 
				
			||||||
        RETURNS (Vocab): The modified `Vocab` object.
 | 
					        RETURNS (Vocab): The modified `Vocab` object.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        path = util.ensure_path(path)
 | 
					        path = util.ensure_path(path)
 | 
				
			||||||
        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
 | 
					        self.strings.from_disk(path / 'strings.json')
 | 
				
			||||||
            strings_list = ujson.load(file_)
 | 
					        with (path / 'lexemes.bin').open('rb') as file_:
 | 
				
			||||||
        for string in strings_list:
 | 
					            self.lexemes_from_bytes(file_.read())
 | 
				
			||||||
            self.strings.add(string)
 | 
					        return self
 | 
				
			||||||
        self.load_lexemes(path / 'lexemes.bin')
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def to_bytes(self, **exclude):
 | 
					    def to_bytes(self, **exclude):
 | 
				
			||||||
        """Serialize the current state to a binary string.
 | 
					        """Serialize the current state to a binary string.
 | 
				
			||||||
| 
						 | 
					@ -300,7 +299,12 @@ cdef class Vocab:
 | 
				
			||||||
        **exclude: Named attributes to prevent from being serialized.
 | 
					        **exclude: Named attributes to prevent from being serialized.
 | 
				
			||||||
        RETURNS (bytes): The serialized form of the `Vocab` object.
 | 
					        RETURNS (bytes): The serialized form of the `Vocab` object.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        raise NotImplementedError()
 | 
					        data = {}
 | 
				
			||||||
 | 
					        if 'strings' not in exclude:
 | 
				
			||||||
 | 
					            data['strings'] = self.strings.to_bytes()
 | 
				
			||||||
 | 
					        if 'lexemes' not in exclude:
 | 
				
			||||||
 | 
					            data['lexemes'] = self.lexemes_to_bytes
 | 
				
			||||||
 | 
					        return ujson.dumps(data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def from_bytes(self, bytes_data, **exclude):
 | 
					    def from_bytes(self, bytes_data, **exclude):
 | 
				
			||||||
        """Load state from a binary string.
 | 
					        """Load state from a binary string.
 | 
				
			||||||
| 
						 | 
					@ -309,9 +313,14 @@ cdef class Vocab:
 | 
				
			||||||
        **exclude: Named attributes to prevent from being loaded.
 | 
					        **exclude: Named attributes to prevent from being loaded.
 | 
				
			||||||
        RETURNS (Vocab): The `Vocab` object.
 | 
					        RETURNS (Vocab): The `Vocab` object.
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        raise NotImplementedError()
 | 
					        data = ujson.loads(bytes_data)
 | 
				
			||||||
 | 
					        if 'strings' not in exclude:
 | 
				
			||||||
 | 
					            self.strings.from_bytes(data['strings'])
 | 
				
			||||||
 | 
					        if 'lexemes' not in exclude:
 | 
				
			||||||
 | 
					            self.lexemes_from_bytes(data['lexemes'])
 | 
				
			||||||
 | 
					        return self
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def lexemes_to_bytes(self, **exclude):
 | 
					    def lexemes_to_bytes(self):
 | 
				
			||||||
        cdef hash_t key
 | 
					        cdef hash_t key
 | 
				
			||||||
        cdef size_t addr
 | 
					        cdef size_t addr
 | 
				
			||||||
        cdef LexemeC* lexeme = NULL
 | 
					        cdef LexemeC* lexeme = NULL
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user