mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* draft de/serialization functions in doc.pyx
This commit is contained in:
		
							parent
							
								
									9d956b07e9
								
							
						
					
					
						commit
						0e07c1ed2a
					
				| 
						 | 
					@ -278,7 +278,7 @@ cdef class Doc:
 | 
				
			||||||
            self.data[i].lex = &EMPTY_LEXEME
 | 
					            self.data[i].lex = &EMPTY_LEXEME
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int set_parse(self, const TokenC* parsed) except -1:
 | 
					    cdef int set_parse(self, const TokenC* parsed) except -1:
 | 
				
			||||||
        # TODO: This method is fairly misleading atm. It's used by GreedyParser
 | 
					        # TODO: This method is fairly misleading atm. It's used by Parser
 | 
				
			||||||
        # to actually apply the parse calculated. Need to rethink this.
 | 
					        # to actually apply the parse calculated. Need to rethink this.
 | 
				
			||||||
        self.is_parsed = True
 | 
					        self.is_parsed = True
 | 
				
			||||||
        for i in range(self.length):
 | 
					        for i in range(self.length):
 | 
				
			||||||
| 
						 | 
					@ -369,40 +369,40 @@ cdef class Doc:
 | 
				
			||||||
        # Return the merged Python object
 | 
					        # Return the merged Python object
 | 
				
			||||||
        return self[start]
 | 
					        return self[start]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def serialize(self, bits=None):
 | 
					    def serialize(self, codecs, bits=None):
 | 
				
			||||||
        if bits is None:
 | 
					        if bits is None:
 | 
				
			||||||
            bits = BitArray()
 | 
					            bits = BitArray()
 | 
				
			||||||
        codec = self.vocab.codec
 | 
					        array = self.to_array([codec.attr_id for codec in codecs])
 | 
				
			||||||
        ids = numpy.zeros(shape=(len(self),), dtype=numpy.uint32)
 | 
					        for i, codec in enumerate(codecs):
 | 
				
			||||||
        cdef int i
 | 
					            codec.encode(array[i,], bits)
 | 
				
			||||||
        for i in range(self.length):
 | 
					 | 
				
			||||||
            ids[i] = self.data[i].lex.id
 | 
					 | 
				
			||||||
        bits = codec.encode(ids, bits=bits)
 | 
					 | 
				
			||||||
        for i in range(self.length):
 | 
					 | 
				
			||||||
            bits.append(self.data[i].spacy)
 | 
					 | 
				
			||||||
        return bits
 | 
					        return bits
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def deserialize(Vocab vocab, bits):
 | 
					    def deserialize(Vocab vocab, bits):
 | 
				
			||||||
        biterator = iter(bits)
 | 
					        biterator = iter(bits)
 | 
				
			||||||
        ids = vocab.codec.decode(biterator)
 | 
					        ids = vocab.lex_codec.decode(bits)
 | 
				
			||||||
        spaces = []
 | 
					 | 
				
			||||||
        for bit in biterator:
 | 
					 | 
				
			||||||
            spaces.append(bit)
 | 
					 | 
				
			||||||
            if len(spaces) == len(ids):
 | 
					 | 
				
			||||||
                break
 | 
					 | 
				
			||||||
        string = u''
 | 
					 | 
				
			||||||
        cdef const LexemeC* lex
 | 
					 | 
				
			||||||
        for id_, space in zip(ids, spaces):
 | 
					 | 
				
			||||||
            lex = vocab.lexemes[id_]
 | 
					 | 
				
			||||||
            string += vocab.strings[lex.orth]
 | 
					 | 
				
			||||||
            if space:
 | 
					 | 
				
			||||||
                string += u' '
 | 
					 | 
				
			||||||
        cdef Doc doc = Doc(vocab)
 | 
					        cdef Doc doc = Doc(vocab)
 | 
				
			||||||
        cdef bint has_space = False
 | 
					        cdef int id_
 | 
				
			||||||
        cdef int idx = 0
 | 
					        for id_ in ids:
 | 
				
			||||||
        for i, id_ in enumerate(ids):
 | 
					            is_spacy = biterator.next()
 | 
				
			||||||
            lex = vocab.lexemes[id_]
 | 
					            doc.push_back(vocab.lexemes.at(id_), is_spacy)
 | 
				
			||||||
            has_space = spaces[i]
 | 
					       
 | 
				
			||||||
            doc.push_back(lex, has_space)
 | 
					        cdef int i
 | 
				
			||||||
 | 
					        for codec in vocab.annotation_codecs:
 | 
				
			||||||
 | 
					            values = codec.decode(biterator)
 | 
				
			||||||
 | 
					            if codec.attr_id == HEAD:
 | 
				
			||||||
 | 
					                for i, head in enumerate(values):
 | 
				
			||||||
 | 
					                    doc.data[i].head = head
 | 
				
			||||||
 | 
					            elif codec.attr_id == TAG:
 | 
				
			||||||
 | 
					                for i, tag in enumerate(values):
 | 
				
			||||||
 | 
					                    doc.data[i].tag = tag
 | 
				
			||||||
 | 
					            elif codec.attr_id == DEP: 
 | 
				
			||||||
 | 
					                for i, dep in enumerate(values):
 | 
				
			||||||
 | 
					                    doc.data[i].dep = dep
 | 
				
			||||||
 | 
					            elif codec.attr_id == ENT_IOB:
 | 
				
			||||||
 | 
					                for i, ent_iob in enumerate(values):
 | 
				
			||||||
 | 
					                    doc.data[i].ent_iob = ent_iob
 | 
				
			||||||
 | 
					            elif codec.attr_id == ENT_TYPE:
 | 
				
			||||||
 | 
					                for i, ent_type in enumerate(values):
 | 
				
			||||||
 | 
					                    doc.data[i].ent_type = ent_type
 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user