mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	dumping all entryC entries + (inefficient) reading back in
This commit is contained in:
		
							parent
							
								
									8e70a564f1
								
							
						
					
					
						commit
						694fea597a
					
				|  | @ -426,16 +426,22 @@ if __name__ == "__main__": | |||
|     # STEP 3 : write KB to file | ||||
|     # TODO | ||||
|     nlp = spacy.load('en_core_web_sm') | ||||
|     kb = KnowledgeBase(vocab=nlp.vocab) | ||||
|     kb.dump(KB_FILE) | ||||
|     print("DUMPED") | ||||
|     kb.load(KB_FILE) | ||||
|     print("LOADED") | ||||
|     kb1 = KnowledgeBase(vocab=nlp.vocab) | ||||
| 
 | ||||
|     # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' | ||||
|     kb1.add_entity(entity="Q53", prob=0.33) | ||||
|     kb1.add_entity(entity="Q17", prob=0.1) | ||||
|     kb1.add_entity(entity="Q007", prob=0.7) | ||||
|     kb1.add_entity(entity="Q44", prob=0.4) | ||||
|     print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) | ||||
| 
 | ||||
|     kb1.dump(KB_FILE) | ||||
| 
 | ||||
|     # STEP 4 : read KB back in from file | ||||
|     # TODO | ||||
| 
 | ||||
|     kb2 = KnowledgeBase(vocab=nlp.vocab) | ||||
|     kb2.load(KB_FILE) | ||||
|     print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) | ||||
| 
 | ||||
|     # STEP 5 : actually use the EL functionality | ||||
|     # add_el(my_kb, nlp) | ||||
|  |  | |||
|  | @ -168,7 +168,6 @@ cdef class Writer: | |||
| 
 | ||||
| cdef class Reader: | ||||
|     cdef FILE* _fp | ||||
|     cdef public int32_t nr_feat | ||||
| 
 | ||||
|     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										64
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							
							
						
						
									
										64
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							|  | @ -157,33 +157,45 @@ cdef class KnowledgeBase: | |||
| 
 | ||||
| 
 | ||||
|     def dump(self, loc): | ||||
|         # TODO: actually dump the data in this KB :-) | ||||
| 
 | ||||
|         cdef int64_t entry_id = 32 | ||||
|         self.vocab.strings.add("Q342") | ||||
|         cdef hash_t entity_hash = self.vocab.strings["Q342"] | ||||
|         cdef float prob = 0.333 | ||||
| 
 | ||||
|         cdef Writer writer = Writer(loc) | ||||
|         writer.write(entry_id, entity_hash, prob) | ||||
| 
 | ||||
|         for key, entry_index in self._entry_index.items(): | ||||
|             entry = self._entries[entry_index] | ||||
|             print("dumping") | ||||
|             print("index", entry_index) | ||||
|             print("hash", entry.entity_hash) | ||||
|             print("prob", entry.prob) | ||||
|             print("") | ||||
|             writer.write(entry_index, entry.entity_hash, entry.prob) | ||||
| 
 | ||||
|         writer.close() | ||||
| 
 | ||||
|     def load(self, loc): | ||||
|         cdef int64_t entry_id | ||||
|         cdef hash_t entity_hash | ||||
|         cdef float prob | ||||
|         cdef _EntryC entry | ||||
|         cdef int32_t dummy_value = 342 | ||||
| 
 | ||||
|         cdef Reader reader = Reader(loc) | ||||
|         reader.read(self.mem, &entry_id, &entity_hash, &prob) | ||||
|         result = reader.read(self.mem, &entry_id, &entity_hash, &prob)  # -1: error, 0: eof after this one | ||||
|         while result: | ||||
|             print("loading") | ||||
|             print("entryID", entry_id) | ||||
|             print("hash", entity_hash) | ||||
|             print("prob", prob) | ||||
|             print("result:", result) | ||||
|             print("") | ||||
|             entry.entity_hash = entity_hash | ||||
|             entry.prob = prob | ||||
| 
 | ||||
|         cdef _EntryC entry | ||||
|         entry.entity_hash = entity_hash | ||||
|         entry.prob = prob | ||||
|             # TODO features and vectors | ||||
|             entry.vector_rows = &dummy_value | ||||
|             entry.feats_row = dummy_value | ||||
| 
 | ||||
|         # TODO | ||||
|         cdef int32_t dummy_value = 342 | ||||
|         entry.vector_rows = &dummy_value | ||||
|         entry.feats_row = dummy_value | ||||
|             # TODO: use set instead of push_back to ensure the index remains the same? | ||||
|             self._entries.push_back(entry) | ||||
|             result = reader.read(self.mem, &entry_id, &entity_hash, &prob) | ||||
| 
 | ||||
| cdef class Writer: | ||||
|     def __init__(self, object loc): | ||||
|  | @ -199,10 +211,7 @@ cdef class Writer: | |||
|         assert status == 0 | ||||
| 
 | ||||
|     cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: | ||||
|         cdef int i = 0 | ||||
| 
 | ||||
|         # TODO: feats_rows and vector rows | ||||
| 
 | ||||
|         _write(&entry_id, sizeof(entry_id), self._fp) | ||||
|         _write(&entry_hash, sizeof(entry_hash), self._fp) | ||||
|         _write(&entry_prob, sizeof(entry_prob), self._fp) | ||||
|  | @ -227,21 +236,30 @@ cdef class Reader: | |||
|         fclose(self._fp) | ||||
| 
 | ||||
|     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: | ||||
|         status = fread(entry_id, sizeof(entry_id), 1, self._fp) | ||||
|         """  | ||||
|         Return values: | ||||
|         -1: error during current read (EOF during call) | ||||
|         0: means we read the last line succesfully (EOF after call) | ||||
|         1: we can continue reading this file """ | ||||
|         status = fread(entry_id, sizeof(int64_t), 1, self._fp) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|                 return 0  # end of file | ||||
|             raise IOError("error reading entry ID from input file") | ||||
| 
 | ||||
|         #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) | ||||
|         status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) | ||||
|         status = fread(entity_hash, sizeof(hash_t), 1, self._fp) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|                 return 0  # end of file | ||||
|             raise IOError("error reading entity hash from input file") | ||||
| 
 | ||||
|         status = fread(prob, sizeof(prob), 1, self._fp) | ||||
|         status = fread(prob, sizeof(float), 1, self._fp) | ||||
|         if status < 1: | ||||
|             if feof(self._fp): | ||||
|                 return 0  # end of file | ||||
|             raise IOError("error reading entity prob from input file") | ||||
| 
 | ||||
|         if feof(self._fp): | ||||
|             return 0 | ||||
|         else: | ||||
|             return 1 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user