mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	dumping all entryC entries + (inefficient) reading back in
This commit is contained in:
		
							parent
							
								
									8e70a564f1
								
							
						
					
					
						commit
						694fea597a
					
				|  | @ -426,16 +426,22 @@ if __name__ == "__main__": | ||||||
|     # STEP 3 : write KB to file |     # STEP 3 : write KB to file | ||||||
|     # TODO |     # TODO | ||||||
|     nlp = spacy.load('en_core_web_sm') |     nlp = spacy.load('en_core_web_sm') | ||||||
|     kb = KnowledgeBase(vocab=nlp.vocab) |     kb1 = KnowledgeBase(vocab=nlp.vocab) | ||||||
|     kb.dump(KB_FILE) |  | ||||||
|     print("DUMPED") |  | ||||||
|     kb.load(KB_FILE) |  | ||||||
|     print("LOADED") |  | ||||||
| 
 | 
 | ||||||
|     # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' |     kb1.add_entity(entity="Q53", prob=0.33) | ||||||
|  |     kb1.add_entity(entity="Q17", prob=0.1) | ||||||
|  |     kb1.add_entity(entity="Q007", prob=0.7) | ||||||
|  |     kb1.add_entity(entity="Q44", prob=0.4) | ||||||
|  |     print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) | ||||||
|  | 
 | ||||||
|  |     kb1.dump(KB_FILE) | ||||||
| 
 | 
 | ||||||
|     # STEP 4 : read KB back in from file |     # STEP 4 : read KB back in from file | ||||||
|     # TODO |     # TODO | ||||||
| 
 | 
 | ||||||
|  |     kb2 = KnowledgeBase(vocab=nlp.vocab) | ||||||
|  |     kb2.load(KB_FILE) | ||||||
|  |     print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) | ||||||
|  | 
 | ||||||
|     # STEP 5 : actually use the EL functionality |     # STEP 5 : actually use the EL functionality | ||||||
|     # add_el(my_kb, nlp) |     # add_el(my_kb, nlp) | ||||||
|  |  | ||||||
|  | @ -168,7 +168,6 @@ cdef class Writer: | ||||||
| 
 | 
 | ||||||
| cdef class Reader: | cdef class Reader: | ||||||
|     cdef FILE* _fp |     cdef FILE* _fp | ||||||
|     cdef public int32_t nr_feat |  | ||||||
| 
 | 
 | ||||||
|     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 |     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
							
								
								
									
										64
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							
							
						
						
									
										64
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							|  | @ -157,33 +157,45 @@ cdef class KnowledgeBase: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|     def dump(self, loc): |     def dump(self, loc): | ||||||
|         # TODO: actually dump the data in this KB :-) |  | ||||||
| 
 |  | ||||||
|         cdef int64_t entry_id = 32 |  | ||||||
|         self.vocab.strings.add("Q342") |  | ||||||
|         cdef hash_t entity_hash = self.vocab.strings["Q342"] |  | ||||||
|         cdef float prob = 0.333 |  | ||||||
| 
 |  | ||||||
|         cdef Writer writer = Writer(loc) |         cdef Writer writer = Writer(loc) | ||||||
|         writer.write(entry_id, entity_hash, prob) | 
 | ||||||
|  |         for key, entry_index in self._entry_index.items(): | ||||||
|  |             entry = self._entries[entry_index] | ||||||
|  |             print("dumping") | ||||||
|  |             print("index", entry_index) | ||||||
|  |             print("hash", entry.entity_hash) | ||||||
|  |             print("prob", entry.prob) | ||||||
|  |             print("") | ||||||
|  |             writer.write(entry_index, entry.entity_hash, entry.prob) | ||||||
|  | 
 | ||||||
|         writer.close() |         writer.close() | ||||||
| 
 | 
 | ||||||
|     def load(self, loc): |     def load(self, loc): | ||||||
|         cdef int64_t entry_id |         cdef int64_t entry_id | ||||||
|         cdef hash_t entity_hash |         cdef hash_t entity_hash | ||||||
|         cdef float prob |         cdef float prob | ||||||
|  |         cdef _EntryC entry | ||||||
|  |         cdef int32_t dummy_value = 342 | ||||||
| 
 | 
 | ||||||
|         cdef Reader reader = Reader(loc) |         cdef Reader reader = Reader(loc) | ||||||
|         reader.read(self.mem, &entry_id, &entity_hash, &prob) |         result = reader.read(self.mem, &entry_id, &entity_hash, &prob)  # -1: error, 0: eof after this one | ||||||
|  |         while result: | ||||||
|  |             print("loading") | ||||||
|  |             print("entryID", entry_id) | ||||||
|  |             print("hash", entity_hash) | ||||||
|  |             print("prob", prob) | ||||||
|  |             print("result:", result) | ||||||
|  |             print("") | ||||||
|  |             entry.entity_hash = entity_hash | ||||||
|  |             entry.prob = prob | ||||||
| 
 | 
 | ||||||
|         cdef _EntryC entry |             # TODO features and vectors | ||||||
|         entry.entity_hash = entity_hash |             entry.vector_rows = &dummy_value | ||||||
|         entry.prob = prob |             entry.feats_row = dummy_value | ||||||
| 
 | 
 | ||||||
|         # TODO |             # TODO: use set instead of push_back to ensure the index remains the same? | ||||||
|         cdef int32_t dummy_value = 342 |             self._entries.push_back(entry) | ||||||
|         entry.vector_rows = &dummy_value |             result = reader.read(self.mem, &entry_id, &entity_hash, &prob) | ||||||
|         entry.feats_row = dummy_value |  | ||||||
| 
 | 
 | ||||||
| cdef class Writer: | cdef class Writer: | ||||||
|     def __init__(self, object loc): |     def __init__(self, object loc): | ||||||
|  | @ -199,10 +211,7 @@ cdef class Writer: | ||||||
|         assert status == 0 |         assert status == 0 | ||||||
| 
 | 
 | ||||||
|     cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: |     cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: | ||||||
|         cdef int i = 0 |  | ||||||
| 
 |  | ||||||
|         # TODO: feats_rows and vector rows |         # TODO: feats_rows and vector rows | ||||||
| 
 |  | ||||||
|         _write(&entry_id, sizeof(entry_id), self._fp) |         _write(&entry_id, sizeof(entry_id), self._fp) | ||||||
|         _write(&entry_hash, sizeof(entry_hash), self._fp) |         _write(&entry_hash, sizeof(entry_hash), self._fp) | ||||||
|         _write(&entry_prob, sizeof(entry_prob), self._fp) |         _write(&entry_prob, sizeof(entry_prob), self._fp) | ||||||
|  | @ -227,21 +236,30 @@ cdef class Reader: | ||||||
|         fclose(self._fp) |         fclose(self._fp) | ||||||
| 
 | 
 | ||||||
|     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: |     cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: | ||||||
|         status = fread(entry_id, sizeof(entry_id), 1, self._fp) |         """  | ||||||
|  |         Return values: | ||||||
|  |         -1: error during current read (an IOError is raised) | ||||||
|  |         0: means we read the last line successfully (EOF after call) | ||||||
|  |         1: we can continue reading this file """ | ||||||
|  |         status = fread(entry_id, sizeof(int64_t), 1, self._fp) | ||||||
|         if status < 1: |         if status < 1: | ||||||
|             if feof(self._fp): |             if feof(self._fp): | ||||||
|                 return 0  # end of file |                 return 0  # end of file | ||||||
|             raise IOError("error reading entry ID from input file") |             raise IOError("error reading entry ID from input file") | ||||||
| 
 | 
 | ||||||
|         #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) |         status = fread(entity_hash, sizeof(hash_t), 1, self._fp) | ||||||
|         status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) |  | ||||||
|         if status < 1: |         if status < 1: | ||||||
|             if feof(self._fp): |             if feof(self._fp): | ||||||
|                 return 0  # end of file |                 return 0  # end of file | ||||||
|             raise IOError("error reading entity hash from input file") |             raise IOError("error reading entity hash from input file") | ||||||
| 
 | 
 | ||||||
|         status = fread(prob, sizeof(prob), 1, self._fp) |         status = fread(prob, sizeof(float), 1, self._fp) | ||||||
|         if status < 1: |         if status < 1: | ||||||
|             if feof(self._fp): |             if feof(self._fp): | ||||||
|                 return 0  # end of file |                 return 0  # end of file | ||||||
|             raise IOError("error reading entity prob from input file") |             raise IOError("error reading entity prob from input file") | ||||||
|  | 
 | ||||||
|  |         if feof(self._fp): | ||||||
|  |             return 0 | ||||||
|  |         else: | ||||||
|  |             return 1 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user