mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	annotate kb_id through ents in doc
This commit is contained in:
		
							parent
							
								
									d849eb2455
								
							
						
					
					
						commit
						735fc2a735
					
				|  | @ -3,18 +3,23 @@ import spacy | |||
| 
 | ||||
| def add_el(): | ||||
|     nlp = spacy.load('en_core_web_sm') | ||||
|     print("pipes", nlp.pipe_names) | ||||
|     print("pipes before:", nlp.pipe_names) | ||||
| 
 | ||||
|     el_pipe = nlp.create_pipe(name='el') | ||||
|     nlp.add_pipe(el_pipe, last=True) | ||||
| 
 | ||||
|     print("pipes", nlp.pipe_names) | ||||
|     print("pipes after:", nlp.pipe_names) | ||||
|     print() | ||||
| 
 | ||||
|     text = "Australian striker John hits century" | ||||
|     text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel." | ||||
|     doc = nlp(text) | ||||
| 
 | ||||
|     for token in doc: | ||||
|         print("token", token.text, token.tag_, token.pos_, token.kb_id) | ||||
|         print("token", token.text, token.ent_type_, token.ent_kb_id_) | ||||
| 
 | ||||
|     print() | ||||
|     for ent in doc.ents: | ||||
|         print("ent", ent.text, ent.label_, ent.kb_id_) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  |  | |||
|  | @ -43,8 +43,6 @@ cdef class Morphology: | |||
| 
 | ||||
|     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 | ||||
| 
 | ||||
|     cdef int assign_kb_id(self, TokenC* token, kb_id) except -1 | ||||
| 
 | ||||
| 
 | ||||
| cdef enum univ_morph_t: | ||||
|     NIL = 0 | ||||
|  |  | |||
|  | @ -123,9 +123,6 @@ cdef class Morphology: | |||
|         else: | ||||
|             flags[0] &= ~(one << flag_id) | ||||
| 
 | ||||
|     cdef int assign_kb_id(self, TokenC* token, kb_id) except -1: | ||||
|         token.kb_id = kb_id | ||||
| 
 | ||||
|     def add_special_case(self, unicode tag_str, unicode orth_str, attrs, | ||||
|                          force=False): | ||||
|         """Add a special-case rule to the morphological analyser. Tokens whose | ||||
|  |  | |||
|  | @ -1104,8 +1104,10 @@ class EntityLinker(Pipe): | |||
|     def set_annotations(self, docs, scores, tensors=None): | ||||
|         # TODO Sofie: actually implement this class instead of dummy implementation | ||||
|         for i, doc in enumerate(docs): | ||||
|             for token in doc: | ||||
|                 token.kb_id = 342 | ||||
|             for ent in doc.ents: | ||||
|                 if ent.label_ in ["PERSON", "PER"]: | ||||
|                     for token in ent: | ||||
|                         token.ent_kb_id_ = "Q42" | ||||
| 
 | ||||
|     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|         scores, bp_scores = self.model.begin_update(docs, drop=drop) | ||||
|  |  | |||
|  | @ -70,6 +70,5 @@ cdef struct TokenC: | |||
|     int sent_start | ||||
|     int ent_iob | ||||
|     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. | ||||
|     attr_t ent_kb_id | ||||
|     hash_t ent_id | ||||
| 
 | ||||
|     hash_t kb_id | ||||
|  |  | |||
|  | @ -326,7 +326,7 @@ cdef class Doc: | |||
|     def doc(self): | ||||
|         return self | ||||
| 
 | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, vector=None): | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): | ||||
|         """Create a `Span` object from the slice `doc.text[start : end]`. | ||||
| 
 | ||||
|         doc (Doc): The parent document. | ||||
|  | @ -334,6 +334,7 @@ cdef class Doc: | |||
|         end (int): The index of the first character after the span. | ||||
|         label (uint64 or string): A label to attach to the Span, e.g. for | ||||
|             named entities. | ||||
|         kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity. | ||||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of | ||||
|             the span. | ||||
|         RETURNS (Span): The newly constructed object. | ||||
|  | @ -342,6 +343,8 @@ cdef class Doc: | |||
|         """ | ||||
|         if not isinstance(label, int): | ||||
|             label = self.vocab.strings.add(label) | ||||
|         if not isinstance(kb_id, int): | ||||
|             kb_id = self.vocab.strings.add(kb_id) | ||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||
|         if start == -1: | ||||
|             return None | ||||
|  | @ -350,7 +353,7 @@ cdef class Doc: | |||
|             return None | ||||
|         # Currently we have the token index, we want the range-end index | ||||
|         end += 1 | ||||
|         cdef Span span = Span(self, start, end, label=label, vector=vector) | ||||
|         cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector) | ||||
|         return span | ||||
| 
 | ||||
|     def similarity(self, other): | ||||
|  | @ -484,6 +487,7 @@ cdef class Doc: | |||
|             cdef const TokenC* token | ||||
|             cdef int start = -1 | ||||
|             cdef attr_t label = 0 | ||||
|             cdef attr_t kb_id = 0 | ||||
|             output = [] | ||||
|             for i in range(self.length): | ||||
|                 token = &self.c[i] | ||||
|  | @ -493,16 +497,18 @@ cdef class Doc: | |||
|                         raise ValueError(Errors.E093.format(seq=" ".join(seq))) | ||||
|                 elif token.ent_iob == 2 or token.ent_iob == 0: | ||||
|                     if start != -1: | ||||
|                         output.append(Span(self, start, i, label=label)) | ||||
|                         output.append(Span(self, start, i, label=label, kb_id=kb_id)) | ||||
|                     start = -1 | ||||
|                     label = 0 | ||||
|                     kb_id = 0 | ||||
|                 elif token.ent_iob == 3: | ||||
|                     if start != -1: | ||||
|                         output.append(Span(self, start, i, label=label)) | ||||
|                         output.append(Span(self, start, i, label=label, kb_id=kb_id)) | ||||
|                     start = i | ||||
|                     label = token.ent_type | ||||
|                     kb_id = token.ent_kb_id | ||||
|             if start != -1: | ||||
|                 output.append(Span(self, start, self.length, label=label)) | ||||
|                 output.append(Span(self, start, self.length, label=label, kb_id=kb_id)) | ||||
|             return tuple(output) | ||||
| 
 | ||||
|         def __set__(self, ents): | ||||
|  |  | |||
|  | @ -11,6 +11,7 @@ cdef class Span: | |||
|     cdef readonly int start_char | ||||
|     cdef readonly int end_char | ||||
|     cdef readonly attr_t label | ||||
|     cdef readonly attr_t kb_id | ||||
| 
 | ||||
|     cdef public _vector | ||||
|     cdef public _vector_norm | ||||
|  |  | |||
|  | @ -85,13 +85,14 @@ cdef class Span: | |||
|         return Underscore.span_extensions.pop(name) | ||||
| 
 | ||||
|     def __cinit__(self, Doc doc, int start, int end, label=0, vector=None, | ||||
|                   vector_norm=None): | ||||
|                   vector_norm=None, kb_id=0): | ||||
|         """Create a `Span` object from the slice `doc[start : end]`. | ||||
| 
 | ||||
|         doc (Doc): The parent document. | ||||
|         start (int): The index of the first token of the span. | ||||
|         end (int): The index of the first token after the span. | ||||
|         label (uint64): A label to attach to the Span, e.g. for named entities. | ||||
|         kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. | ||||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation | ||||
|             of the span. | ||||
|         RETURNS (Span): The newly constructed object. | ||||
|  | @ -115,6 +116,7 @@ cdef class Span: | |||
|         self.label = label | ||||
|         self._vector = vector | ||||
|         self._vector_norm = vector_norm | ||||
|         self.kb_id = kb_id | ||||
| 
 | ||||
|     def __richcmp__(self, Span other, int op): | ||||
|         if other is None: | ||||
|  | @ -655,6 +657,13 @@ cdef class Span: | |||
|                 label_ = '' | ||||
|             raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) | ||||
| 
 | ||||
|     property kb_id_: | ||||
|         """RETURNS (unicode): The named entity's KB ID.""" | ||||
|         def __get__(self): | ||||
|             return self.doc.vocab.strings[self.kb_id] | ||||
|         def __set__(self, unicode kb_id_): | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='kb_id_')) | ||||
| 
 | ||||
| 
 | ||||
| cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: | ||||
|     # Don't allow spaces to be the root, if there are | ||||
|  |  | |||
|  | @ -354,14 +354,6 @@ cdef class Token: | |||
|         def __set__(self, attr_t tag): | ||||
|             self.vocab.morphology.assign_tag(self.c, tag) | ||||
| 
 | ||||
|     property kb_id: | ||||
|         """RETURNS (uint64): ID of entity (after Entity Linking).""" | ||||
|         def __get__(self): | ||||
|             return self.c.kb_id | ||||
| 
 | ||||
|         def __set__(self, attr_t kb_id): | ||||
|             self.vocab.morphology.assign_kb_id(self.c, kb_id) | ||||
| 
 | ||||
|     property dep: | ||||
|         """RETURNS (uint64): ID of syntactic dependency label.""" | ||||
|         def __get__(self): | ||||
|  | @ -777,6 +769,22 @@ cdef class Token: | |||
|         def __set__(self, name): | ||||
|             self.c.ent_id = self.vocab.strings.add(name) | ||||
| 
 | ||||
|     property ent_kb_id: | ||||
|         """RETURNS (uint64): Named entity KB ID.""" | ||||
|         def __get__(self): | ||||
|             return self.c.ent_kb_id | ||||
| 
 | ||||
|         def __set__(self, attr_t ent_kb_id): | ||||
|             self.c.ent_kb_id = ent_kb_id | ||||
| 
 | ||||
|     property ent_kb_id_: | ||||
|         """RETURNS (unicode): Named entity KB ID.""" | ||||
|         def __get__(self): | ||||
|             return self.vocab.strings[self.c.ent_kb_id] | ||||
| 
 | ||||
|         def __set__(self, ent_kb_id): | ||||
|             self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) | ||||
| 
 | ||||
|     @property | ||||
|     def whitespace_(self): | ||||
|         """RETURNS (unicode): The trailing whitespace character, if present.""" | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user