mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	minimal EL pipe
This commit is contained in:
		
							parent
							
								
									26afa4800f
								
							
						
					
					
						commit
						d0c763ba44
					
				
							
								
								
									
										14
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							| 
						 | 
					@ -109,7 +109,7 @@ cdef class KnowledgeBase:
 | 
				
			||||||
        """Add an entry to the knowledge base."""
 | 
					        """Add an entry to the knowledge base."""
 | 
				
			||||||
        # This is what we'll map the hash key to. It's where the entry will sit
 | 
					        # This is what we'll map the hash key to. It's where the entry will sit
 | 
				
			||||||
        # in the vector of entries, so we can get it later.
 | 
					        # in the vector of entries, so we can get it later.
 | 
				
			||||||
        cdef int64_t entity_index = self._entries.size()
 | 
					        cdef int64_t new_index = self._entries.size()
 | 
				
			||||||
        self._entries.push_back(
 | 
					        self._entries.push_back(
 | 
				
			||||||
            _EntryC(
 | 
					            _EntryC(
 | 
				
			||||||
                entity_hash=entity_hash,
 | 
					                entity_hash=entity_hash,
 | 
				
			||||||
| 
						 | 
					@ -117,22 +117,22 @@ cdef class KnowledgeBase:
 | 
				
			||||||
                feats_row=feats_row,
 | 
					                feats_row=feats_row,
 | 
				
			||||||
                prob=prob
 | 
					                prob=prob
 | 
				
			||||||
            ))
 | 
					            ))
 | 
				
			||||||
        self._entry_index[entity_hash] = entity_index
 | 
					        self._entry_index[entity_hash] = new_index
 | 
				
			||||||
        return entity_index
 | 
					        return new_index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
 | 
					    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
 | 
				
			||||||
        """Connect a mention to a list of potential entities with their prior probabilities ."""
 | 
					        """Connect a mention to a list of potential entities with their prior probabilities ."""
 | 
				
			||||||
        cdef int64_t alias_index = self._aliases_table.size()
 | 
					        cdef int64_t new_index = self._aliases_table.size()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self._aliases_table.push_back(
 | 
					        self._aliases_table.push_back(
 | 
				
			||||||
            _AliasC(
 | 
					            _AliasC(
 | 
				
			||||||
                entry_indices=entry_indices,
 | 
					                entry_indices=entry_indices,
 | 
				
			||||||
                probs=probs
 | 
					                probs=probs
 | 
				
			||||||
            ))
 | 
					            ))
 | 
				
			||||||
        self._alias_index[alias_hash] = alias_index
 | 
					        self._alias_index[alias_hash] = new_index
 | 
				
			||||||
        return alias_index
 | 
					        return new_index
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef inline create_empty_vectors(self):
 | 
					    cdef inline _create_empty_vectors(self):
 | 
				
			||||||
        """ 
 | 
					        """ 
 | 
				
			||||||
        Making sure the first element of each vector is a dummy,
 | 
					        Making sure the first element of each vector is a dummy,
 | 
				
			||||||
        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
 | 
					        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -65,7 +65,7 @@ cdef class KnowledgeBase:
 | 
				
			||||||
        self._alias_index = PreshMap()
 | 
					        self._alias_index = PreshMap()
 | 
				
			||||||
        self.mem = Pool()
 | 
					        self.mem = Pool()
 | 
				
			||||||
        self.strings = StringStore()
 | 
					        self.strings = StringStore()
 | 
				
			||||||
        self.create_empty_vectors()
 | 
					        self._create_empty_vectors()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __len__(self):
 | 
					    def __len__(self):
 | 
				
			||||||
        return self.get_size_entities()
 | 
					        return self.get_size_entities()
 | 
				
			||||||
| 
						 | 
					@ -151,4 +151,3 @@ cdef class KnowledgeBase:
 | 
				
			||||||
                          prior_prob=prob)
 | 
					                          prior_prob=prob)
 | 
				
			||||||
                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
 | 
					                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
 | 
				
			||||||
                if entry_index != 0]
 | 
					                if entry_index != 0]
 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -209,6 +209,10 @@ class Language(object):
 | 
				
			||||||
    def entity(self):
 | 
					    def entity(self):
 | 
				
			||||||
        return self.get_pipe("ner")
 | 
					        return self.get_pipe("ner")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @property
 | 
				
			||||||
 | 
					    def linker(self):
 | 
				
			||||||
 | 
					        return self.get_pipe("el")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
    def matcher(self):
 | 
					    def matcher(self):
 | 
				
			||||||
        return self.get_pipe("matcher")
 | 
					        return self.get_pipe("matcher")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @classmethod
 | 
					    @classmethod
 | 
				
			||||||
    def Model(cls, nr_class=1, **cfg):
 | 
					    def Model(cls, nr_class=1, **cfg):
 | 
				
			||||||
        embed_size = util.env_opt("embed_size", 2000)
 | 
					        # TODO: non-dummy EL implementation
 | 
				
			||||||
        if "token_vector_width" in cfg:
 | 
					        return None
 | 
				
			||||||
            token_vector_width = cfg["token_vector_width"]
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            token_vector_width = util.env_opt("token_vector_width", 96)
 | 
					 | 
				
			||||||
        if cfg.get('architecture') == 'simple_cnn':
 | 
					 | 
				
			||||||
            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
 | 
					 | 
				
			||||||
            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            return None # build_text_classifier(nr_class, **cfg)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, model=True, **cfg):
 | 
				
			||||||
    def __init__(self, vocab, model=True, **cfg):
 | 
					        self.model = False
 | 
				
			||||||
        self.vocab = vocab
 | 
					 | 
				
			||||||
        self.model = model
 | 
					 | 
				
			||||||
        self._rehearsal_model = None
 | 
					 | 
				
			||||||
        self.cfg = dict(cfg)
 | 
					        self.cfg = dict(cfg)
 | 
				
			||||||
 | 
					        self.kb = self.cfg["kb"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, doc):
 | 
					    def __call__(self, doc):
 | 
				
			||||||
        # scores, tensors = self.predict([doc])
 | 
					        self.set_annotations([doc], scores=None, tensors=None)
 | 
				
			||||||
        scores, tensors = None, None
 | 
					 | 
				
			||||||
        self.set_annotations([doc], scores, tensors=tensors)
 | 
					 | 
				
			||||||
        return doc
 | 
					        return doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
					    def pipe(self, stream, batch_size=128, n_threads=-1):
 | 
				
			||||||
 | 
					        """Apply the pipe to a stream of documents.
 | 
				
			||||||
 | 
					        Both __call__ and pipe should delegate to the `predict()`
 | 
				
			||||||
 | 
					        and `set_annotations()` methods.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        for docs in util.minibatch(stream, size=batch_size):
 | 
					        for docs in util.minibatch(stream, size=batch_size):
 | 
				
			||||||
            docs = list(docs)
 | 
					            docs = list(docs)
 | 
				
			||||||
            scores, tensors = self.predict(docs)
 | 
					            self.set_annotations(docs, scores=None, tensors=None)
 | 
				
			||||||
            self.set_annotations(docs, scores, tensors=tensors)
 | 
					 | 
				
			||||||
            yield from docs
 | 
					            yield from docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def predict(self, docs):
 | 
					 | 
				
			||||||
        # self.require_model()
 | 
					 | 
				
			||||||
        scores = self.model(docs)
 | 
					 | 
				
			||||||
        scores = self.model.ops.asarray(scores)
 | 
					 | 
				
			||||||
        tensors = [doc.tensor for doc in docs]
 | 
					 | 
				
			||||||
        return scores, tensors
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def set_annotations(self, docs, scores, tensors=None):
 | 
					    def set_annotations(self, docs, scores, tensors=None):
 | 
				
			||||||
        # TODO Sofie: actually implement this class instead of dummy implementation
 | 
					        # TODO Sofie: actually implement this class instead of dummy implementation
 | 
				
			||||||
        for i, doc in enumerate(docs):
 | 
					        for i, doc in enumerate(docs):
 | 
				
			||||||
| 
						 | 
					@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
 | 
				
			||||||
                    for token in ent:
 | 
					                    for token in ent:
 | 
				
			||||||
                        token.ent_kb_id_ = "Q42"
 | 
					                        token.ent_kb_id_ = "Q42"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
 | 
					 | 
				
			||||||
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
 | 
					 | 
				
			||||||
        loss, d_scores = self.get_loss(docs, golds, scores)
 | 
					 | 
				
			||||||
        bp_scores(d_scores, sgd=sgd)
 | 
					 | 
				
			||||||
        if losses is not None:
 | 
					 | 
				
			||||||
            losses.setdefault(self.name, 0.0)
 | 
					 | 
				
			||||||
            losses[self.name] += loss
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def rehearse(self, docs, drop=0., sgd=None, losses=None):
 | 
					 | 
				
			||||||
        if self._rehearsal_model is None:
 | 
					 | 
				
			||||||
            return
 | 
					 | 
				
			||||||
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
 | 
					 | 
				
			||||||
        target = self._rehearsal_model(docs)
 | 
					 | 
				
			||||||
        gradient = scores - target
 | 
					 | 
				
			||||||
        bp_scores(gradient, sgd=sgd)
 | 
					 | 
				
			||||||
        if losses is not None:
 | 
					 | 
				
			||||||
            losses.setdefault(self.name, 0.0)
 | 
					 | 
				
			||||||
            losses[self.name] += (gradient**2).sum()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def get_loss(self, docs, golds, scores):
 | 
					    def get_loss(self, docs, golds, scores):
 | 
				
			||||||
        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
 | 
					        # TODO
 | 
				
			||||||
        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
 | 
					        pass
 | 
				
			||||||
        for i, gold in enumerate(golds):
 | 
					 | 
				
			||||||
            for j, label in enumerate(self.labels):
 | 
					 | 
				
			||||||
                if label in gold.cats:
 | 
					 | 
				
			||||||
                    truths[i, j] = gold.cats[label]
 | 
					 | 
				
			||||||
                else:
 | 
					 | 
				
			||||||
                    not_missing[i, j] = 0.
 | 
					 | 
				
			||||||
        truths = self.model.ops.asarray(truths)
 | 
					 | 
				
			||||||
        not_missing = self.model.ops.asarray(not_missing)
 | 
					 | 
				
			||||||
        d_scores = (scores-truths) / scores.shape[0]
 | 
					 | 
				
			||||||
        d_scores *= not_missing
 | 
					 | 
				
			||||||
        mean_square_error = (d_scores**2).sum(axis=1).mean()
 | 
					 | 
				
			||||||
        return float(mean_square_error), d_scores
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def add_label(self, label):
 | 
					    def add_label(self, label):
 | 
				
			||||||
        if label in self.labels:
 | 
					        # TODO
 | 
				
			||||||
            return 0
 | 
					        pass
 | 
				
			||||||
        if self.model not in (None, True, False):
 | 
					 | 
				
			||||||
            # This functionality was available previously, but was broken.
 | 
					 | 
				
			||||||
            # The problem is that we resize the last layer, but the last layer
 | 
					 | 
				
			||||||
            # is actually just an ensemble. We're not resizing the child layers
 | 
					 | 
				
			||||||
            # -- a huge problem.
 | 
					 | 
				
			||||||
            raise ValueError(Errors.E116)
 | 
					 | 
				
			||||||
            #smaller = self.model._layers[-1]
 | 
					 | 
				
			||||||
            #larger = Affine(len(self.labels)+1, smaller.nI)
 | 
					 | 
				
			||||||
            #copy_array(larger.W[:smaller.nO], smaller.W)
 | 
					 | 
				
			||||||
            #copy_array(larger.b[:smaller.nO], smaller.b)
 | 
					 | 
				
			||||||
            #self.model._layers[-1] = larger
 | 
					 | 
				
			||||||
        self.labels = tuple(list(self.labels) + [label])
 | 
					 | 
				
			||||||
        return 1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
 | 
					 | 
				
			||||||
                       **kwargs):
 | 
					 | 
				
			||||||
        if self.model is True:
 | 
					 | 
				
			||||||
            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
 | 
					 | 
				
			||||||
            self.model = self.Model(len(self.labels), **self.cfg)
 | 
					 | 
				
			||||||
            link_vectors_to_models(self.vocab)
 | 
					 | 
				
			||||||
        if sgd is None:
 | 
					 | 
				
			||||||
            sgd = self.create_optimizer()
 | 
					 | 
				
			||||||
        return sgd
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
 | 
					__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -37,16 +37,14 @@ def create_kb():
 | 
				
			||||||
    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 | 
					    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 | 
				
			||||||
    print()
 | 
					    print()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for alias in [alias1, "rubbish", alias3]:
 | 
					    return mykb
 | 
				
			||||||
        candidates = mykb.get_candidates(alias)
 | 
					 | 
				
			||||||
        print(len(candidates), "candidates for", alias)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def add_el():
 | 
					def add_el(kb):
 | 
				
			||||||
    nlp = spacy.load('en_core_web_sm')
 | 
					    nlp = spacy.load('en_core_web_sm')
 | 
				
			||||||
    print("pipes before:", nlp.pipe_names)
 | 
					    print("pipes before:", nlp.pipe_names)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    el_pipe = nlp.create_pipe(name='el')
 | 
					    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
 | 
				
			||||||
    nlp.add_pipe(el_pipe, last=True)
 | 
					    nlp.add_pipe(el_pipe, last=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    print("pipes after:", nlp.pipe_names)
 | 
					    print("pipes after:", nlp.pipe_names)
 | 
				
			||||||
| 
						 | 
					@ -62,7 +60,12 @@ def add_el():
 | 
				
			||||||
    for ent in doc.ents:
 | 
					    for ent in doc.ents:
 | 
				
			||||||
        print("ent", ent.text, ent.label_, ent.kb_id_)
 | 
					        print("ent", ent.text, ent.label_, ent.kb_id_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    print()
 | 
				
			||||||
 | 
					    for alias in ["douglassss", "rubbish", "adam"]:
 | 
				
			||||||
 | 
					        candidates = nlp.linker.kb.get_candidates(alias)
 | 
				
			||||||
 | 
					        print(len(candidates), "candidates for", alias)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == "__main__":
 | 
					if __name__ == "__main__":
 | 
				
			||||||
    # add_el()
 | 
					    mykb = create_kb()
 | 
				
			||||||
    create_kb()
 | 
					    add_el(mykb)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user