mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	adding kb_id as field to token, el as nlp pipeline component
This commit is contained in:
		
							parent
							
								
									5eadf61327
								
							
						
					
					
						commit
						173d45ec5f
					
				
							
								
								
									
										0
									
								
								sandbox_test_sofie/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								sandbox_test_sofie/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										21
									
								
								sandbox_test_sofie/testing_el.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								sandbox_test_sofie/testing_el.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,21 @@
 | 
			
		|||
import spacy
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def add_el():
    """Smoke-test the 'el' pipe on a single sentence.

    Loads the 'en_core_web_sm' pipeline, appends a freshly created 'el'
    component at the end, and prints the pipe names before and after.
    It then processes one hard-coded sentence and prints, for every token,
    its text, tag, part-of-speech and `kb_id`.
    """
    pipeline = spacy.load('en_core_web_sm')
    print("pipes", pipeline.pipe_names)

    # Create the component and register it as the last pipeline step.
    pipeline.add_pipe(pipeline.create_pipe(name='el'), last=True)

    print("pipes", pipeline.pipe_names)
    print()

    parsed = pipeline("Australian striker John hits century")
    for tok in parsed:
        print("token", tok.text, tok.tag_, tok.pos_, tok.kb_id)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Run the entity-linking demo only when this file is executed as a script.
if __name__ == "__main__":
 | 
			
		||||
    add_el()
 | 
			
		||||
| 
						 | 
				
			
			@ -14,7 +14,7 @@ import srsly
 | 
			
		|||
from .tokenizer import Tokenizer
 | 
			
		||||
from .vocab import Vocab
 | 
			
		||||
from .lemmatizer import Lemmatizer
 | 
			
		||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
 | 
			
		||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer, EntityLinker
 | 
			
		||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
 | 
			
		||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 | 
			
		||||
from .pipeline import EntityRuler
 | 
			
		||||
| 
						 | 
				
			
			@ -114,6 +114,7 @@ class Language(object):
 | 
			
		|||
        "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
 | 
			
		||||
        "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
 | 
			
		||||
        "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
 | 
			
		||||
        "el": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
 | 
			
		||||
        "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
 | 
			
		||||
        "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
 | 
			
		||||
        "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -43,6 +43,8 @@ cdef class Morphology:
 | 
			
		|||
 | 
			
		||||
    cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 | 
			
		||||
 | 
			
		||||
    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef enum univ_morph_t:
 | 
			
		||||
    NIL = 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -122,6 +122,9 @@ cdef class Morphology:
 | 
			
		|||
        else:
 | 
			
		||||
            flags[0] &= ~(one << flag_id)
 | 
			
		||||
 | 
			
		||||
    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
        # Store the given knowledge-base ID in the token struct's `kb_id`
        # field. Returns 0 on success; -1 is reserved for signalling an
        # exception (see the `except -1` clause).
        token.kb_id = kb_id
        return 0
 | 
			
		||||
 | 
			
		||||
    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
 | 
			
		||||
                         force=False):
 | 
			
		||||
        """Add a special-case rule to the morphological analyser. Tokens whose
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,7 +1,7 @@
 | 
			
		|||
# coding: utf8
 | 
			
		||||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
 | 
			
		||||
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker  # noqa
 | 
			
		||||
from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
 | 
			
		||||
from .entityruler import EntityRuler  # noqa
 | 
			
		||||
from .hooks import SentenceSegmenter, SimilarityHook  # noqa
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1040,4 +1040,116 @@ cdef class EntityRecognizer(Parser):
 | 
			
		|||
                if move[0] in ("B", "I", "L", "U")))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
 | 
			
		||||
class EntityLinker(Pipe):
    """Pipeline component that writes a knowledge-base ID onto tokens.

    This is still a placeholder implementation: `Model` builds nothing and
    returns None, and `set_annotations` stamps the same dummy ID on every
    token. Until a real (callable) model is plugged in, `predict` yields no
    scores, and the training methods (`update`, `rehearse`) cannot run.
    """
    name = 'el'

    @classmethod
    def Model(cls, nr_class=1, **cfg):
        """Placeholder model factory; currently always returns None.

        nr_class (int): Number of output classes (not used yet).
        **cfg: Config parameters; "token_vector_width" and "architecture"
            are read here.
        RETURNS (None): No model is built yet.
        """
        embed_size = util.env_opt("embed_size", 2000)
        if "token_vector_width" in cfg:
            token_vector_width = cfg["token_vector_width"]
        else:
            token_vector_width = util.env_opt("token_vector_width", 96)
        if cfg.get('architecture') == 'simple_cnn':
            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
            # TODO: build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
            return None
        else:
            # TODO: build_text_classifier(nr_class, **cfg)
            return None

    def __init__(self, vocab, model=True, **cfg):
        """Create the component.

        vocab (Vocab): The shared vocabulary.
        model: A callable model, or True/False/None when no model has been
            created yet.
        **cfg: Config parameters, copied into `self.cfg`.
        """
        self.vocab = vocab
        self.model = model
        self._rehearsal_model = None
        self.cfg = dict(cfg)

    @property
    def labels(self):
        """RETURNS (tuple): The labels added so far, stored in `self.cfg`."""
        # Backed by cfg so that add_label / get_loss / begin_training can
        # read the labels even before any label has been added.
        return tuple(self.cfg.setdefault('labels', []))

    @labels.setter
    def labels(self, value):
        self.cfg['labels'] = tuple(value)

    def __call__(self, doc):
        """Annotate a single document in place.

        doc (Doc): The document to process.
        RETURNS (Doc): The same document, after `set_annotations`.
        """
        scores, tensors = self.predict([doc])
        self.set_annotations([doc], scores, tensors=tensors)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Annotate a stream of documents in mini-batches.

        stream (iterable): The documents to process.
        batch_size (int): Number of documents per batch.
        n_threads (int): Accepted for API compatibility; not used here.
        YIELDS (Doc): The processed documents, in order.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            scores, tensors = self.predict(docs)
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs

    def predict(self, docs):
        """Run the model over a batch of documents.

        docs (list): The documents to score.
        RETURNS (tuple): `(scores, tensors)`, or `(None, None)` while no
            callable model is available.
        """
        # The model defaults to True and `Model` returns None, so calling
        # it unconditionally would raise TypeError for every `pipe()` call.
        if not callable(self.model):
            return None, None
        scores = self.model(docs)
        scores = self.model.ops.asarray(scores)
        tensors = [doc.tensor for doc in docs]
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        """Write the annotations onto the documents.

        docs (list): The documents to modify.
        scores: Model output; currently ignored.
        tensors: Document tensors; currently ignored.
        """
        # TODO Sofie: actually implement this class instead of dummy implementation
        for doc in docs:
            for token in doc:
                token.kb_id = 342

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        """Make one training step. Requires `self.model` to be a real model.

        docs (list): The training documents.
        golds (list): The gold-standard annotations, aligned with `docs`.
        state: Not used here.
        drop (float): The dropout rate passed to `begin_update`.
        sgd (callable): The optimizer passed to the backprop callback.
        losses (dict): Optional; the loss is accumulated under `self.name`.
        """
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        bp_scores(d_scores, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss

    def rehearse(self, docs, drop=0., sgd=None, losses=None):
        """Update towards the rehearsal model's output, if there is one.

        docs (list): The documents to rehearse on.
        drop (float): The dropout rate passed to `begin_update`.
        sgd (callable): The optimizer passed to the backprop callback.
        losses (dict): Optional; the squared gradient sum is accumulated
            under `self.name`.
        """
        if self._rehearsal_model is None:
            return
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        target = self._rehearsal_model(docs)
        gradient = scores - target
        bp_scores(gradient, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += (gradient**2).sum()

    def get_loss(self, docs, golds, scores):
        """Compute the loss and its gradient for a batch.

        docs (list): The batch of documents (not read here).
        golds (list): Gold annotations; `gold.cats` is read for each label.
        scores: The model's scores; assumed to have one row per gold and
            one column per label -- TODO confirm once a model exists.
        RETURNS (tuple): `(mean_square_error, d_scores)`.
        """
        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
        for i, gold in enumerate(golds):
            for j, label in enumerate(self.labels):
                if label in gold.cats:
                    truths[i, j] = gold.cats[label]
                else:
                    # Labels absent from the gold data contribute no gradient.
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
        not_missing = self.model.ops.asarray(not_missing)
        d_scores = (scores-truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = (d_scores**2).sum(axis=1).mean()
        return float(mean_square_error), d_scores

    def add_label(self, label):
        """Add a label to the component.

        label: The label to add.
        RETURNS (int): 0 if the label was already present, otherwise 1.
        RAISES (ValueError): If a model has already been created.
        """
        if label in self.labels:
            return 0
        if self.model not in (None, True, False):
            # This functionality was available previously, but was broken.
            # The problem is that we resize the last layer, but the last layer
            # is actually just an ensemble. We're not resizing the child layers
            # -- a huge problem.
            raise ValueError(Errors.E116)
        self.labels = tuple(list(self.labels) + [label])
        return 1

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
        """Prepare the component for training.

        get_gold_tuples (callable): Not used here.
        pipeline (list): Not used here.
        sgd (callable): An optimizer; one is created when omitted.
        **kwargs: "pretrained_vectors" is read when the model is created.
        RETURNS (callable): The optimizer.
        """
        if self.model is True:
            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
            self.model = self.Model(len(self.labels), **self.cfg)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -71,3 +71,5 @@ cdef struct TokenC:
 | 
			
		|||
    int ent_iob
 | 
			
		||||
    attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
 | 
			
		||||
    hash_t ent_id
 | 
			
		||||
 | 
			
		||||
    hash_t kb_id
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -307,6 +307,14 @@ cdef class Token:
 | 
			
		|||
        def __set__(self, attr_t tag):
 | 
			
		||||
            self.vocab.morphology.assign_tag(self.c, tag)
 | 
			
		||||
 | 
			
		||||
    property kb_id:
        """Knowledge-base ID stored on the underlying token struct.

        RETURNS (uint64): ID of entity (after Entity Linking).
        """
        def __get__(self):
            return self.c.kb_id

        def __set__(self, attr_t kb_id):
            # Write the struct field directly: the Morphology helper this
            # used to call performs exactly this single assignment.
            self.c.kb_id = kb_id
 | 
			
		||||
 | 
			
		||||
    property dep:
 | 
			
		||||
        """RETURNS (uint64): ID of syntactic dependency label."""
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user