From 78dd3e11da60532dc6f4c5cbcd76fa7577d3cb33 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 13 Jun 2019 16:25:39 +0200 Subject: [PATCH] write entity linking pipe to file and keep vocab consistent between kb and nlp --- .../wiki_entity_linking/kb_creator.py | 4 +- .../wiki_entity_linking/wiki_nel_pipeline.py | 145 +++++++++------- spacy/kb.pyx | 6 + spacy/language.py | 9 + spacy/pipeline/pipes.pyx | 155 ++++++++++++++---- 5 files changed, 226 insertions(+), 93 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index d097ac449..785811ea6 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - # title_list = title_list[0:34200] - # title_to_id = {t: title_to_id[t] for t in title_list} + title_list = title_list[0:342] + title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ebad16ba5..0c03784a1 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -6,6 +6,7 @@ import random from spacy.util import minibatch, compounding from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.vocab import Vocab @@ -22,41 +23,48 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' -VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' +NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' +NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 DOC_CHAR_CUTOFF = 300 -EPOCHS = 10 +EPOCHS = 2 DROPOUT = 0.1 def run_pipeline(): print("START", datetime.datetime.now()) print() - nlp = spacy.load('en_core_web_lg') - my_kb = None + nlp_1 = spacy.load('en_core_web_lg') + nlp_2 = None + kb_1 = None + kb_2 = None # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = False + to_create_kb = True # read KB back in from file to_read_kb = True - to_test_kb = False + to_test_kb = True # create training dataset create_wp_training = False # train the EL pipe train_pipe = True + measure_performance = False # test the EL pipe on a simple example to_test_pipeline = True + # write the NLP object, read back in and test again + test_nlp_io = True + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -75,7 +83,7 @@ def run_pipeline(): # run only once ! if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_kb = kb_creator.create_kb(nlp, + kb_1 = kb_creator.create_kb(nlp_1, max_entities_per_alias=MAX_CANDIDATES, min_occ=MIN_PAIR_OCC, entity_def_output=ENTITY_DEFS, @@ -83,63 +91,66 @@ def run_pipeline(): count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, to_print=False) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + print("kb entities:", kb_1.get_size_entities()) + print("kb aliases:", kb_1.get_size_aliases()) print() - print("STEP 3b: write KB", datetime.datetime.now()) - my_kb.dump(KB_FILE) - nlp.vocab.to_disk(VOCAB_DIR) + print("STEP 3b: write KB and NLP", datetime.datetime.now()) + kb_1.dump(KB_FILE) + nlp_1.to_disk(NLP_1_DIR) print() # STEP 4 : read KB back in from file if to_read_kb: print("STEP 4: to_read_kb", datetime.datetime.now()) - my_vocab = Vocab() - my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors - my_kb.load_bulk(KB_FILE) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + # my_vocab = Vocab() + # my_vocab.from_disk(VOCAB_DIR) + # my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) + nlp_2 = spacy.load(NLP_1_DIR) + kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) + kb_2.load_bulk(KB_FILE) + print("kb entities:", kb_2.get_size_entities()) + print("kb aliases:", kb_2.get_size_aliases()) print() # test KB if to_test_kb: - run_el.run_kb_toy_example(kb=my_kb) + run_el.run_kb_toy_example(kb=kb_2) print() # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 5000 - dev_limit = 1000 + train_limit = 10 + dev_limit = 5 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() - train_data = training_set_creator.read_training(nlp=nlp, + train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit, to_print=False) - dev_data = training_set_creator.read_training(nlp=nlp, + dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, to_print=False) - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) - nlp.add_pipe(el_pipe, last=True) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - nlp.begin_training() + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + nlp_2.begin_training() for itn in range(EPOCHS): random.shuffle(train_data) @@ -147,11 +158,11 @@ def run_pipeline(): batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) batchnr = 0 - with nlp.disable_pipes(*other_pipes): + with nlp_2.disable_pipes(*other_pipes): for batch in batches: try: docs, golds = zip(*batch) - nlp.update( + nlp_2.update( docs, golds, drop=DROPOUT, @@ -164,40 +175,62 @@ def run_pipeline(): losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - print() - print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) - print() + if measure_performance: + print() + print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print() - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) + # print(" measuring accuracy 1-1") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + # using only context + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) - print() + print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + print() if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) print() - run_el_toy_example(kb=my_kb, nlp=nlp) + run_el_toy_example(nlp=nlp_2) print() + if test_nlp_io: + print() + print("STEP 9: testing NLP IO", datetime.datetime.now()) + print() + print("writing to", NLP_2_DIR) + print(" vocab len nlp_2", len(nlp_2.vocab)) + print(" vocab len kb_2", len(kb_2.vocab)) + nlp_2.to_disk(NLP_2_DIR) + print() + print("reading from", NLP_2_DIR) + nlp_3 = spacy.load(NLP_2_DIR) + print(" vocab len nlp_3", len(nlp_3.vocab)) + + for pipe_name, pipe in nlp_3.pipeline: + if pipe_name == "entity_linker": + print(" vocab len kb_3", len(pipe.kb.vocab)) + + print() + print("running toy example with NLP 2") + run_el_toy_example(nlp=nlp_3) + print() print("STOP", datetime.datetime.now()) @@ -239,7 +272,7 @@ def _measure_accuracy(data, el_pipe): return acc -def run_el_toy_example(nlp, kb): +def run_el_toy_example(nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ "The main character in Doug's novel is the man Arthur Dent, " \ @@ -261,4 +294,4 @@ def run_el_toy_example(nlp, kb): if __name__ == "__main__": - run_pipeline() \ No newline at end of file + run_pipeline() diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ade2360be..9a84439ea 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -2,6 +2,8 @@ # cython: profile=True # coding: utf8 from collections import OrderedDict +from pathlib import Path, WindowsPath + from cpython.exc cimport PyErr_CheckSignals from spacy import util @@ -389,6 +391,8 @@ cdef class Writer: def __init__(self, object loc): if path.exists(loc): assert not path.isdir(loc), "%s is directory." % loc + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') assert self._fp != NULL @@ -431,6 +435,8 @@ cdef class Reader: def __init__(self, object loc): assert path.exists(loc) assert not path.isdir(loc) + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'rb') if not self._fp: diff --git a/spacy/language.py b/spacy/language.py index ec3232bd5..0e5e29244 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,6 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly +from spacy.kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer @@ -809,6 +810,14 @@ class Language(object): # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) + + # download the KB for the entity linking component - requires the vocab + for pipe_name, pipe in self.pipeline: + if pipe_name == "entity_linker": + kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=pipe.cfg["entity_width"]) + kb.load_bulk(path / pipe_name / "kb") + pipe.set_kb(kb) + self._path = path return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f9043f0e4..e73ff6a0e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,6 +14,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -1077,7 +1078,7 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 32) article_width = cfg.get("article_width", 128) sent_width = cfg.get("sent_width", 64) - entity_width = cfg["kb"].entity_vector_length + entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) @@ -1089,34 +1090,41 @@ class EntityLinker(Pipe): return article_encoder, sent_encoder, mention_encoder def __init__(self, **cfg): + self.article_encoder = True + self.sent_encoder = True self.mention_encoder = True + self.kb = None self.cfg = dict(cfg) - self.kb = self.cfg["kb"] - self.doc_cutoff = self.cfg["doc_cutoff"] - - def use_avg_params(self): - # Modify the pipe's encoders/models, to use their average parameter values. - # TODO: this doesn't work yet because there's no exit method - self.article_encoder.use_params(self.sgd_article.averages) - self.sent_encoder.use_params(self.sgd_sent.averages) - self.mention_encoder.use_params(self.sgd_mention.averages) + self.doc_cutoff = self.cfg.get("doc_cutoff", 150) + def set_kb(self, kb): + self.kb = kb def require_model(self): # Raise an error if the component's model is not initialized. if getattr(self, "mention_encoder", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) + def require_kb(self): + # Raise an error if the knowledge base is not initialized. + if getattr(self, "kb", None) in (None, True, False): + # TODO: custom error + raise ValueError(Errors.E109.format(name=self.name)) + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + self.require_kb() + self.cfg["entity_width"] = self.kb.entity_vector_length + if self.mention_encoder is True: self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() + self.require_kb() if len(docs) != len(golds): raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), @@ -1220,6 +1228,7 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + self.require_kb() if isinstance(docs, Doc): docs = [docs] @@ -1228,30 +1237,32 @@ class EntityLinker(Pipe): final_kb_ids = list() for i, article_doc in enumerate(docs): - doc_encoding = self.article_encoder([article_doc]) - for ent in article_doc.ents: - sent_doc = ent.sent.as_doc() - sent_encoding = self.sent_encoder([sent_doc]) - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - mention_enc_t = np.transpose(mention_encoding) + if len(article_doc) > 0: + doc_encoding = self.article_encoder([article_doc]) + for ent in article_doc.ents: + sent_doc = ent.sent.as_doc() + if len(sent_doc) > 0: + sent_encoding = self.sent_encoder([sent_doc]) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + mention_enc_t = np.transpose(mention_encoding) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? - scores.append(score) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids @@ -1260,6 +1271,80 @@ class EntityLinker(Pipe): for token in entity: token.ent_kb_id_ = kb_id + def to_bytes(self, exclude=tuple(), **kwargs): + """Serialize the pipe to a bytestring. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + """ + serialize = OrderedDict() + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["kb"] = self.kb.to_bytes # TODO + if self.mention_encoder not in (True, False, None): + serialize["article_encoder"] = self.article_encoder.to_bytes + serialize["sent_encoder"] = self.sent_encoder.to_bytes + serialize["mention_encoder"] = self.mention_encoder.to_bytes + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Load the pipe from a bytestring.""" + deserialize = OrderedDict() + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) # TODO + deserialize["article_encoder"] = lambda b: self.article_encoder.from_bytes(b) + deserialize["sent_encoder"] = lambda b: self.sent_encoder.from_bytes(b) + deserialize["mention_encoder"] = lambda b: self.mention_encoder.from_bytes(b) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + """Serialize the pipe to disk.""" + serialize = OrderedDict() + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["kb"] = lambda p: self.kb.dump(p) + if self.mention_encoder not in (None, True, False): + serialize["article_encoder"] = lambda p: p.open("wb").write(self.article_encoder.to_bytes()) + serialize["sent_encoder"] = lambda p: p.open("wb").write(self.sent_encoder.to_bytes()) + serialize["mention_encoder"] = lambda p: p.open("wb").write(self.mention_encoder.to_bytes()) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + """Load the pipe from disk.""" + def load_article_encoder(p): + if self.article_encoder is True: + self.article_encoder, _, _ = self.Model(**self.cfg) + self.article_encoder.from_bytes(p.open("rb").read()) + + def load_sent_encoder(p): + if self.sent_encoder is True: + _, self.sent_encoder, _ = self.Model(**self.cfg) + self.sent_encoder.from_bytes(p.open("rb").read()) + + def load_mention_encoder(p): + if self.mention_encoder is True: + _, _, self.mention_encoder = self.Model(**self.cfg) + self.mention_encoder.from_bytes(p.open("rb").read()) + + deserialize = OrderedDict() + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["article_encoder"] = load_article_encoder + deserialize["sent_encoder"] = load_sent_encoder + deserialize["mention_encoder"] = load_mention_encoder + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, docs, sgd=None, losses=None, **config): + # TODO + pass + + def add_label(self, label): + pass + + class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy.