2019-08-13 16:38:59 +03:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# coding: utf8
|
|
|
|
|
|
|
|
"""Example of defining and (pre)training spaCy's knowledge base,
|
|
|
|
which is needed to implement entity linking functionality.
|
|
|
|
|
|
|
|
For more details, see the documentation:
|
|
|
|
* Knowledge base: https://spacy.io/api/kb
|
|
|
|
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
|
|
|
|
|
2019-09-12 12:38:34 +03:00
|
|
|
Compatible with: spaCy v2.2
|
|
|
|
Last tested with: v2.2
|
2019-08-13 16:38:59 +03:00
|
|
|
"""
|
|
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
|
|
|
|
import plac
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
from spacy.vocab import Vocab
|
|
|
|
import spacy
|
|
|
|
from spacy.kb import KnowledgeBase
|
|
|
|
|
|
|
|
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
|
|
|
|
|
|
|
|
|
|
|
|
# Q2146908 (Russ Cochran): American golfer
|
|
|
|
# Q7381115 (Russ Cochran): publisher
|
|
|
|
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
|
|
|
|
|
2019-10-02 11:37:39 +03:00
|
|
|
INPUT_DIM = 300 # dimension of pretrained input vectors
|
2019-08-13 16:38:59 +03:00
|
|
|
DESC_WIDTH = 64 # dimension of output entity vectors
|
|
|
|
|
|
|
|
|
|
|
|
@plac.annotations(
|
|
|
|
vocab_path=("Path to the vocab for the kb", "option", "v", Path),
|
|
|
|
model=("Model name, should have pretrained word embeddings", "option", "m", str),
|
|
|
|
output_dir=("Optional output directory", "option", "o", Path),
|
|
|
|
n_iter=("Number of training iterations", "option", "n", int),
|
|
|
|
)
|
|
|
|
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
|
|
|
|
"""Load the model, create the KB and pretrain the entity encodings.
|
2019-10-02 11:37:39 +03:00
|
|
|
Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
|
2019-08-13 16:38:59 +03:00
|
|
|
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
|
|
|
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
|
|
|
|
if model is None and vocab_path is None:
|
2019-08-21 15:00:37 +03:00
|
|
|
raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
|
2019-08-13 16:38:59 +03:00
|
|
|
|
|
|
|
if model is not None:
|
|
|
|
nlp = spacy.load(model) # load existing spaCy model
|
|
|
|
print("Loaded model '%s'" % model)
|
|
|
|
else:
|
|
|
|
vocab = Vocab().from_disk(vocab_path)
|
|
|
|
# create blank Language class with specified vocab
|
|
|
|
nlp = spacy.blank("en", vocab=vocab)
|
|
|
|
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
|
|
|
|
|
|
|
kb = KnowledgeBase(vocab=nlp.vocab)
|
|
|
|
|
|
|
|
# set up the data
|
|
|
|
entity_ids = []
|
|
|
|
descriptions = []
|
|
|
|
freqs = []
|
|
|
|
for key, value in ENTITIES.items():
|
|
|
|
desc, freq = value
|
|
|
|
entity_ids.append(key)
|
|
|
|
descriptions.append(desc)
|
|
|
|
freqs.append(freq)
|
|
|
|
|
|
|
|
# training entity description encodings
|
|
|
|
# this part can easily be replaced with a custom entity encoder
|
|
|
|
encoder = EntityEncoder(
|
|
|
|
nlp=nlp,
|
|
|
|
input_dim=INPUT_DIM,
|
|
|
|
desc_width=DESC_WIDTH,
|
|
|
|
epochs=n_iter,
|
|
|
|
)
|
|
|
|
encoder.train(description_list=descriptions, to_print=True)
|
|
|
|
|
|
|
|
# get the pretrained entity vectors
|
|
|
|
embeddings = encoder.apply_encoder(descriptions)
|
|
|
|
|
|
|
|
# set the entities, can also be done by calling `kb.add_entity` for each entity
|
|
|
|
kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)
|
|
|
|
|
|
|
|
# adding aliases, the entities need to be defined in the KB beforehand
|
|
|
|
kb.add_alias(
|
|
|
|
alias="Russ Cochran",
|
|
|
|
entities=["Q2146908", "Q7381115"],
|
|
|
|
probabilities=[0.24, 0.7], # the sum of these probabilities should not exceed 1
|
|
|
|
)
|
|
|
|
|
|
|
|
# test the trained model
|
|
|
|
print()
|
|
|
|
_print_kb(kb)
|
|
|
|
|
|
|
|
# save model to output directory
|
|
|
|
if output_dir is not None:
|
|
|
|
output_dir = Path(output_dir)
|
|
|
|
if not output_dir.exists():
|
|
|
|
output_dir.mkdir()
|
|
|
|
kb_path = str(output_dir / "kb")
|
|
|
|
kb.dump(kb_path)
|
|
|
|
print()
|
|
|
|
print("Saved KB to", kb_path)
|
|
|
|
|
|
|
|
# only storing the vocab if we weren't already reading it from file
|
|
|
|
if not vocab_path:
|
|
|
|
vocab_path = output_dir / "vocab"
|
|
|
|
kb.vocab.to_disk(vocab_path)
|
|
|
|
print("Saved vocab to", vocab_path)
|
|
|
|
|
|
|
|
print()
|
|
|
|
|
|
|
|
# test the saved model
|
|
|
|
# always reload a knowledge base with the same vocab instance!
|
|
|
|
print("Loading vocab from", vocab_path)
|
|
|
|
print("Loading KB from", kb_path)
|
|
|
|
vocab2 = Vocab().from_disk(vocab_path)
|
|
|
|
kb2 = KnowledgeBase(vocab=vocab2)
|
|
|
|
kb2.load_bulk(kb_path)
|
|
|
|
_print_kb(kb2)
|
|
|
|
print()
|
|
|
|
|
|
|
|
|
|
|
|
def _print_kb(kb):
|
|
|
|
print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings())
|
|
|
|
print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings())
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
plac.call(main)
|
|
|
|
|
|
|
|
# Expected output:
|
|
|
|
|
|
|
|
# 2 kb entities: ['Q2146908', 'Q7381115']
|
|
|
|
# 1 kb aliases: ['Russ Cochran']
|