diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
index 7460a455e..56d0c1415 100644
--- a/bin/wiki_entity_linking/README.md
+++ b/bin/wiki_entity_linking/README.md
@@ -17,6 +17,7 @@ Run `wikipedia_pretrain_kb.py`
 
 Quick testing and rerunning:
 * When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
+  * e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
 * If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
 
diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
index 940607b72..003074feb 100644
--- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py
+++ b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
@@ -40,7 +40,7 @@ logger = logging.getLogger(__name__)
     loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
     loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
     loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
-    descr_from_wp=("Flag for using wp descriptions not wd", "flag", "wp"),
+    descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
     limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
     limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
     limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py
index db6442ad4..54c68f653 100644
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/pretrain_kb.py
@@ -32,27 +32,24 @@ DESC_WIDTH = 64  # dimension of output entity vectors
 
 
 @plac.annotations(
-    vocab_path=("Path to the vocab for the kb", "option", "v", Path),
-    model=("Model name, should have pretrained word embeddings", "option", "m", str),
+    model=("Model name, should have pretrained word embeddings", "positional", None, str),
     output_dir=("Optional output directory", "option", "o", Path),
     n_iter=("Number of training iterations", "option", "n", int),
 )
-def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
+def main(model=None, output_dir=None, n_iter=50):
     """Load the model, create the KB and pretrain the entity encodings.
-    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
-    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
-    if model is None and vocab_path is None:
-        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
+    The updated vocab will also be written to a directory in the output_dir."""
 
-    if model is not None:
-        nlp = spacy.load(model)  # load existing spaCy model
-        print("Loaded model '%s'" % model)
-    else:
-        vocab = Vocab().from_disk(vocab_path)
-        # create blank Language class with specified vocab
-        nlp = spacy.blank("en", vocab=vocab)
-        print("Created blank 'en' model with vocab from '%s'" % vocab_path)
+    nlp = spacy.load(model)  # load existing spaCy model
+    print("Loaded model '%s'" % model)
+
+    # check the length of the nlp vectors
+    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
+        raise ValueError(
+            "The `nlp` object should have access to pretrained word vectors, "
+            " cf. https://spacy.io/usage/models#languages."
+        )
 
     kb = KnowledgeBase(vocab=nlp.vocab)
 
@@ -103,11 +100,9 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
         print()
         print("Saved KB to", kb_path)
 
-        # only storing the vocab if we weren't already reading it from file
-        if not vocab_path:
-            vocab_path = output_dir / "vocab"
-            kb.vocab.to_disk(vocab_path)
-            print("Saved vocab to", vocab_path)
+        vocab_path = output_dir / "vocab"
+        kb.vocab.to_disk(vocab_path)
+        print("Saved vocab to", vocab_path)
 
         print()
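
For context on why the vocab is now always written next to the KB: a KB has to be reloaded with the same vocab it was built with. Below is a minimal reload sketch, not part of this diff, assuming spaCy v2.2's `KnowledgeBase` API and a placeholder output directory that was passed to the example script via `-o`:

```python
# Hypothetical sketch: reload the "kb" file and "vocab" directory written by
# examples/training/pretrain_kb.py; the paths below are placeholders.
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase

vocab = Vocab().from_disk("/tmp/kb_output/vocab")  # vocab saved alongside the KB
kb = KnowledgeBase(vocab=vocab)                    # default entity vector width (64) matches DESC_WIDTH
kb.load_bulk("/tmp/kb_output/kb")                  # bulk-load the serialized KB
print("Entities in the reloaded KB:", kb.get_size_entities())
```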