mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Friendly error warning for NEL example script (#4881)
* make model positional arg and raise error if no vectors * small doc fixes
This commit is contained in:
parent
d24bca62f6
commit
c70ccd543d
|
@ -17,6 +17,7 @@ Run `wikipedia_pretrain_kb.py`
|
||||||
|
|
||||||
Quick testing and rerunning:
|
Quick testing and rerunning:
|
||||||
* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
|
* When trying out the pipeline for a quick test, set `limit_prior` (`-lp`), `limit_train` (`-lt`) and/or `limit_wd` (`-lw`) to read only parts of the dumps instead of everything.
|
||||||
|
* e.g. set `-lt 20000 -lp 2000 -lw 3000 -f 1`
|
||||||
* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
|
* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -40,7 +40,7 @@ logger = logging.getLogger(__name__)
|
||||||
loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
|
loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
|
||||||
loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
|
loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
|
||||||
loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
|
loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
|
||||||
descr_from_wp=("Flag for using wp descriptions not wd", "flag", "wp"),
|
descr_from_wp=("Flag for using descriptions from WP instead of WD (default False)", "flag", "wp"),
|
||||||
limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
|
limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
|
||||||
limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
|
limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
|
||||||
limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
|
limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
|
||||||
|
|
|
@ -32,27 +32,24 @@ DESC_WIDTH = 64 # dimension of output entity vectors
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
vocab_path=("Path to the vocab for the kb", "option", "v", Path),
|
model=("Model name, should have pretrained word embeddings", "positional", None, str),
|
||||||
model=("Model name, should have pretrained word embeddings", "option", "m", str),
|
|
||||||
output_dir=("Optional output directory", "option", "o", Path),
|
output_dir=("Optional output directory", "option", "o", Path),
|
||||||
n_iter=("Number of training iterations", "option", "n", int),
|
n_iter=("Number of training iterations", "option", "n", int),
|
||||||
)
|
)
|
||||||
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
|
def main(model=None, output_dir=None, n_iter=50):
|
||||||
"""Load the model, create the KB and pretrain the entity encodings.
|
"""Load the model, create the KB and pretrain the entity encodings.
|
||||||
Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
|
|
||||||
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
||||||
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
|
The updated vocab will also be written to a directory in the output_dir."""
|
||||||
if model is None and vocab_path is None:
|
|
||||||
raise ValueError("Either the `nlp` model or the `vocab` should be specified.")
|
|
||||||
|
|
||||||
if model is not None:
|
nlp = spacy.load(model) # load existing spaCy model
|
||||||
nlp = spacy.load(model) # load existing spaCy model
|
print("Loaded model '%s'" % model)
|
||||||
print("Loaded model '%s'" % model)
|
|
||||||
else:
|
# check the length of the nlp vectors
|
||||||
vocab = Vocab().from_disk(vocab_path)
|
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
|
||||||
# create blank Language class with specified vocab
|
raise ValueError(
|
||||||
nlp = spacy.blank("en", vocab=vocab)
|
"The `nlp` object should have access to pretrained word vectors, "
|
||||||
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
" cf. https://spacy.io/usage/models#languages."
|
||||||
|
)
|
||||||
|
|
||||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||||
|
|
||||||
|
@ -103,11 +100,9 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
|
||||||
print()
|
print()
|
||||||
print("Saved KB to", kb_path)
|
print("Saved KB to", kb_path)
|
||||||
|
|
||||||
# only storing the vocab if we weren't already reading it from file
|
vocab_path = output_dir / "vocab"
|
||||||
if not vocab_path:
|
kb.vocab.to_disk(vocab_path)
|
||||||
vocab_path = output_dir / "vocab"
|
print("Saved vocab to", vocab_path)
|
||||||
kb.vocab.to_disk(vocab_path)
|
|
||||||
print("Saved vocab to", vocab_path)
|
|
||||||
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user