Add option to omit extra lexeme tables in CLI

Adriane Boyd 2020-05-20 15:51:44 +02:00
parent 40e65d6f63
commit daaa7bf451
2 changed files with 23 additions and 0 deletions

spacy/cli/init_model.py

@@ -18,6 +18,7 @@ from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..lookups import Lookups

 try:
     import ftfy
@@ -49,6 +50,7 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
 )
 def init_model(
     lang,
@@ -61,6 +63,7 @@ def init_model(
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
+    omit_extra_lookups=False,
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
@@ -93,6 +96,15 @@ def init_model(
     with msg.loading("Creating model..."):
         nlp = create_model(lang, lex_attrs, name=model_name)
+
+    # Create empty extra lexeme tables so the data from spacy-lookups-data
+    # isn't loaded if these features are accessed
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
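
For context, a minimal sketch (not part of the commit, and assuming spaCy v2.3-era APIs such as spacy.blank, Vocab.lookups_extra and spacy.lookups.Lookups) of the trick both files use: registering empty lexeme_cluster / lexeme_prob / lexeme_settings tables up front, so that accessing the corresponding lexeme attributes later does not pull the full tables from the spacy-lookups-data package.

import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")

# Replace the extra lookups with an empty container, then register the three
# table names as empty tables, mirroring the block added in init_model above.
nlp.vocab.lookups_extra = Lookups()
for table in ("lexeme_cluster", "lexeme_prob", "lexeme_settings"):
    nlp.vocab.lookups_extra.add_table(table)

# With empty tables in place, lexeme attributes read their built-in defaults
# instead of values from spacy-lookups-data.
lexeme = nlp.vocab["apple"]
print(lexeme.cluster, lexeme.prob)

On the command line, plac should expose the new option as --omit-extra-lookups (short form -OEL), so an invocation along the lines of python -m spacy init-model en ./output ... --omit-extra-lookups would build a model that ships only the empty extra tables; the exact usage isn't shown in this commit.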

spacy/cli/train.py

@@ -17,6 +17,7 @@ from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..gold import GoldCorpus
 from ..compat import path2str
+from ..lookups import Lookups
 from .. import util
 from .. import about
@@ -57,6 +58,7 @@ from .. import about
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -96,6 +98,7 @@ def train(
     textcat_arch="bow",
     textcat_positive_label=None,
     tag_map_path=None,
+    omit_extra_lookups=False,
     verbose=False,
     debug=False,
 ):
@@ -247,6 +250,14 @@ def train(
         # Update tag map with provided mapping
         nlp.vocab.morphology.tag_map.update(tag_map)

+    # Create empty extra lexeme tables so the data from spacy-lookups-data
+    # isn't loaded if these features are accessed
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
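
To check the effect on a trained model (again a sketch rather than anything from the commit; the output path below is only an example, and Lookups.tables / get_table assume v2.3-era APIs), the three extra tables should be present but empty, so nothing is loaded from spacy-lookups-data when the packaged model is used:

import spacy

# Load a model trained with the new flag; "./output/model-final" is the
# conventional spacy train output directory, used here as an example path.
nlp = spacy.load("./output/model-final")

# Each extra lexeme table should exist with zero entries.
for name in nlp.vocab.lookups_extra.tables:
    table = nlp.vocab.lookups_extra.get_table(name)
    print(name, len(table))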