Merge pull request #5466 from adrianeboyd/feature/omit-extra-lexeme-info

Add option to omit extra lexeme tables in CLI
commit 884d9b060d
Matthew Honnibal, 2020-05-21 16:40:02 +02:00 (committed by GitHub)
2 changed files with 25 additions and 6 deletions
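For context, both commands gain a boolean `omit_extra_lookups` option (the "OEL" abbreviation in the plac annotations in the diff below), which on the command line would surface as something like `--omit-extra-lookups`. A minimal sketch of driving the updated functions directly from Python, with placeholder paths that are not part of this diff:

# Illustrative only: assumes the spaCy v2.x-style CLI functions exposed via spacy.cli.
from spacy.cli import init_model

# Create a blank "en" model without the extra lexeme tables; the output
# directory is a placeholder.
init_model("en", "/tmp/en_blank_model", omit_extra_lookups=True)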

spacy/cli/init_model.py

@@ -18,6 +18,8 @@ from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
+from ..lookups import Lookups

 try:
     import ftfy
@@ -49,12 +51,8 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
-    base_model=(
-        "Base model (for languages with custom tokenizers)",
-        "option",
-        "b",
-        str,
-    ),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
+    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
 )
 def init_model(
     lang,
@@ -67,6 +65,7 @@ def init_model(
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
+    omit_extra_lookups=False,
     base_model=None,
 ):
     """
@@ -100,6 +99,15 @@ def init_model(
     with msg.loading("Creating model..."):
         nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
+
+    # Create empty extra lexeme tables so the data from spacy-lookups-data
+    # isn't loaded if these features are accessed
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
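The reasoning behind the added block: lexeme attributes such as prob and cluster are looked up lazily, and when no table is registered the data is pulled in from the spacy-lookups-data package, so pre-registering empty tables under the same names short-circuits that load. A rough standalone sketch of the same idea, assuming a spaCy version that has Vocab.lookups_extra (as on this branch); the word and printout are illustrative:

from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()

# Register empty versions of the extra lexeme tables, mirroring the diff above.
lookups_extra = Lookups()
for name in ("lexeme_cluster", "lexeme_prob", "lexeme_settings"):
    lookups_extra.add_table(name)
nlp.vocab.lookups_extra = lookups_extra

# Accessing the attribute now hits the empty table and falls back to the
# default value instead of loading data from spacy-lookups-data.
print(nlp.vocab["apple"].prob)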

spacy/cli/train.py

@@ -17,6 +17,7 @@ from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..gold import GoldCorpus
 from ..compat import path2str
+from ..lookups import Lookups
 from .. import util
 from .. import about
@@ -57,6 +58,7 @@ from .. import about
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -96,6 +98,7 @@ def train(
     textcat_arch="bow",
     textcat_positive_label=None,
     tag_map_path=None,
+    omit_extra_lookups=False,
     verbose=False,
     debug=False,
 ):
@@ -247,6 +250,14 @@ def train(
         # Update tag map with provided mapping
         nlp.vocab.morphology.tag_map.update(tag_map)

+    # Create empty extra lexeme tables so the data from spacy-lookups-data
+    # isn't loaded if these features are accessed
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)
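One way to check the effect after training with the new flag: the extra tables should be present but empty, so the serialized model no longer carries the spacy-lookups-data payload. A hedged sketch, using a placeholder model path and assuming the v2.x Lookups.get_table API:

import spacy

# Load a model trained with omit_extra_lookups enabled; the path is a placeholder.
nlp = spacy.load("/tmp/trained_model")

for name in ("lexeme_cluster", "lexeme_prob", "lexeme_settings"):
    table = nlp.vocab.lookups_extra.get_table(name)
    # Expect zero entries for each table when the flag was used.
    print(name, len(table))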