From daaa7bf45111cd7d033868f875442b494a9dfead Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 20 May 2020 15:51:44 +0200
Subject: [PATCH] Add option to omit extra lexeme tables in CLI

---
 spacy/cli/init_model.py | 12 ++++++++++++
 spacy/cli/train.py      | 11 +++++++++++
 2 files changed, 23 insertions(+)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3311a5120..18589a954 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -18,6 +18,7 @@ from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
 from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..lookups import Lookups
 
 try:
     import ftfy
@@ -49,6 +50,7 @@ DEFAULT_OOV_PROB = -20
         str,
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
 )
 def init_model(
     lang,
@@ -61,6 +63,7 @@ def init_model(
     prune_vectors=-1,
     vectors_name=None,
     model_name=None,
+    omit_extra_lookups=False,
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
@@ -93,6 +96,15 @@ def init_model(
 
     with msg.loading("Creating model..."):
         nlp = create_model(lang, lex_attrs, name=model_name)
+
+    # Create empty extra lexeme tables so that accessing the corresponding
+    # features doesn't load the data from spacy-lookups-data
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     msg.good("Successfully created model")
     if vectors_loc is not None:
         add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 7cb2d9745..6ce095c15 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -17,6 +17,7 @@ from .._ml import create_default_optimizer
 from ..util import use_gpu as set_gpu
 from ..gold import GoldCorpus
 from ..compat import path2str
+from ..lookups import Lookups
 from .. import util
 from .. import about
 
@@ -57,6 +58,7 @@ from .. import about
     textcat_arch=("Textcat model architecture", "option", "ta", str),
     textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
+    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
     verbose=("Display more information for debug", "flag", "VV", bool),
     debug=("Run data diagnostics before training", "flag", "D", bool),
     # fmt: on
@@ -96,6 +98,7 @@ def train(
     textcat_arch="bow",
     textcat_positive_label=None,
     tag_map_path=None,
+    omit_extra_lookups=False,
     verbose=False,
     debug=False,
 ):
@@ -247,6 +250,14 @@ def train(
     # Update tag map with provided mapping
     nlp.vocab.morphology.tag_map.update(tag_map)
 
+    # Create empty extra lexeme tables so that accessing the corresponding
+    # features doesn't load the data from spacy-lookups-data
+    if omit_extra_lookups:
+        nlp.vocab.lookups_extra = Lookups()
+        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
+        nlp.vocab.lookups_extra.add_table("lexeme_prob")
+        nlp.vocab.lookups_extra.add_table("lexeme_settings")
+
     if vectors:
         msg.text("Loading vector from model '{}'".format(vectors))
         _load_vectors(nlp, vectors)