diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index c0e3bd169..852f456de 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -13,6 +13,7 @@ import random
 from ..gold import GoldCorpus
 from .. import util
 from ..errors import Errors
+from ..ml import models  # don't remove - required to load the built-in architectures
 
 registry = util.registry
 
@@ -75,7 +76,7 @@ maxout_pieces = 3
 subword_features = true
 """
 
-
+# TODO: REMOVE ?
 class PipelineComponent(BaseModel):
     factory: str
     model: Model
@@ -83,7 +84,7 @@ class PipelineComponent(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
-
+# TODO: REMOVE ?
 class ConfigSchema(BaseModel):
     optimizer: Optional["Optimizer"]
 
@@ -123,7 +124,7 @@ class ConfigSchema(BaseModel):
     use_gpu=("Use GPU", "option", "g", int),
     # fmt: on
 )
-def train_from_config_cli(
+def train_cli(
     train_path,
     dev_path,
     config_path,
@@ -132,7 +133,7 @@ def train_from_config_cli(
     raw_text=None,
     debug=False,
     verbose=False,
-    use_gpu=-1
+    use_gpu=-1,
 ):
     """
     Train or update a spaCy model. Requires data to be formatted in spaCy's
@@ -156,7 +157,7 @@ def train_from_config_cli(
     else:
         msg.info("Using CPU")
 
-    train_from_config(
+    train(
         config_path,
         {"train": train_path, "dev": dev_path},
         output_path=output_path,
@@ -165,10 +166,11 @@ def train_from_config_cli(
     )
 
 
-def train_from_config(
+def train(
     config_path, data_paths, raw_text=None, meta_path=None, output_path=None,
 ):
     msg.info(f"Loading config from: {config_path}")
+    # Read the config first without creating objects, to get to the original nlp_config
     config = util.load_config(config_path, create_objects=False)
     util.fix_random_seed(config["training"]["seed"])
     if config["training"]["use_pytorch_for_gpu_memory"]:
@@ -177,8 +179,8 @@ def train_from_config(
     config = util.load_config(config_path, create_objects=True)
     msg.info("Creating nlp from config")
     nlp = util.load_model_from_config(nlp_config)
-    optimizer = config["optimizer"]
     training = config["training"]
+    optimizer = training["optimizer"]
     limit = training["limit"]
     msg.info("Loading training corpus")
     corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
@@ -246,13 +248,19 @@ def create_train_batches(nlp, corpus, cfg):
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)
         random.shuffle(train_examples)
-        batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"])
+        batches = util.minibatch_by_words(train_examples, size=cfg["batch_size"], discard_oversize=cfg["discard_oversize"])
+        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
+        try:
+            first = next(batches)
+            yield first
+        except StopIteration:
+            raise ValueError(Errors.E986)
         for batch in batches:
             yield batch
         epochs_todo -= 1
         # We intentionally compare exactly to 0 here, so that max_epochs < 1
         # will not break.
-        if epochs_todo == 0:
+        if epochs_todo == 0:
             break
 
@@ -366,6 +374,7 @@ def train_while_improving(
         # Stop if we've exhausted our max steps (if specified)
         if max_steps and (step * accumulate_gradient) >= max_steps:
             break
+        step += 1
 
 
 def subdivide_batch(batch, accumulate_gradient):
diff --git a/spacy/errors.py b/spacy/errors.py
index da2cfdf04..852c55225 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -554,6 +554,8 @@ class Errors(object):
 
     # TODO: fix numbering after merging develop into master
 
+    E986 = ("Could not create any training batches: check your input. "
+            "Perhaps discard_oversize should be set to False?")
     E987 = ("The text of an example training instance is either a Doc or "
             "a string, but found {type} instead.")
     E988 = ("Could not parse any training examples. Ensure the data is "