diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py
index 5c8bc6798..d5cd9e3de 100644
--- a/spacy/cli/find_threshold.py
+++ b/spacy/cli/find_threshold.py
@@ -1,21 +1,16 @@
 from pathlib import Path
 import logging
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple
 
 import numpy
 import wasabi.tables
 
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
 from ..pipeline import MultiLabel_TextCategorizer, Pipe
 from ..tokens import DocBin
 
-_DEFAULTS = {
-    "average": "micro",
-    "pipe_name": None,
-    "n_trials": 10,
-    "beta": 1,
-}
+_DEFAULTS = {"average": "micro", "n_trials": 10, "beta": 1, "use_gpu": -1}
 
 
 @app.command(
@@ -24,62 +19,73 @@ _DEFAULTS = {
 )
 def find_threshold_cli(
     # fmt: off
-    model_path: Path = Arg(..., help="Path to model file", exists=True, allow_dash=True),
-    doc_path: Path = Arg(..., help="Path to doc bin file", exists=True, allow_dash=True),
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
+    pipe_name: str = Opt(..., "--pipe_name", "-p", help="Name of pipe to examine thresholds for"),
     average: str = Arg(_DEFAULTS["average"], help="How to aggregate F-scores over labels. One of ('micro', 'macro')", exists=True, allow_dash=True),
-    pipe_name: Optional[str] = Opt(_DEFAULTS["pipe_name"], "--pipe_name", "-p", help="Name of pipe to examine thresholds for"),
     n_trials: int = Opt(_DEFAULTS["n_trials"], "--n_trials", "-n", help="Number of trials to determine optimal thresholds"),
     beta: float = Opt(_DEFAULTS["beta"], "--beta", help="Beta for F1 calculation. Ignored if different metric is used"),
-    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     # fmt: on
 ):
     """
     Runs prediction trials for `textcat` models with varying thresholds to maximize the specified metric from CLI.
-    model_path (Path): Path to file with trained model.
-    doc_path (Path): Path to file with DocBin with docs to use for threshold search.
+    model (str): Model name or path.
+    data_path (Path): Path to file with DocBin with docs to use for threshold search.
+    pipe_name (str): Name of pipe to examine thresholds for.
     average (str): How to average F-scores across labels. One of ('micro', 'macro').
-    pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer
-        is seleted. If there are multiple, an error is raised.
     n_trials (int): Number of trials to determine optimal thresholds.
-    beta (float): Beta for F1 calculation. Ignored if different metric is used.
-    verbose (bool): Display more information for debugging purposes
+    beta (float): Beta for F1 calculation.
+    code_path (Optional[Path]): Path to Python file with additional code (registered functions) to be imported.
+    use_gpu (int): GPU ID or -1 for CPU.
+    verbose (bool): Display more information for debugging purposes.
     """
     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    import_code(code_path)
     find_threshold(
-        model_path,
-        doc_path,
-        average=average,
+        model,
+        data_path,
         pipe_name=pipe_name,
+        average=average,
         n_trials=n_trials,
         beta=beta,
+        use_gpu=use_gpu,
+        silent=False,
     )
 
 
 def find_threshold(
-    model_path: Union[str, Path],
-    doc_path: Union[str, Path],
+    model: str,
+    data_path: Path,
     *,
+    pipe_name: str,  # type: ignore
     average: str = _DEFAULTS["average"],  # type: ignore
-    pipe_name: Optional[str] = _DEFAULTS["pipe_name"],  # type: ignore
     n_trials: int = _DEFAULTS["n_trials"],  # type: ignore
-    beta: float = _DEFAULTS["beta"],  # type: ignore
-    verbose: bool = True,
+    beta: float = _DEFAULTS["beta"],  # type: ignore
+    use_gpu: int = _DEFAULTS["use_gpu"],
+    silent: bool = True,
 ) -> Tuple[float, float]:
     """
     Runs prediction trials for `textcat` models with varying thresholds to maximize the specified metric.
-    model_path (Union[str, Path]): Path to file with trained model.
-    doc_path (Union[str, Path]): Path to file with DocBin with docs to use for threshold search.
+    model (str): Model name or path.
+    data_path (Path): Path to file with DocBin with docs to use for threshold search.
+    pipe_name (str): Name of pipe to examine thresholds for.
     average (str): How to average F-scores across labels. One of ('micro', 'macro').
-    pipe_name (Optional[str]): Name of pipe to examine thresholds for. If None, pipe of type MultiLabel_TextCategorizer
-        is seleted. If there are multiple, an error is raised.
-    n_trials (int): Number of trials to determine optimal thresholds
-    beta (float): Beta for F1 calculation. Ignored if different metric is used.
-    verbose (bool): Whether to print non-error-related output to stdout.
+    n_trials (int): Number of trials to determine optimal thresholds.
+    beta (float): Beta for F1 calculation.
+    use_gpu (int): GPU ID or -1 for CPU.
+    silent (bool): Whether to suppress non-error-related output to stdout.
     RETURNS (Tuple[float, float]): Best found threshold with corresponding F-score.
     """
-    nlp = util.load_model(model_path)
+    setup_gpu(use_gpu, silent=silent)
+    data_path = util.ensure_path(data_path)
+    if not data_path.exists():
+        wasabi.msg.fail("Evaluation data not found", data_path, exits=1)
+    nlp = util.load_model(model)
     pipe: Optional[Pipe] = None
     selected_pipe_name: Optional[str] = pipe_name
@@ -90,7 +96,9 @@ def find_threshold(
     )
 
     for _pipe_name, _pipe in nlp.pipeline:
-        if pipe_name and _pipe_name == pipe_name:
+        # TODO: instead of an instance check, assert that _pipe has a .threshold arg
+        #  (won't work as-is, though: e.g. spancat has no .threshold).
+        if _pipe_name == pipe_name:
             if not isinstance(_pipe, MultiLabel_TextCategorizer):
                 wasabi.msg.fail(
                     "Specified component '{component}' is not of type `MultiLabel_TextCategorizer`.".format(
@@ -100,36 +108,22 @@
                 )
             pipe = _pipe
             break
-        elif pipe_name is None:
-            if isinstance(_pipe, MultiLabel_TextCategorizer):
-                if pipe:
-                    wasabi.msg.fail(
-                        "Multiple components of type `MultiLabel_TextCategorizer` exist in pipeline. Specify name of "
-                        "component to evaluate.",
-                        exits=1,
-                    )
-                pipe = _pipe
-                selected_pipe_name = _pipe_name
 
     if pipe is None:
-        if pipe_name:
-            wasabi.msg.fail(
-                f"No component with name {pipe_name} found in pipeline.", exits=1
-            )
         wasabi.msg.fail(
-            "No component of type `MultiLabel_TextCategorizer` found in pipeline.",
-            exits=1,
+            f"No component with name {pipe_name} found in pipeline.", exits=1
         )
     # This is purely for MyPy. Type checking is done in loop above already.
     assert isinstance(pipe, MultiLabel_TextCategorizer)
 
-    if verbose:
+    if not silent:
         print(
             f"Searching threshold with the best {average} F-score for component '{selected_pipe_name}' with {n_trials} "
             f"trials and beta = {beta}."
         )
 
     thresholds = numpy.linspace(0, 1, n_trials)
+    # TODO: use Scorer.score_cats (possibly to be extended?)
     ref_pos_counts = {label: 0 for label in pipe.labels}
     pred_pos_counts = {
         t: {True: ref_pos_counts.copy(), False: ref_pos_counts.copy()}
@@ -140,7 +134,7 @@ def find_threshold(
 
     # Count true/false positives for provided docs.
     doc_bin = DocBin()
-    doc_bin.from_disk(doc_path)
+    doc_bin.from_disk(data_path)
     for ref_doc in doc_bin.get_docs(nlp.vocab):
         for label, score in ref_doc.cats.items():
             if score not in (0, 1):
@@ -198,7 +192,7 @@ def find_threshold(
         ) / len(ref_pos_counts)
 
     best_threshold = max(f_scores.keys(), key=(lambda key: f_scores[key]))
-    if verbose:
+    if not silent:
         print(
             f"Best threshold: {round(best_threshold, ndigits=4)} with F-score of {f_scores[best_threshold]}.",
             wasabi.tables.table(
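
Note for reviewers: with this change the search can be driven from Python as well as from the CLI. Below is a minimal usage sketch against the revised `find_threshold` signature; the model path, data path, and component name are hypothetical placeholders, not part of this diff:

from pathlib import Path

from spacy.cli.find_threshold import find_threshold

# All paths and the component name here are hypothetical placeholders.
best_threshold, best_f_score = find_threshold(
    "training/model-best",           # trained pipeline (name or path)
    Path("corpus/dev.spacy"),        # DocBin with gold-standard cats
    pipe_name="textcat_multilabel",  # must be a MultiLabel_TextCategorizer
    average="micro",
    n_trials=10,
    beta=1.0,
    use_gpu=-1,                      # -1 = CPU
    silent=False,                    # print progress and the results table
)
print(f"threshold={best_threshold}, f_score={best_f_score}")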
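
The selection logic itself is simple: sweep `n_trials` evenly spaced thresholds over [0, 1] (`numpy.linspace(0, 1, n_trials)`), score each threshold with F-beta, and keep the argmax. A standalone toy sketch of that idea, simplified to a single label and ignoring the per-label counting and micro/macro averaging the real code performs:

import numpy

def f_beta(precision: float, recall: float, beta: float) -> float:
    # Weighted harmonic mean of precision and recall; beta > 1 favours recall.
    if precision + recall == 0:
        return 0.0
    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)

# Toy data: gold labels and predicted scores for four docs.
gold = [1, 0, 1, 0]
scores = [0.9, 0.4, 0.6, 0.7]

f_scores = {}
for t in numpy.linspace(0, 1, 10):
    preds = [int(s >= t) for s in scores]
    tp = sum(1 for p, g in zip(preds, gold) if p and g)
    fp = sum(1 for p, g in zip(preds, gold) if p and not g)
    fn = sum(1 for p, g in zip(preds, gold) if not p and g)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f_scores[t] = f_beta(precision, recall, beta=1.0)

best_threshold = max(f_scores, key=f_scores.get)
print(best_threshold, f_scores[best_threshold])

With beta = 1 this reduces to the ordinary F1; the `--beta` option only shifts the precision/recall trade-off of the threshold that gets selected.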