From c07941d14a4639933e7e4c2697b7d426078bf1d1 Mon Sep 17 00:00:00 2001
From: shadeMe
Date: Mon, 6 Mar 2023 10:31:18 +0530
Subject: [PATCH] Update `Tok2Vec.distill` docstring

---
 spacy/pipeline/tok2vec.py    | 14 +++++++-------
 website/docs/api/tok2vec.mdx | 16 ++++++++++------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 72761abf7..d9639f8d5 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -199,14 +199,14 @@ class Tok2Vec(TrainablePipe):
         sgd: Optional[Optimizer] = None,
         losses: Optional[Dict[str, float]] = None,
     ) -> Dict[str, float]:
-        """Train a pipe (the student) on the predictions of another pipe
-        (the teacher). The student is typically trained on the probability
-        distribution of the teacher, but details may differ per pipe.
+        """Performs an update of the student pipe's model using the
+        student's distillation examples and sets the annotations
+        of the teacher's distillation examples using the teacher pipe.
 
-        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
-            from.
-        examples (Iterable[Example]): Distillation examples. The reference
-            and predicted docs must have the same number of tokens and the
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to use
+            for prediction.
+        examples (Iterable[Example]): Distillation examples. The reference (teacher)
+            and predicted (student) docs must have the same number of tokens and the
             same orthography.
         drop (float): dropout rate.
         sgd (Optional[Optimizer]): An optimizer. Will be created via
diff --git a/website/docs/api/tok2vec.mdx b/website/docs/api/tok2vec.mdx
index 6b410d724..8b6d2380b 100644
--- a/website/docs/api/tok2vec.mdx
+++ b/website/docs/api/tok2vec.mdx
@@ -102,10 +102,14 @@ pipeline components are applied to the `Doc` in order. Both
 
 ## Tok2Vec.distill {id="distill", tag="method,experimental", version="4"}
 
-Train a pipe (the student) on the predictions of another pipe (the teacher). The
-student is typically trained on the probability distribution of the teacher, but
-details may differ per pipe. The goal of distillation is to transfer knowledge
-from the teacher to the student.
+Performs an update of the student pipe's model using the student's distillation
+examples and sets the annotations of the teacher's distillation examples using
+the teacher pipe.
+
+Unlike other trainable pipes, the student pipe doesn't directly learn its
+representations from the teacher. However, since downstream pipes that do
+perform distillation expect the tok2vec annotations to be present on the
+correct distillation examples, we need to ensure that they are set beforehand.
 
 The distillation is performed on ~~Example~~ objects. The `Example.reference`
 and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
@@ -125,8 +129,8 @@ This feature is experimental.
 
 | Name           | Description |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
-| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
-| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| `teacher_pipe` | The teacher pipe to use for prediction. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference (teacher) and predicted (student) docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
 | `drop`         | Dropout rate. ~~float~~ |
 | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
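
For anyone trying the documented method out, a minimal sketch of how it is
driven (assuming a spaCy v4 build where the experimental distillation API is
available; the "en_core_web_lg" teacher, the blank-English student, and the
sample texts are illustrative assumptions, not part of this patch):

import spacy
from spacy.training import Example

# Hypothetical setup: any pipeline with a "tok2vec" component can act as
# the teacher; "en_core_web_lg" is only an illustrative choice.
teacher = spacy.load("en_core_web_lg")
student = spacy.blank("en")
student.add_pipe("tok2vec")
student.initialize()

texts = [
    "Distillation transfers knowledge.",
    "The student mimics the teacher.",
]

# Example(predicted, reference): the predicted doc comes from the student,
# the reference doc from the teacher. Both sides must have the same number
# of tokens and the same orthography, as the docstring requires.
examples = [Example(student.make_doc(text), teacher(text)) for text in texts]

losses = student.get_pipe("tok2vec").distill(
    teacher.get_pipe("tok2vec"),
    examples,
    losses={},
)
print(losses)

As the updated docstring notes, this call doesn't make the student tok2vec
learn its representations from the teacher directly; it mainly sets the
teacher's tok2vec annotations so that downstream distillable pipes find them
on the correct distillation examples.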