From 7ef8a64df9abfe4afa09f326ff3d08accd88b8b1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Fri, 7 Aug 2020 14:59:34 +0200
Subject: [PATCH] Add docstring for parser

---
 spacy/ml/models/parser.py | 53 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 429ceff28..0b0676b7f 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -8,13 +8,64 @@ from ..tb_framework import TransitionModel
 
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
-    tok2vec: Model,
+    tok2vec: Model[List[Doc], List[Floats2d]],
     nr_feature_tokens: int,
     hidden_width: int,
     maxout_pieces: int,
     use_upper: bool = True,
     nO: Optional[int] = None,
 ) -> Model:
+    """
+    Build a transition-based parser model. Can apply to NER or dependency-parsing.
+    
+    Transition-based parsing is an approach to structured prediction where the
+    task of predicting the structure is mapped to a series of state transitions.
+    You might find this tutorial helpful as background:
+    https://explosion.ai/blog/parsing-english-in-python
+
+    The neural network state prediction model consists of either two or three
+    subnetworks:
+
+    * tok2vec: Map each token into a vector representation. This subnetwork
+        is run once for each batch.
+    * lower: Construct a feature-specific vector for each (token, feature) pair.
+        This is also run once for each batch. Constructing the state
+        representation is then simply a matter of summing the component features
+        and applying the non-linearity.
+    * upper (optional): A feed-forward network that predicts scores from the
+        state representation. If not present, the output from the lower model is
+        used as action scores directly.
+
+    tok2vec (Model[List[Doc], List[Floats2d]]):
+        Subnetwork to map tokens into vector representations.
+    nr_feature_tokens (int): The number of tokens in the context to use to
+        construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
+        2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
+        feature sets are designed for the NER. The recommended feature sets are
+        3 for NER, and 8 for the dependency parser.
+        
+        TODO: This feature should be split into two, state_type: ["deps", "ner"]
+        and extra_state_features: [True, False]. This would map into:
+        
+        (deps, False): 8
+        (deps, True): 13
+        (ner, False): 3
+        (ner, True): 6
+
+    hidden_width (int): The width of the hidden layer.
+    maxout_pieces (int): How many pieces to use in the state prediction layer.
+        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
+        is replaced with a ReLU non-linearity if use_upper=True, and no
+        non-linearity if use_upper=False.
+    use_upper (bool): Whether to use an additional hidden layer after the state
+        vector in order to predict the action scores. It is recommended to set
+        this to False for large pretrained models such as transformers, and True
+        for smaller networks. The upper layer is computed on CPU, which becomes
+        a bottleneck on larger GPU-based models, where it's also less necessary.
+    nO (int or None): The number of actions the model will predict between.
+        Usually inferred from data at the beginning of training, or loaded from
+        disk.
+    """
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
     tok2vec.set_dim("nO", hidden_width)