From 7ef8a64df9abfe4afa09f326ff3d08accd88b8b1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 7 Aug 2020 14:59:34 +0200 Subject: [PATCH] Add docstring for parser --- spacy/ml/models/parser.py | 53 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py index 429ceff28..0b0676b7f 100644 --- a/spacy/ml/models/parser.py +++ b/spacy/ml/models/parser.py @@ -8,13 +8,64 @@ from ..tb_framework import TransitionModel @registry.architectures.register("spacy.TransitionBasedParser.v1") def build_tb_parser_model( - tok2vec: Model, + tok2vec: Model[List[Doc], List[Floats2d]], nr_feature_tokens: int, hidden_width: int, maxout_pieces: int, use_upper: bool = True, nO: Optional[int] = None, ) -> Model: + """ + Build a transition-based parser model. Can apply to NER or dependency-parsing. + + Transition-based parsing is an approach to structured prediction where the + task of predicting the structure is mapped to a series of state transitions. + You might find this tutorial helpful as background: + https://explosion.ai/blog/parsing-english-in-python + + The neural network state prediction model consists of either two or three + subnetworks: + + * tok2vec: Map each token into a vector representation. This subnetwork + is run once for each batch. + * lower: Construct a feature-specific vector for each (token, feature) pair. + This is also run once for each batch. Constructing the state + representation is then simply a matter of summing the component features + and applying the non-linearity. + * upper (optional): A feed-forward network that predicts scores from the + state representation. If not present, the output from the lower model is + used as action scores directly. + + tok2vec (Model[List[Doc], List[Floats2d]]): + Subnetwork to map tokens into vector representations. + nr_feature_tokens (int): The number of tokens in the context to use to + construct the state vector. 
Valid choices are 1, 2, 3, 6, 8 and 13. The + 2, 8 and 13 feature sets are designed for the parser, while the 3 and 6 + feature sets are designed for the NER. The recommended feature sets are + 3 for NER, and 8 for the dependency parser. + + TODO: This feature should be split into two, state_type: ["deps", "ner"] + and extra_state_features: [True, False]. This would map into: + + (deps, False): 8 + (deps, True): 13 + (ner, False): 3 + (ner, True): 6 + + hidden_width (int): The width of the hidden layer. + maxout_pieces (int): How many pieces to use in the state prediction layer. + Recommended values are 1, 2 or 3. If 1, the maxout non-linearity + is replaced with a ReLU non-linearity if use_upper=True, and no + non-linearity if use_upper=False. + use_upper (bool): Whether to use an additional hidden layer after the state + vector in order to predict the action scores. It is recommended to set + this to False for large pretrained models such as transformers, and True + for smaller networks. The upper layer is computed on CPU, which becomes + a bottleneck on larger GPU-based models, where it's also less necessary. + nO (int or None): The number of actions the model will predict between. + Usually inferred from data at the beginning of training, or loaded from + disk. + """ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) tok2vec.set_dim("nO", hidden_width)