# cython: infer_types=True, profile=True, binding=True from collections import defaultdict from typing import Optional, Iterable from thinc.api import Model, Config from .transition_parser cimport Parser from ._parser_internals.arc_eager cimport ArcEager from .functions import merge_subtokens from ..language import Language from ._parser_internals import nonproj from ._parser_internals.nonproj import DELIMITER from ..scorer import Scorer from ..training import validate_examples default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v2" state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 use_upper = true [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v1" pretrained_vectors = null width = 96 depth = 4 embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true """ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], default_config={ "moves": None, "update_with_oracle_cut_size": 100, "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, default_score_weights={ "dep_uas": 0.5, "dep_las": 0.5, "dep_las_per_type": None, "sents_p": None, "sents_r": None, "sents_f": 0.0, }, ) def make_parser( nlp: Language, name: str, model: Model, moves: Optional[list], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int ): """Create a transition-based DependencyParser component. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can optionally learn to merge tokens that had been over-segmented by the tokenizer. The parser uses a variant of the non-monotonic arc-eager transition-system described by Honnibal and Johnson (2014), with the addition of a "break" transition to perform the sentence segmentation. Nivre's pseudo-projective dependency transformation is used to allow the parser to predict non-projective parses. The parser is trained using an imitation learning objective. The parser follows the actions predicted by the current weights, and at each state, determines which actions are compatible with the optimal parse that could be reached from the current state. The weights such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note that more than one action may be optimal for a given state. model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. moves (List[str]): A list of transition names. Inferred from the data if not provided. update_with_oracle_cut_size (int): During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. 100 is a good default. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. """ return DependencyParser( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq, beam_width=1, beam_density=0.0, beam_update_prob=0.0, ) @Language.factory( "beam_parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], default_config={ "beam_width": 8, "beam_density": 0.01, "beam_update_prob": 0.5, "moves": None, "update_with_oracle_cut_size": 100, "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, }, default_score_weights={ "dep_uas": 0.5, "dep_las": 0.5, "dep_las_per_type": None, "sents_p": None, "sents_r": None, "sents_f": 0.0, }, ) def make_beam_parser( nlp: Language, name: str, model: Model, moves: Optional[list], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, beam_width: int, beam_density: float, beam_update_prob: float, ): """Create a transition-based DependencyParser component that uses beam-search. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can optionally learn to merge tokens that had been over-segmented by the tokenizer. The parser uses a variant of the non-monotonic arc-eager transition-system described by Honnibal and Johnson (2014), with the addition of a "break" transition to perform the sentence segmentation. Nivre's pseudo-projective dependency transformation is used to allow the parser to predict non-projective parses. The parser is trained using a global objective. That is, it learns to assign probabilities to whole parses. model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. moves (List[str]): A list of transition names. Inferred from the data if not provided. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring candidates that are too far behind. This is mostly intended to improve efficiency, but it can also improve accuracy as deeper search is not always better. beam_update_prob (float): The chance of making a beam update, instead of a greedy update. Greedy updates are an approximation for the beam updates, and are faster to compute. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. """ return DependencyParser( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq ) cdef class DependencyParser(Parser): """Pipeline component for dependency parsing. DOCS: https://spacy.io/api/dependencyparser """ TransitionSystem = ArcEager @property def postprocesses(self): output = [nonproj.deprojectivize] if self.cfg.get("learn_tokens") is True: output.append(merge_subtokens) return tuple(output) def add_multitask_objective(self, mt_component): self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, nlp=None, **cfg): # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) labeller.initialize(get_examples, nlp=nlp) @property def labels(self): labels = set() # Get the labels from the model by looking at the available moves for move in self.move_names: if "-" in move: label = move.split("-")[1] if DELIMITER in label: label = label.split(DELIMITER)[1] labels.add(label) return tuple(sorted(labels)) def score(self, examples, **kwargs): """Score a batch of examples. examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans and Scorer.score_deps. DOCS: https://spacy.io/api/dependencyparser#score """ def has_sents(doc): return doc.has_annotation("SENT_START") validate_examples(examples, "DependencyParser.score") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep results = {} results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) del results["sents_per_type"] return results def scored_parses(self, beams): """Return two dictionaries with scores for each beam/doc that was processed: one containing (i, head) keys, and another containing (i, label) keys. """ head_scores = [] label_scores = [] for beam in beams: score_head_dict = defaultdict(float) score_label_dict = defaultdict(float) for score, parses in self.moves.get_beam_parses(beam): for head, i, label in parses: score_head_dict[(i, head)] += score score_label_dict[(i, label)] += score head_scores.append(score_head_dict) label_scores.append(score_label_dict) return head_scores, label_scores def _ensure_labels_are_added(self, docs): # This gives the parser a chance to add labels it's missing for a batch # of documents. However, this isn't desirable for the dependency parser, # because we instead have a label frequency cut-off and back off rare # labels to 'dep'. pass