# cython: infer_types=True, profile=True, binding=True from collections import defaultdict from typing import Callable, Optional from thinc.api import Config, Model from ..language import Language from ..scorer import Scorer from ..training import remove_bilu_prefix from ..util import registry from ._parser_internals import nonproj from ._parser_internals.arc_eager import ArcEager from ._parser_internals.nonproj import DELIMITER from ._parser_internals.transition_system import TransitionSystem from .functions import merge_subtokens from .transition_parser import Parser default_model_config = """ [model] @architectures = "spacy.TransitionBasedParser.v3" state_type = "parser" extra_state_tokens = false hidden_width = 64 maxout_pieces = 2 [model.tok2vec] @architectures = "spacy.HashEmbedCNN.v2" pretrained_vectors = null width = 96 depth = 4 embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true """ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], default_config={ "moves": None, "update_with_oracle_cut_size": 100, "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, "dep_las": 0.5, "dep_las_per_type": None, "sents_p": None, "sents_r": None, "sents_f": 0.0, }, ) def make_parser( nlp: Language, name: str, model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, scorer: Optional[Callable], ): """Create a transition-based DependencyParser component. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can optionally learn to merge tokens that had been over-segmented by the tokenizer. The parser uses a variant of the non-monotonic arc-eager transition-system described by Honnibal and Johnson (2014), with the addition of a "break" transition to perform the sentence segmentation. Nivre's pseudo-projective dependency transformation is used to allow the parser to predict non-projective parses. The parser is trained using an imitation learning objective. The parser follows the actions predicted by the current weights, and at each state, determines which actions are compatible with the optimal parse that could be reached from the current state. The weights such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note that more than one action may be optimal for a given state. model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. moves (Optional[TransitionSystem]): This defines how the parse-state is created, updated and evaluated. If 'moves' is None, a new instance is created with `self.TransitionSystem()`. Defaults to `None`. update_with_oracle_cut_size (int): During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. 100 is a good default. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. scorer (Optional[Callable]): The scoring method. """ return DependencyParser( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq, beam_width=1, beam_density=0.0, beam_update_prob=0.0, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. incorrect_spans_key=None, scorer=scorer, ) @Language.factory( "beam_parser", assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"], default_config={ "beam_width": 8, "beam_density": 0.01, "beam_update_prob": 0.5, "moves": None, "update_with_oracle_cut_size": 100, "learn_tokens": False, "min_action_freq": 30, "model": DEFAULT_PARSER_MODEL, "scorer": {"@scorers": "spacy.parser_scorer.v1"}, }, default_score_weights={ "dep_uas": 0.5, "dep_las": 0.5, "dep_las_per_type": None, "sents_p": None, "sents_r": None, "sents_f": 0.0, }, ) def make_beam_parser( nlp: Language, name: str, model: Model, moves: Optional[TransitionSystem], update_with_oracle_cut_size: int, learn_tokens: bool, min_action_freq: int, beam_width: int, beam_density: float, beam_update_prob: float, scorer: Optional[Callable], ): """Create a transition-based DependencyParser component that uses beam-search. The dependency parser jointly learns sentence segmentation and labelled dependency parsing, and can optionally learn to merge tokens that had been over-segmented by the tokenizer. The parser uses a variant of the non-monotonic arc-eager transition-system described by Honnibal and Johnson (2014), with the addition of a "break" transition to perform the sentence segmentation. Nivre's pseudo-projective dependency transformation is used to allow the parser to predict non-projective parses. The parser is trained using a global objective. That is, it learns to assign probabilities to whole parses. model (Model): The model for the transition-based parser. The model needs to have a specific substructure of named components --- see the spacy.ml.tb_framework.TransitionModel for details. moves (Optional[TransitionSystem]): This defines how the parse-state is created, updated and evaluated. If 'moves' is None, a new instance is created with `self.TransitionSystem()`. Defaults to `None`. update_with_oracle_cut_size (int): During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. 100 is a good default. beam_width (int): The number of candidate analyses to maintain. beam_density (float): The minimum ratio between the scores of the first and last candidates in the beam. This allows the parser to avoid exploring candidates that are too far behind. This is mostly intended to improve efficiency, but it can also improve accuracy as deeper search is not always better. beam_update_prob (float): The chance of making a beam update, instead of a greedy update. Greedy updates are an approximation for the beam updates, and are faster to compute. learn_tokens (bool): Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. min_action_freq (int): The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. """ return DependencyParser( nlp.vocab, model, name, moves=moves, update_with_oracle_cut_size=update_with_oracle_cut_size, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, multitasks=[], learn_tokens=learn_tokens, min_action_freq=min_action_freq, # At some point in the future we can try to implement support for # partial annotations, perhaps only in the beam objective. incorrect_spans_key=None, scorer=scorer, ) def parser_score(examples, **kwargs): """Score a batch of examples. examples (Iterable[Example]): The examples to score. RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans and Scorer.score_deps. DOCS: https://spacy.io/api/dependencyparser#score """ def has_sents(doc): return doc.has_annotation("SENT_START") def dep_getter(token, attr): dep = getattr(token, attr) dep = token.vocab.strings.as_string(dep).lower() return dep results = {} results.update( Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) ) kwargs.setdefault("getter", dep_getter) kwargs.setdefault("ignore_labels", ("p", "punct")) results.update(Scorer.score_deps(examples, "dep", **kwargs)) del results["sents_per_type"] return results @registry.scorers("spacy.parser_scorer.v1") def make_parser_scorer(): return parser_score class DependencyParser(Parser): """Pipeline component for dependency parsing. DOCS: https://spacy.io/api/dependencyparser """ TransitionSystem = ArcEager def __init__( self, vocab, model, name="parser", moves=None, *, update_with_oracle_cut_size=100, min_action_freq=30, learn_tokens=False, beam_width=1, beam_density=0.0, beam_update_prob=0.0, multitasks=tuple(), incorrect_spans_key=None, scorer=parser_score, ): """Create a DependencyParser.""" super().__init__( vocab, model, name, moves, update_with_oracle_cut_size=update_with_oracle_cut_size, min_action_freq=min_action_freq, learn_tokens=learn_tokens, beam_width=beam_width, beam_density=beam_density, beam_update_prob=beam_update_prob, multitasks=multitasks, incorrect_spans_key=incorrect_spans_key, scorer=scorer, ) @property def postprocesses(self): output = [nonproj.deprojectivize] if self.cfg.get("learn_tokens") is True: output.append(merge_subtokens) return tuple(output) def add_multitask_objective(self, mt_component): self._multitasks.append(mt_component) def init_multitask_objectives(self, get_examples, nlp=None, **cfg): # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? for labeller in self._multitasks: labeller.model.set_dim("nO", len(self.labels)) if labeller.model.has_ref("output_layer"): labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) labeller.initialize(get_examples, nlp=nlp) @property def labels(self): labels = set() # Get the labels from the model by looking at the available moves for move in self.move_names: if "-" in move: label = remove_bilu_prefix(move) if DELIMITER in label: label = label.split(DELIMITER)[1] labels.add(label) return tuple(sorted(labels)) def scored_parses(self, beams): """Return two dictionaries with scores for each beam/doc that was processed: one containing (i, head) keys, and another containing (i, label) keys. """ head_scores = [] label_scores = [] for beam in beams: score_head_dict = defaultdict(float) score_label_dict = defaultdict(float) for score, parses in self.moves.get_beam_parses(beam): for head, i, label in parses: score_head_dict[(i, head)] += score score_label_dict[(i, label)] += score head_scores.append(score_head_dict) label_scores.append(score_label_dict) return head_scores, label_scores def _ensure_labels_are_added(self, docs): # This gives the parser a chance to add labels it's missing for a batch # of documents. However, this isn't desirable for the dependency parser, # because we instead have a label frequency cut-off and back off rare # labels to 'dep'. pass