spaCy/spacy/pipeline/dep_parser.pyx
Matthew Honnibal 8656a08777
Add beam_parser and beam_ner components for v3 (#6369)
* Get basic beam tests working

* Get basic beam tests working

* Compile _beam_utils

* Remove prints

* Test beam density

* Beam parser seems to train

* Draft beam NER

* Upd beam

* Add hypothesis as dev dependency

* Implement missing is-gold-parse method

* Implement early update

* Fix state hashing

* Fix test

* Fix test

* Default to non-beam in parser constructor

* Improve oracle for beam

* Start refactoring beam

* Update test

* Refactor beam

* Update nn

* Refactor beam and weight by cost

* Update ner beam settings

* Update test

* Add __init__.pxd

* Upd test

* Fix test

* Upd test

* Fix test

* Remove ring buffer history from StateC

* WIP change arc-eager transitions

* Add state tests

* Support ternary sent start values

* Fix arc eager

* Fix NER

* Pass oracle cut size for beam

* Fix ner test

* Fix beam

* Improve StateC.clone

* Improve StateClass.borrow

* Work directly with StateC, not StateClass

* Remove print statements

* Fix state copy

* Improve state class

* Refactor parser oracles

* Fix arc eager oracle

* Fix arc eager oracle

* Use a vector to implement the stack

* Refactor state data structure

* Fix alignment of sent start

* Add get_aligned_sent_starts method

* Add test for ae oracle when bad sentence starts

* Fix sentence segment handling

* Avoid Reduce that inserts illegal sentence

* Update preset SBD test

* Fix test

* Remove prints

* Fix sent starts in Example

* Improve python API of StateClass

* Tweak comments and debug output of arc eager

* Upd test

* Fix state test

* Fix state test
2020-12-13 09:08:32 +08:00

260 lines
9.5 KiB
Cython

# cython: infer_types=True, profile=True, binding=True
from typing import Optional, Iterable
from thinc.api import Model, Config
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager
from .functions import merge_subtokens
from ..language import Language
from ._parser_internals import nonproj
from ..scorer import Scorer
from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int
):
"""Create a transition-based DependencyParser component. The dependency parser
jointly learns sentence segmentation and labelled dependency parsing, and can
optionally learn to merge tokens that had been over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using an imitation learning objective. The parser follows
the actions predicted by the current weights, and at each state, determines
which actions are compatible with the optimal parse that could be reached
from the current state. The weights such that the scores assigned to the
set of optimal actions is increased, while scores assigned to other
actions are decreased. Note that more than one action may be optimal for
a given state.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
update_with_oracle_cut_size (int):
During training, cut long sequences into shorter segments by creating
intermediate states based on the gold-standard history. The model is
not very sensitive to this parameter, so you usually won't need to change
it. 100 is a good default.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq,
beam_width=1,
beam_density=0.0,
beam_update_prob=0.0,
)
@Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
default_config={
"beam_width": 8,
"beam_density": 0.01,
"beam_update_prob": 0.5,
"moves": None,
"update_with_oracle_cut_size": 100,
"learn_tokens": False,
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_beam_parser(
nlp: Language,
name: str,
model: Model,
moves: Optional[list],
update_with_oracle_cut_size: int,
learn_tokens: bool,
min_action_freq: int,
beam_width: int,
beam_density: float,
beam_update_prob: float,
):
"""Create a transition-based DependencyParser component that uses beam-search.
The dependency parser jointly learns sentence segmentation and labelled
dependency parsing, and can optionally learn to merge tokens that had been
over-segmented by the tokenizer.
The parser uses a variant of the non-monotonic arc-eager transition-system
described by Honnibal and Johnson (2014), with the addition of a "break"
transition to perform the sentence segmentation. Nivre's pseudo-projective
dependency transformation is used to allow the parser to predict
non-projective parses.
The parser is trained using a global objective. That is, it learns to assign
probabilities to whole parses.
model (Model): The model for the transition-based parser. The model needs
to have a specific substructure of named components --- see the
spacy.ml.tb_framework.TransitionModel for details.
moves (List[str]): A list of transition names. Inferred from the data if not
provided.
beam_width (int): The number of candidate analyses to maintain.
beam_density (float): The minimum ratio between the scores of the first and
last candidates in the beam. This allows the parser to avoid exploring
candidates that are too far behind. This is mostly intended to improve
efficiency, but it can also improve accuracy as deeper search is not
always better.
beam_update_prob (float): The chance of making a beam update, instead of a
greedy update. Greedy updates are an approximation for the beam updates,
and are faster to compute.
learn_tokens (bool): Whether to learn to merge subtokens that are split
relative to the gold standard. Experimental.
min_action_freq (int): The minimum frequency of labelled actions to retain.
Rarer labelled actions have their label backed-off to "dep". While this
primarily affects the label accuracy, it can also affect the attachment
structure, as the labels are used to represent the pseudo-projectivity
transformation.
"""
return DependencyParser(
nlp.vocab,
model,
name,
moves=moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
beam_width=beam_width,
beam_density=beam_density,
beam_update_prob=beam_update_prob,
multitasks=[],
learn_tokens=learn_tokens,
min_action_freq=min_action_freq
)
cdef class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
DOCS: https://nightly.spacy.io/api/dependencyparser
"""
TransitionSystem = ArcEager
@property
def postprocesses(self):
output = [nonproj.deprojectivize]
if self.cfg.get("learn_tokens") is True:
output.append(merge_subtokens)
return tuple(output)
def add_multitask_objective(self, mt_component):
self._multitasks.append(mt_component)
def init_multitask_objectives(self, get_examples, nlp=None, **cfg):
# TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ?
for labeller in self._multitasks:
labeller.model.set_dim("nO", len(self.labels))
if labeller.model.has_ref("output_layer"):
labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels))
labeller.initialize(get_examples, nlp=nlp)
@property
def labels(self):
labels = set()
# Get the labels from the model by looking at the available moves
for move in self.move_names:
if "-" in move:
label = move.split("-")[1]
if "||" in label:
label = label.split("||")[1]
labels.add(label)
return tuple(sorted(labels))
def score(self, examples, **kwargs):
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans
and Scorer.score_deps.
DOCS: https://nightly.spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
validate_examples(examples, "DependencyParser.score")
def dep_getter(token, attr):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results