Getting scores out of beam_parser (#6684)

* clean up of ner tests

* beam_parser tests

* implement get_beam_parses and scored_parses for the dep parser

* we don't have to add the parse if there are no arcs
This commit is contained in:
Sofie Van Landeghem 2021-01-07 06:28:27 +01:00 committed by GitHub
parent 3983bc6b1e
commit 8c1a23209f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 201 additions and 15 deletions

View File

@ -4,4 +4,4 @@ from .transition_system cimport Transition, TransitionSystem
cdef class ArcEager(TransitionSystem):
pass
cdef get_arcs(self, StateC* state)

View File

@ -1,6 +1,7 @@
# cython: profile=True, cdivision=True, infer_types=True
from cymem.cymem cimport Pool, Address
from libc.stdint cimport int32_t
from libcpp.vector cimport vector
from collections import defaultdict, Counter
@ -10,9 +11,9 @@ from ...structs cimport TokenC
from ...tokens.doc cimport Doc, set_children_from_heads
from ...training.example cimport Example
from .stateclass cimport StateClass
from ._state cimport StateC
from ._state cimport StateC, ArcC
from ...errors import Errors
from thinc.extra.search cimport Beam
cdef weight_t MIN_SCORE = -90000
cdef attr_t SUBTOK_LABEL = hash_string(u'subtok')
@ -707,6 +708,28 @@ cdef class ArcEager(TransitionSystem):
doc.c[i].dep = self.root_label
set_children_from_heads(doc.c, 0, doc.length)
def get_beam_parses(self, Beam beam):
    """Collect the scored parses held by the final states of a beam.

    Returns a list of (prob, parse) tuples, one per final state that has
    at least one arc. Each parse is a list of (head, child, label) tuples,
    where the label is looked up in self.strings.
    """
    scored = []
    beam_probs = beam.probs
    for idx in range(beam.size):
        state = <StateC*>beam.at(idx)
        if not state.is_final():
            continue
        state_arcs = self.get_arcs(state)
        # A final state without any arcs contributes nothing to the result.
        if not state_arcs:
            continue
        triples = [
            (arc["head"], arc["child"], self.strings[arc["label"]])
            for arc in state_arcs
        ]
        scored.append((beam_probs[idx], triples))
    return scored
cdef get_arcs(self, StateC* state):
    """Return the arcs of the given state as a Python list."""
    cdef vector[ArcC] arc_buffer
    # The vector is handed over by address; presumably state.get_arcs
    # fills it in place (confirm against StateC).
    state.get_arcs(&arc_buffer)
    return list(arc_buffer)
def has_gold(self, Example eg, start=0, end=None):
for word in eg.y[start:end]:
if word.dep != 0:

View File

@ -257,6 +257,7 @@ cdef class BiluoPushDown(TransitionSystem):
parse = []
for j in range(state._ents.size()):
ent = state._ents.at(j)
if ent.start != -1 and ent.end != -1:
parse.append((ent.start, ent.end, self.strings[ent.label]))
parses.append((prob, parse))
return parses

View File

@ -1,4 +1,5 @@
# cython: infer_types=True, profile=True, binding=True
from collections import defaultdict
from typing import Optional, Iterable
from thinc.api import Model, Config
@ -258,3 +259,20 @@ cdef class DependencyParser(Parser):
results.update(Scorer.score_deps(examples, "dep", **kwargs))
del results["sents_per_type"]
return results
def scored_parses(self, beams):
    """Sum the beam scores per (i, head) and per (i, label) pair.

    Returns two lists with one dictionary per beam/doc that was processed,
    in the order of `beams`: the first holds (i, head) keys, the second
    holds (i, label) keys. The dictionaries are defaultdicts, so a key
    that never occurred in the beam reads as 0.0.
    """
    all_head_scores = []
    all_label_scores = []
    for beam in beams:
        per_head = defaultdict(float)
        per_label = defaultdict(float)
        for prob, arcs in self.moves.get_beam_parses(beam):
            # each arc is (head, child, label); results are keyed on the child
            for head, child, label in arcs:
                per_head[(child, head)] += prob
                per_label[(child, label)] += prob
        all_head_scores.append(per_head)
        all_label_scores.append(per_label)
    return all_head_scores, all_label_scores

View File

@ -1,5 +1,3 @@
from collections import defaultdict
import pytest
from numpy.testing import assert_equal
from spacy.attrs import ENT_IOB
@ -305,7 +303,7 @@ def test_block_ner():
@pytest.mark.parametrize("use_upper", [True, False])
def test_overfitting_IO(use_upper):
# Simple test to try and quickly overfit the NER component - ensuring the ML models work correctly
# Simple test to try and quickly overfit the NER component
nlp = English()
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
train_examples = []
@ -386,7 +384,6 @@ def test_beam_ner_scores():
test_text = "I like London."
doc = nlp.make_doc(test_text)
docs = [doc]
ner = nlp.get_pipe("beam_ner")
beams = ner.predict(docs)
entity_scores = ner.scored_ents(beams)[0]
@ -423,7 +420,6 @@ def test_beam_overfitting_IO():
# test the scores from the beam
test_text = "I like London."
docs = [nlp.make_doc(test_text)]
ner = nlp.get_pipe("beam_ner")
beams = ner.predict(docs)
entity_scores = ner.scored_ents(beams)[0]
assert entity_scores[(2, 3, "LOC")] == 1.0
@ -433,7 +429,7 @@ def test_beam_overfitting_IO():
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
docs2 = [nlp2(test_text)]
docs2 = [nlp2.make_doc(test_text)]
ner2 = nlp2.get_pipe("beam_ner")
beams2 = ner2.predict(docs2)
entity_scores2 = ner2.scored_ents(beams2)[0]

View File

@ -28,6 +28,26 @@ TRAIN_DATA = [
]
# Two annotations of the same sentence that disagree on heads and deps.
# test_beam_parser_scores updates the beam parser on both of them.
CONFLICTING_DATA = [
(
"I like London and Berlin.",
{
"heads": [1, 1, 1, 2, 2, 1],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
},
),
# Same text, but every token is attached to token 0.
(
"I like London and Berlin.",
{
"heads": [0, 0, 0, 0, 0, 0],
"deps": ["ROOT", "nsubj", "nsubj", "cc", "conj", "punct"],
},
),
]
# Tolerance used by the beam tests when comparing scores to expected values.
eps = 0.01
def test_parser_root(en_vocab):
words = ["i", "do", "n't", "have", "other", "assistance"]
heads = [3, 3, 3, 3, 5, 3]
@ -185,26 +205,31 @@ def test_parser_set_sent_starts(en_vocab):
assert token.head in sent
def test_overfitting_IO():
# Simple test to try and quickly overfit the dependency parser - ensuring the ML models work correctly
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
def test_overfitting_IO(pipe_name):
# Simple test to try and quickly overfit the dependency parser (normal or beam)
nlp = English()
parser = nlp.add_pipe("parser")
parser = nlp.add_pipe(pipe_name)
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []):
parser.add_label(dep)
optimizer = nlp.initialize()
for i in range(100):
# run overfitting
for i in range(150):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.0001
assert losses[pipe_name] < 0.0001
# test the trained model
test_text = "I like securities."
doc = nlp(test_text)
assert doc[0].dep_ == "nsubj"
assert doc[2].dep_ == "dobj"
assert doc[3].dep_ == "punct"
assert doc[0].head.i == 1
assert doc[2].head.i == 1
assert doc[3].head.i == 1
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
@ -213,6 +238,9 @@ def test_overfitting_IO():
assert doc2[0].dep_ == "nsubj"
assert doc2[2].dep_ == "dobj"
assert doc2[3].dep_ == "punct"
assert doc2[0].head.i == 1
assert doc2[2].head.i == 1
assert doc2[3].head.i == 1
# Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
texts = [
@ -226,3 +254,123 @@ def test_overfitting_IO():
no_batch_deps = [doc.to_array([DEP]) for doc in [nlp(text) for text in texts]]
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
def test_beam_parser_scores():
    """Confidence values can be read out of the beam_parser pipe."""
    nlp = English()
    parser = nlp.add_pipe(
        "beam_parser",
        config={"beam_width": 16, "beam_density": 0.0001},
    )
    train_examples = []
    for text, annotations in CONFLICTING_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()

    # a handful of updates on the conflicting annotations
    for _ in range(10):
        nlp.update(train_examples, sgd=optimizer, losses={})

    # read the scores from the beam for a single doc
    doc = nlp.make_doc("I like securities.")
    beams = parser.predict([doc])
    head_scores, label_scores = parser.scored_parses(beams)
    doc_head_scores = head_scores[0]
    doc_label_scores = label_scores[0]

    # every score has to lie within [0, 1], allowing for eps
    lower, upper = 0 - eps, 1 + eps
    n_tokens = len(doc)
    for child in range(n_tokens):
        for label in parser.labels:
            assert lower <= doc_label_scores[(child, label)] <= upper
        for head in range(n_tokens):
            assert lower <= doc_head_scores[(child, head)] <= upper
def _assert_overfit_beam_scores(head_scores, label_scores):
    """Check one doc's beam scores against the overfitted parse of "I like securities.".

    head_scores maps (i, head) keys and label_scores maps (i, label) keys to
    scores, as returned per doc by scored_parses.
    """
    # label annotations: 0=nsubj, 2=dobj, 3=punct
    gold_labels = {0: "nsubj", 2: "dobj", 3: "punct"}
    for token, gold_label in gold_labels.items():
        for label in ("nsubj", "dobj", "punct"):
            expected = 1.0 if label == gold_label else 0.0
            # abs=eps: a relative tolerance is zero-width around an expected
            # 0.0, which would leave only pytest's default 1e-12 tolerance.
            assert label_scores[(token, label)] == pytest.approx(expected, abs=eps)
    # head annotations: the root is the token at index 1
    for token in gold_labels:
        for head in (0, 1, 2):
            expected = 1.0 if head == 1 else 0.0
            assert head_scores[(token, head)] == pytest.approx(expected, abs=eps)


def test_beam_overfitting_IO():
    """Overfit the beam dependency parser and check its scores before and after IO."""
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
    }
    parser = nlp.add_pipe("beam_parser", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for dep in annotations.get("deps", []):
            parser.add_label(dep)
    optimizer = nlp.initialize()
    # run overfitting
    for _ in range(150):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_parser"] < 0.0001

    # test the scores from the beam
    test_text = "I like securities."
    docs = [nlp.make_doc(test_text)]
    beams = parser.predict(docs)
    head_scores, label_scores = parser.scored_parses(beams)
    # we only processed one document
    _assert_overfit_beam_scores(head_scores[0], label_scores[0])

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        parser2 = nlp2.get_pipe("beam_parser")
        beams2 = parser2.predict(docs2)
        head_scores2, label_scores2 = parser2.scored_parses(beams2)
        # we only processed one document
        _assert_overfit_beam_scores(head_scores2[0], label_scores2[0])