Flag to ignore examples with mismatched raw/gold text (#4534)

* Flag to ignore examples with mismatched raw/gold text

After #4525, we're seeing some alignment failures on our OntoNotes data. I think we actually have fixes for most of these cases.

In general it's better to fix the data, but it seems good to allow the GoldCorpus class to just skip cases where the raw text doesn't
match up to the gold words. I think previously we were silently ignoring these cases.

* Try to fix test on Python 2.7
This commit is contained in:
Matthew Honnibal 2019-10-28 11:40:12 +01:00 committed by GitHub
parent 795699015c
commit f0ec7bcb79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 50 additions and 23 deletions

View File

@ -262,7 +262,8 @@ def train(
exits=1,
)
train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0,
ignore_misaligned=True
)
train_labels = set()
if textcat_multilabel:
@ -343,6 +344,7 @@ def train(
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
ignore_misaligned=True
)
if raw_text:
random.shuffle(raw_text)
@ -381,7 +383,8 @@ def train(
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc,
ignore_misaligned=True)
)
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
@ -398,7 +401,8 @@ def train(
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc,
ignore_misaligned=True)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)

View File

@ -11,7 +11,6 @@ import itertools
from pathlib import Path
import srsly
from . import _align
from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors
@ -73,12 +72,22 @@ def merge_sents(sents):
return [(m_deps, (m_cats, m_brackets))]
_NORM_MAP = {"``": '"', "''": '"'}
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
def _normalize(tokens):
def _normalize_for_alignment(tokens):
tokens = [w.replace(" ", "").lower() for w in tokens]
return [_NORM_MAP.get(word, word) for word in tokens]
output = []
for token in tokens:
token = token.replace(" ", "").lower()
for before, after in _ALIGNMENT_NORM_MAP:
token = token.replace(before, after)
output.append(token)
return output
class AlignmentError(ValueError):
    """Raised when two token sequences cannot be aligned.

    Subclasses ValueError, so existing callers that catch ValueError keep
    working, while callers that want to skip misaligned examples can catch
    this narrower type.
    """
def align(tokens_a, tokens_b):
@ -99,8 +108,8 @@ def align(tokens_a, tokens_b):
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
tokens_a = _normalize(tokens_a)
tokens_b = _normalize(tokens_b)
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0
a2b = numpy.empty(len(tokens_a), dtype="i")
b2a = numpy.empty(len(tokens_b), dtype="i")
@ -143,7 +152,7 @@ def align(tokens_a, tokens_b):
offset_a += len(b)
else:
assert "".join(tokens_a) != "".join(tokens_b)
raise ValueError(f"{tokens_a} and {tokens_b} is different texts.")
raise AlignmentError(f"{tokens_a} and {tokens_b} are different texts.")
return cost, a2b, b2a, a2b_multi, b2a_multi
@ -250,7 +259,8 @@ class GoldCorpus(object):
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0, orth_variant_level=0.0):
noise_level=0.0, orth_variant_level=0.0,
ignore_misaligned=False):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit)
@ -258,20 +268,23 @@ class GoldCorpus(object):
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True)
make_projective=True,
ignore_misaligned=ignore_misaligned)
yield from gold_docs
def train_docs_without_preprocessing(self, nlp, gold_preproc=False):
    """Yield the items produced by `iter_gold_docs` for the training tuples.

    Only `nlp`, `self.train_tuples` and `gold_preproc` are forwarded; every
    other `iter_gold_docs` option is left at its default.
    """
    yield from self.iter_gold_docs(
        nlp, self.train_tuples, gold_preproc=gold_preproc
    )
def dev_docs(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc)
def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, orth_variant_level=0.0, make_projective=False):
noise_level=0.0, orth_variant_level=0.0, make_projective=False,
ignore_misaligned=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
@ -280,10 +293,12 @@ class GoldCorpus(object):
docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
paragraph_tuples, gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective)
golds = cls._make_golds(docs, paragraph_tuples, make_projective,
ignore_misaligned=ignore_misaligned)
for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length:
yield doc, gold
if gold is not None:
if (not max_length) or len(doc) < max_length:
yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
@ -299,14 +314,22 @@ class GoldCorpus(object):
@classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective):
def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False):
if len(docs) != len(paragraph_tuples):
n_annots = len(paragraph_tuples)
raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
return [GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
make_projective=make_projective)
for doc, (sent_tuples, (cats, brackets))
in zip(docs, paragraph_tuples)]
golds = []
for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples):
try:
gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
make_projective=make_projective)
except AlignmentError:
if ignore_misaligned:
gold = None
else:
raise
golds.append(gold)
return golds
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):