From f5c551a43a73c8b43bb12434fcbe4dbc65f5467f Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Mon, 21 Oct 2019 12:20:28 +0200 Subject: [PATCH] Checks/errors related to ill-formed IOB input in CLI convert and debug-data (#4487) * Error for ill-formed input to iob_to_biluo() Check for empty label in iob_to_biluo(), which can result from ill-formed input. * Check for empty NER label in debug-data --- spacy/cli/debug_data.py | 3 +++ spacy/errors.py | 1 + spacy/gold.pyx | 2 ++ spacy/tests/test_gold.py | 12 +++++++++++- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index fe6cccf81..8161ddf45 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -206,6 +206,9 @@ def debug_data( missing_values, "value" if missing_values == 1 else "values" ) ) + for label in new_labels: + if len(label) == 0: + msg.fail("Empty label found in new labels") if new_labels: labels_with_counts = [ (label, count) diff --git a/spacy/errors.py b/spacy/errors.py index b2ccd3c49..23203d98a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -503,6 +503,7 @@ class Errors(object): "names: {names}") E175 = ("Can't remove rule for unknown match pattern ID: {key}") E176 = ("Alias '{alias}' is not defined in the Knowledge Base.") + E177 = ("Ill-formed IOB input detected: {tag}") @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 098df9ed2..1e626c4ed 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -522,6 +522,8 @@ def _consume_ent(tags): tags.pop(0) label = tag[2:] if length == 1: + if len(label) == 0: + raise ValueError(Errors.E177.format(tag=tag)) return ["U-" + label] else: start = "B-" + label diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 4f79c4463..234a91443 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags -from spacy.gold import spans_from_biluo_tags, GoldParse +from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json from spacy.lang.en import English from spacy.tokens import Doc @@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer): gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 +def test_iob_to_biluo(): + good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"] + good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"] + bad_iob = ["O", "O", "\"", "B-LOC", "I-LOC"] + converted_biluo = iob_to_biluo(good_iob) + assert good_biluo == converted_biluo + with pytest.raises(ValueError): + iob_to_biluo(bad_iob) + + def test_roundtrip_docs_to_json(): text = "I flew to Silicon Valley via London." cats = {"TRAVEL": 1.0, "BAKING": 0.0}