mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Checks/errors related to ill-formed IOB input in CLI convert and debug-data (#4487)
* Error for ill-formed input to iob_to_biluo(): check for an empty label in iob_to_biluo(), which can result from ill-formed input. * Check for an empty NER label in debug-data.
This commit is contained in:
parent
3195a8f170
commit
f5c551a43a
|
@ -206,6 +206,9 @@ def debug_data(
|
|||
missing_values, "value" if missing_values == 1 else "values"
|
||||
)
|
||||
)
|
||||
for label in new_labels:
|
||||
if len(label) == 0:
|
||||
msg.fail("Empty label found in new labels")
|
||||
if new_labels:
|
||||
labels_with_counts = [
|
||||
(label, count)
|
||||
|
|
|
@ -503,6 +503,7 @@ class Errors(object):
|
|||
"names: {names}")
|
||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -522,6 +522,8 @@ def _consume_ent(tags):
|
|||
tags.pop(0)
|
||||
label = tag[2:]
|
||||
if length == 1:
|
||||
if len(label) == 0:
|
||||
raise ValueError(Errors.E177.format(tag=tag))
|
||||
return ["U-" + label]
|
||||
else:
|
||||
start = "B-" + label
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||
from spacy.gold import spans_from_biluo_tags, GoldParse
|
||||
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
|
||||
from spacy.gold import GoldCorpus, docs_to_json
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
|
@ -87,6 +87,16 @@ def test_gold_ner_missing_tags(en_tokenizer):
|
|||
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
|
||||
|
||||
|
||||
def test_iob_to_biluo():
    """Check iob_to_biluo() on one well-formed and one ill-formed tag list.

    The well-formed IOB sequence must convert to the expected BILUO
    sequence; the sequence containing a stray '"' tag must raise ValueError.
    """
    well_formed_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
    expected_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
    # A bare quote character is not a valid IOB tag.
    malformed_iob = ["O", "O", "\"", "B-LOC", "I-LOC"]

    actual_biluo = iob_to_biluo(well_formed_iob)
    assert expected_biluo == actual_biluo

    with pytest.raises(ValueError):
        iob_to_biluo(malformed_iob)
|
||||
|
||||
|
||||
def test_roundtrip_docs_to_json():
|
||||
text = "I flew to Silicon Valley via London."
|
||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||
|
|
Loading…
Reference in New Issue
Block a user