From 43d41d6bb646a5b659b09d433a681b96062bdc5c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 16 Jun 2020 15:30:05 +0200 Subject: [PATCH] allow None as BILUO annotation --- spacy/gold/example.pyx | 12 ++++++++---- spacy/gold/iob_utils.py | 7 +++++-- spacy/tests/test_gold.py | 3 ++- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 09a32ee4d..7dfa76221 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -285,11 +285,15 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces=None): ent_iobs = [] ent_types = [] for iob_tag in biluo_to_iob(biluo): - ent_iobs.append(iob_tag.split("-")[0]) - if iob_tag.startswith("I") or iob_tag.startswith("B"): - ent_types.append(iob_tag.split("-", 1)[1]) - else: + if iob_tag is None: + ent_iobs.append("") ent_types.append("") + else: + ent_iobs.append(iob_tag.split("-")[0]) + if iob_tag.startswith("I") or iob_tag.startswith("B"): + ent_types.append(iob_tag.split("-", 1)[1]) + else: + ent_types.append("") return ent_iobs, ent_types def _parse_links(vocab, words, links, entities): diff --git a/spacy/gold/iob_utils.py b/spacy/gold/iob_utils.py index 6d16cf1a5..c74ef5671 100644 --- a/spacy/gold/iob_utils.py +++ b/spacy/gold/iob_utils.py @@ -15,8 +15,11 @@ def iob_to_biluo(tags): def biluo_to_iob(tags): out = [] for tag in tags: - tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) - out.append(tag) + if tag is None: + out.append(tag) + else: + tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1) + out.append(tag) return out diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index ea9c460ac..a032cf6e2 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -237,7 +237,8 @@ def test_biluo_spans(en_tokenizer): def test_gold_ner_missing_tags(en_tokenizer): doc = en_tokenizer("I flew to Silicon Valley via London.") biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] - gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 + example = Example.from_dict(doc, {"entities": biluo_tags}) + assert example.get_aligned("ENT_IOB") == [0, 2, 2, 3, 1, 2, 3, 2] def test_iob_to_biluo():