bugfix excl Span.end in iob2docs

svlandeg 2020-06-23 17:20:41 +02:00
parent 7c76a2b796
commit 28ad71c187
3 changed files with 11 additions and 14 deletions


@@ -59,6 +59,6 @@ def read_iob(raw_sents, vocab, n_sents):
             doc[i].is_sent_start = sent_start
         biluo = iob_to_biluo(iob)
         entities = tags_to_entities(biluo)
-        doc.ents = [Span(doc, start=s, end=e, label=L) for (L, s, e) in entities]
+        doc.ents = [Span(doc, start=s, end=e+1, label=L) for (L, s, e) in entities]
         docs.append(doc)
     return docs
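Why the +1 matters: tags_to_entities reports entity boundaries with an inclusive end token index, while spacy.tokens.Span expects an exclusive end, so the old code silently dropped the last token of every multi-token entity. A minimal sketch of the fixed conversion, assuming the spacy.gold import path used on this branch (later releases moved these helpers elsewhere):

    from spacy.gold import iob_to_biluo, tags_to_entities
    from spacy.lang.en import English
    from spacy.tokens import Span

    nlp = English()
    doc = nlp("I like New York")

    biluo = iob_to_biluo(["O", "O", "I-GPE", "I-GPE"])  # ['O', 'O', 'B-GPE', 'L-GPE']
    entities = tags_to_entities(biluo)                  # [('GPE', 2, 3)], end is inclusive

    # Span takes an exclusive end, so the inclusive end index needs +1:
    doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
    assert doc.ents[0].text == "New York"  # with end=e this would only cover "New"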


@@ -172,6 +172,8 @@ def offsets_from_biluo_tags(doc, tags):
 
 
 def tags_to_entities(tags):
+    """ Note that the end index returned by this function is inclusive.
+    To use it for Span creation, increment the end by 1."""
     entities = []
     start = None
     for i, tag in enumerate(tags):
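The docstring added above pins down the convention: tags_to_entities yields (label, start, end) tuples where end is the index of the last token inside the entity, so a single-token (U-) entity comes back with start == end. A quick illustration, under the same import-path assumption as above:

    from spacy.gold import tags_to_entities

    tags = ["O", "U-PER", "O", "B-GPE", "I-GPE", "L-GPE"]
    print(tags_to_entities(tags))
    # [('PER', 1, 1), ('GPE', 3, 5)] -- token 5 is still inside the GPE entity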


@@ -10,7 +10,6 @@ from spacy.cli.pretrain import make_docs
 # from spacy.gold.converters import conllu2docs
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json():
     # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
     lines = [
@@ -35,7 +34,6 @@ def test_cli_converters_conllu2json():
     assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"]
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize(
     "lines",
     [
@@ -73,7 +71,6 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
     assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_conllu2json_subtokens():
     # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
     lines = [
@@ -117,7 +114,6 @@ def test_cli_converters_conllu2json_subtokens():
     assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"]
 
 
-@pytest.mark.xfail
 def test_cli_converters_iob2json(en_vocab):
     lines = [
         "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -142,10 +138,7 @@ def test_cli_converters_iob2json(en_vocab):
     for ent in converted_docs[0].ents:
         assert(ent.text in ["New York City", "London"])
-
-    assert converted["paragraphs"][0]["entities"] == [(18, 26, 'GPE'), (52, 60, 'GPE'), (86, 94, 'GPE'), (120, 128, 'GPE')]
 
 
-@pytest.mark.xfail
 def test_cli_converters_conll_ner2json():
     lines = [
         "-DOCSTART- -X- O O",
@@ -197,19 +190,21 @@ def test_cli_converters_conll_ner2json():
     ]
     input_data = "\n".join(lines)
     converted_docs = conll_ner2docs(input_data, n_sents=10)
+    assert len(converted_docs) == 1
     converted = docs_to_json(converted_docs)
-    assert len(converted) == 1
-    assert converted[0]["id"] == 0
-    assert len(converted[0]["paragraphs"]) == 1
-    assert len(converted[0]["paragraphs"][0]["sentences"]) == 5
+    assert converted["id"] == 0
+    assert len(converted["paragraphs"]) == 1
+    assert len(converted["paragraphs"][0]["sentences"]) == 5
     for i in range(0, 5):
-        sent = converted[0]["paragraphs"][0]["sentences"][i]
+        sent = converted["paragraphs"][0]["sentences"][i]
         assert len(sent["tokens"]) == 8
         tokens = sent["tokens"]
         # fmt: off
         assert [t["orth"] for t in tokens] == ["I", "like", "London", "and", "New", "York", "City", "."]
-        assert [t["ner"] for t in tokens] == ["O", "O", "U-GPE", "O", "B-GPE", "I-GPE", "L-GPE", "O"]
         # fmt: on
+    assert len(converted_docs[0].ents) == 10
+    for ent in converted_docs[0].ents:
+        assert (ent.text in ["New York City", "London"])
 
 
 def test_pretrain_make_docs():
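The reworked assertions reflect that docs_to_json returns one dict (with id and paragraphs keys, one paragraph per Doc) rather than a list of dicts, hence converted["id"] instead of converted[0]["id"]. A rough sketch of that shape, with the caveats that the import path is assumed for this branch and that sentence boundaries are set by hand because docs_to_json iterates over doc.sents:

    from spacy.gold import docs_to_json
    from spacy.lang.en import English

    nlp = English()
    doc = nlp("I like London .")
    for token in doc:
        # mark explicit sentence starts; docs_to_json needs boundaries
        token.is_sent_start = token.i == 0
    converted = docs_to_json([doc])
    assert converted["id"] == 0
    assert len(converted["paragraphs"]) == 1
    assert len(converted["paragraphs"][0]["sentences"]) == 1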