diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index a7a26b6b3..425254320 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -71,12 +71,13 @@ cdef class Example: self._alignment = Alignment(spacy_words, gold_words) return self._alignment - def get_aligned(self, field): + def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" # TODO: This is probably wrong. I just bashed this out and there's probably # all sorts of edge-cases. alignment = self.alignment i2j_multi = alignment.i2j_multi + j2i_multi = alignment.j2i_multi gold_to_cand = alignment.gold_to_cand cand_to_gold = alignment.cand_to_gold @@ -92,8 +93,18 @@ cdef class Example: else: output.append(None) else: - output.append([gold_values[gold_i]]) - output = [vocab.strings[o] for o in output] + output.append(gold_values[gold_i]) + + if field in ["ENT_IOB", "ENT_TYPE"]: + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(gold_to_cand): + if cand_j is None: + if j in j2i_multi: + i = j2i_multi[j] + output[i] = gold_values[j] + + if as_string: + output = [vocab.strings[o] if o is not None else o for o in output] return output def to_dict(self): diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 897502f93..5cf3b4c01 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,12 +1,10 @@ from spacy.errors import AlignmentError from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align -from spacy.gold import GoldCorpus, docs_to_json, DocAnnotation +from spacy.gold import GoldCorpus, docs_to_json from spacy.gold.example import Example from spacy.lang.en import English from spacy.syntax.nonproj import is_nonproj_tree -from spacy.syntax.gold_parse import GoldParse, get_parses_from_example -from spacy.syntax.gold_parse import get_parses_from_example from spacy.tokens import Doc from spacy.util import get_words_and_spaces, compounding, minibatch import pytest @@ -158,12 +156,10 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer): spaces = [True, True, False, False] doc = Doc(en_vocab, words=words, spaces=spaces) entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] - gp = GoldParse( - doc, - words=["I", "flew", "to", "San", "Francisco", "Valley", "."], - entities=entities, - ) - assert gp.ner == ["O", "O", "U-LOC", "O"] + gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + example = Example.from_dict(doc, {"words": gold_words, "entities": entities}) + assert example.get_aligned("ENT_IOB") == [2, 2, 1, 2] + assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", ""] # many-to-one words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] diff --git a/spacy/tests/test_new_example.py b/spacy/tests/test_new_example.py index b7af77149..0b2846c14 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/test_new_example.py @@ -40,7 +40,7 @@ def test_Example_from_dict_with_tags(pred_words, annots): example = Example.from_dict(predicted, annots) for i, token in enumerate(example.reference): assert token.tag_ == annots["tags"][i] - aligned_tags = example.get_aligned("tag") + aligned_tags = example.get_aligned("tag", as_string=True) assert aligned_tags == ["NN" for _ in predicted] @@ -52,7 +52,7 @@ def test_aligned_tags(): vocab = Vocab() predicted = Doc(vocab, words=pred_words) example = Example.from_dict(predicted, annots) - aligned_tags = example.get_aligned("tag") + aligned_tags = example.get_aligned("tag", as_string=True) assert aligned_tags == ["VERB", "DET", None, "SCONJ", "PRON", "VERB", "VERB"] @@ -64,7 +64,7 @@ def test_aligned_tags_multi(): vocab = Vocab() predicted = Doc(vocab, words=pred_words) example = Example.from_dict(predicted, annots) - aligned_tags = example.get_aligned("tag") + aligned_tags = example.get_aligned("tag", as_string=True) assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"] @@ -159,14 +159,11 @@ def test_Example_from_dict_with_entities(annots): vocab = Vocab() predicted = Doc(vocab, words=annots["words"]) example = Example.from_dict(predicted, annots) + assert len(list(example.reference.ents)) == 2 - assert example.reference[0].ent_iob_ == "O" - assert example.reference[1].ent_iob_ == "O" - assert example.reference[2].ent_iob_ == "B" - assert example.reference[3].ent_iob_ == "I" - assert example.reference[4].ent_iob_ == "O" - assert example.reference[5].ent_iob_ == "B" - assert example.reference[6].ent_iob_ == "O" + assert [example.reference[i].ent_iob_ for i in range(7)] == ["O", "O", "B", "I", "O", "B", "O"] + assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2, 3, 2] + assert example.reference[2].ent_type_ == "LOC" assert example.reference[3].ent_type_ == "LOC" assert example.reference[5].ent_type_ == "LOC"