spaCy/spacy/tests/test_gold.py

# coding: utf-8
from __future__ import unicode_literals

from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse
from spacy.tokens import Doc
import pytest


def test_gold_biluo_U(en_vocab):
    """Expect a U- tag when the entity offsets cover exactly one token."""
    start_char = len("I flew to ")
    end_char = len("I flew to London")
    doc = Doc(
        en_vocab,
        words=["I", "flew", "to", "London", "."],
        spaces=[True, True, True, False, True],
    )
    expected = ["O", "O", "O", "U-LOC", "O"]
    assert biluo_tags_from_offsets(doc, [(start_char, end_char, "LOC")]) == expected


def test_gold_biluo_BL(en_vocab):
    """Expect B- then L- tags when the entity offsets cover two tokens."""
    start_char = len("I flew to ")
    end_char = len("I flew to San Francisco")
    doc = Doc(
        en_vocab,
        words=["I", "flew", "to", "San", "Francisco", "."],
        spaces=[True, True, True, True, False, True],
    )
    expected = ["O", "O", "O", "B-LOC", "L-LOC", "O"]
    assert biluo_tags_from_offsets(doc, [(start_char, end_char, "LOC")]) == expected


def test_gold_biluo_BIL(en_vocab):
    """Expect B-, I-, L- tags when the entity offsets cover three tokens."""
    start_char = len("I flew to ")
    end_char = len("I flew to San Francisco Valley")
    doc = Doc(
        en_vocab,
        words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
        spaces=[True, True, True, True, True, False, True],
    )
    expected = ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
    assert biluo_tags_from_offsets(doc, [(start_char, end_char, "LOC")]) == expected


def test_gold_biluo_overlap(en_vocab):
    """Expect a ValueError when two entity spans cover the same tokens."""
    doc = Doc(
        en_vocab,
        words=["I", "flew", "to", "San", "Francisco", "Valley", "."],
        spaces=[True, True, True, True, True, False, True],
    )
    # Both spans start at the same character; the second is a prefix of the
    # first, so they overlap on "San Francisco".
    longer_span = (len("I flew to "), len("I flew to San Francisco Valley"), "LOC")
    shorter_span = (len("I flew to "), len("I flew to San Francisco"), "LOC")
    with pytest.raises(ValueError):
        biluo_tags_from_offsets(doc, [longer_span, shorter_span])


def test_gold_biluo_misalign(en_vocab):
    """Expect "-" tags when the entity end offset falls inside a token.

    The last token here is "Valley." (with the period attached), while the
    entity ends right after "Valley", so the span does not line up with the
    token boundaries.
    """
    start_char = len("I flew to ")
    end_char = len("I flew to San Francisco Valley")
    doc = Doc(
        en_vocab,
        words=["I", "flew", "to", "San", "Francisco", "Valley."],
        spaces=[True, True, True, True, True, False],
    )
    expected = ["O", "O", "O", "-", "-", "-"]
    assert biluo_tags_from_offsets(doc, [(start_char, end_char, "LOC")]) == expected


def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    """Check offsets -> BILUO tags and BILUO tags -> offsets against each other."""
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    char_spans = [(10, 24, "LOC"), (29, 35, "GPE")]
    tag_sequence = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    # Forward direction: character offsets to per-token tags.
    assert biluo_tags_from_offsets(doc, char_spans) == tag_sequence
    # Reverse direction: per-token tags back to character offsets.
    assert offsets_from_biluo_tags(doc, tag_sequence) == char_spans


def test_biluo_spans(en_tokenizer):
    """Check the text and label of the spans built from a BILUO tag list."""
    tag_sequence = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    spans = spans_from_biluo_tags(doc, tag_sequence)
    # (text, label) pairs in the order the spans are expected to come back.
    expected = [("Silicon Valley", "LOC"), ("London", "GPE")]
    assert len(spans) == len(expected)
    for span, (text, label) in zip(spans, expected):
        assert span.text == text
        assert span.label_ == label


def test_gold_ner_missing_tags(en_tokenizer):
    """Smoke test: build a GoldParse whose first BILUO tag is None.

    There is no assertion; the test passes as long as construction does not
    raise.
    """
    tags_with_gap = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    GoldParse(doc, entities=tags_with_gap)
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`# coding: utf-8`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`from __future__ import unicode_literals`

💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags`
💫 Improve handling of missing NER tags (closes #2603) (#3341) * Improve handling of missing NER tags GoldParse can accept missing NER tags, if entities is provided in BILUO format (rather than as spans). Missing tags can be provided as None values. Fix bug that occurred when first tag was a None value. Closes #2603. * Document specification of missing NER tags. 2019-02-27 14:06:32 +03:00			`from spacy.gold import spans_from_biluo_tags, GoldParse`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.tokens import Doc`
biluo_tags_from_offsets throw exception for overlapping entities (#4021) * Check whether two entities overlap - biluo_gold_biluo_overlap now throw exception when entities passed in have overlaps - added unit test * SCA agreement 2019-08-15 19:13:32 +03:00			`import pytest`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00
Tidy up and auto-format 2019-08-18 16:09:16 +03:00
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_U(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`words = ["I", "flew", "to", "London", "."]`
			`spaces = [True, True, True, False, True]`
			`doc = Doc(en_vocab, words=words, spaces=spaces)`
			`entities = [(len("I flew to "), len("I flew to London"), "LOC")]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`tags = biluo_tags_from_offsets(doc, entities)`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert tags == ["O", "O", "O", "U-LOC", "O"]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00

Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_BL(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`words = ["I", "flew", "to", "San", "Francisco", "."]`
			`spaces = [True, True, True, True, False, True]`
			`doc = Doc(en_vocab, words=words, spaces=spaces)`
			`entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`tags = biluo_tags_from_offsets(doc, entities)`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00

Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_BIL(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]`
			`spaces = [True, True, True, True, True, False, True]`
			`doc = Doc(en_vocab, words=words, spaces=spaces)`
			`entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`tags = biluo_tags_from_offsets(doc, entities)`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00
Tidy up and auto-format 2019-08-18 16:09:16 +03:00
biluo_tags_from_offsets throw exception for overlapping entities (#4021) * Check whether two entities overlap - biluo_gold_biluo_overlap now throw exception when entities passed in have overlaps - added unit test * SCA agreement 2019-08-15 19:13:32 +03:00			`def test_gold_biluo_overlap(en_vocab):`
			`words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]`
			`spaces = [True, True, True, True, True, False, True]`
			`doc = Doc(en_vocab, words=words, spaces=spaces)`
Tidy up and auto-format 2019-08-18 16:09:16 +03:00			`entities = [`
			`(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),`
			`(len("I flew to "), len("I flew to San Francisco"), "LOC"),`
			`]`
biluo_tags_from_offsets throw exception for overlapping entities (#4021) * Check whether two entities overlap - biluo_gold_biluo_overlap now throw exception when entities passed in have overlaps - added unit test * SCA agreement 2019-08-15 19:13:32 +03:00			`with pytest.raises(ValueError):`
Tidy up and auto-format 2019-08-18 16:09:16 +03:00			`biluo_tags_from_offsets(doc, entities)`

Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_misalign(en_vocab):`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`words = ["I", "flew", "to", "San", "Francisco", "Valley."]`
			`spaces = [True, True, True, True, True, False]`
			`doc = Doc(en_vocab, words=words, spaces=spaces)`
			`entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`tags = biluo_tags_from_offsets(doc, entities)`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert tags == ["O", "O", "O", "-", "-", "-"]`
Add offsets_from_biluo_tags helper and tests (see #1626) 2017-11-26 18:38:01 +03:00

			`def test_roundtrip_offsets_biluo_conversion(en_tokenizer):`
			`text = "I flew to Silicon Valley via London."`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]`
			`offsets = [(10, 24, "LOC"), (29, 35, "GPE")]`
Add offsets_from_biluo_tags helper and tests (see #1626) 2017-11-26 18:38:01 +03:00			`doc = en_tokenizer(text)`
			`biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)`
			`assert biluo_tags_converted == biluo_tags`
			`offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)`
			`assert offsets_converted == offsets`
Add gold.spans_from_biluo_tags helper (#3227) 2019-02-06 13:50:26 +03:00

			`def test_biluo_spans(en_tokenizer):`
			`doc = en_tokenizer("I flew to Silicon Valley via London.")`
			`biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]`
			`spans = spans_from_biluo_tags(doc, biluo_tags)`
			`assert len(spans) == 2`
			`assert spans[0].text == "Silicon Valley"`
			`assert spans[0].label_ == "LOC"`
			`assert spans[1].text == "London"`
			`assert spans[1].label_ == "GPE"`
💫 Improve handling of missing NER tags (closes #2603) (#3341) * Improve handling of missing NER tags GoldParse can accept missing NER tags, if entities is provided in BILUO format (rather than as spans). Missing tags can be provided as None values. Fix bug that occurred when first tag was a None value. Closes #2603. * Document specification of missing NER tags. 2019-02-27 14:06:32 +03:00
Auto-format [ci skip] 2019-02-27 16:24:55 +03:00
💫 Improve handling of missing NER tags (closes #2603) (#3341) * Improve handling of missing NER tags GoldParse can accept missing NER tags, if entities is provided in BILUO format (rather than as spans). Missing tags can be provided as None values. Fix bug that occurred when first tag was a None value. Closes #2603. * Document specification of missing NER tags. 2019-02-27 14:06:32 +03:00			`def test_gold_ner_missing_tags(en_tokenizer):`
			`doc = en_tokenizer("I flew to Silicon Valley via London.")`
			`biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]`
Auto-format [ci skip] 2019-02-27 16:24:55 +03:00			`gold = GoldParse(doc, entities=biluo_tags) # noqa: F841`