# spaCy/spacy/tests/test_gold.py

# coding: utf-8
from __future__ import unicode_literals

from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import docs_to_json
from spacy.tokens import Doc
from .util import get_doc


def test_gold_biluo_U(en_vocab):
    """An entity that covers exactly one token ('London') is tagged U-LOC."""
    words = ['I', 'flew', 'to', 'London', '.']
    trailing_space = [True, True, True, False, True]
    doc = Doc(en_vocab, orths_and_spaces=list(zip(words, trailing_space)))
    # Character offsets of the entity, measured from the start of the text.
    ent_start = len("I flew to ")
    ent_end = len("I flew to London")
    tags = biluo_tags_from_offsets(doc, [(ent_start, ent_end, 'LOC')])
    assert tags == ['O', 'O', 'O', 'U-LOC', 'O']


def test_gold_biluo_BL(en_vocab):
    """A two-token entity ('San Francisco') is tagged B-LOC then L-LOC."""
    token_specs = [
        ('I', True),
        ('flew', True),
        ('to', True),
        ('San', True),
        ('Francisco', False),
        ('.', True),
    ]
    doc = Doc(en_vocab, orths_and_spaces=token_specs)
    loc_span = (len("I flew to "), len("I flew to San Francisco"), 'LOC')
    expected = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O']
    assert biluo_tags_from_offsets(doc, [loc_span]) == expected


def test_gold_biluo_BIL(en_vocab):
    """A three-token entity is tagged B-LOC, I-LOC, L-LOC in order."""
    token_specs = [
        ('I', True),
        ('flew', True),
        ('to', True),
        ('San', True),
        ('Francisco', True),
        ('Valley', False),
        ('.', True),
    ]
    doc = Doc(en_vocab, orths_and_spaces=token_specs)
    # The entity starts right after the prefix and runs to the end of "Valley".
    prefix = "I flew to "
    up_to_entity_end = "I flew to San Francisco Valley"
    entities = [(len(prefix), len(up_to_entity_end), 'LOC')]
    result = biluo_tags_from_offsets(doc, entities)
    assert result == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']


def test_gold_biluo_misalign(en_vocab):
    """An entity whose end offset falls inside a token yields '-' tags.

    The final token here is 'Valley.' (period attached), while the entity
    ends after "Valley", so the span does not line up with token boundaries.
    """
    token_specs = [
        ('I', True),
        ('flew', True),
        ('to', True),
        ('San', True),
        ('Francisco', True),
        ('Valley.', False),
    ]
    doc = Doc(en_vocab, orths_and_spaces=token_specs)
    span_start = len("I flew to ")
    span_end = len("I flew to San Francisco Valley")
    tags = biluo_tags_from_offsets(doc, [(span_start, span_end, 'LOC')])
    outside = ['O', 'O', 'O']
    misaligned = ['-', '-', '-']
    assert tags == outside + misaligned


def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    """Offsets -> BILUO tags and BILUO tags -> offsets agree on one sentence."""
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    expected_offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
    expected_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']

    # Forward direction: character offsets to per-token tags.
    tags_from_offsets = biluo_tags_from_offsets(doc, expected_offsets)
    assert tags_from_offsets == expected_tags

    # Reverse direction: per-token tags back to character offsets.
    offsets_from_tags = offsets_from_biluo_tags(doc, expected_tags)
    assert offsets_from_tags == expected_offsets

def test_docs_to_json(en_vocab):
    """Check docs_to_json converts a list of Doc objects into the
    JSON-serializable format we use for training.

    Two docs go in; the test expects two paragraphs, each holding a single
    sentence with as many tokens as the corresponding doc has words.
    """
    two_word_doc = get_doc(
        en_vocab,
        words=['a', 'b'],
        pos=['VBP', 'NN'],
        heads=[0, -1],
        deps=['ROOT', 'dobj'],
        ents=[])
    three_word_doc = get_doc(
        en_vocab,
        words=['c', 'd', 'e'],
        pos=['VBP', 'NN', 'NN'],
        heads=[0, -1, -2],
        deps=['ROOT', 'dobj', 'dobj'],
        ents=[(1, 2, 'ORG')])

    json_doc = docs_to_json(0, [two_word_doc, three_word_doc])
    assert json_doc['id'] == 0

    paragraphs = json_doc['paragraphs']
    assert len(paragraphs) == 2

    first_sentences = paragraphs[0]['sentences']
    assert len(first_sentences) == 1
    second_sentences = paragraphs[1]['sentences']
    assert len(second_sentences) == 1

    assert len(first_sentences[0]['tokens']) == 2
    assert len(second_sentences[0]['tokens']) == 3
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`# coding: utf-8`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`from __future__ import unicode_literals`

💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags`
Add function to get train format from Doc objects Our JSON training format is annoying to work with, and we've wanted to retire it for some time. In the meantime, we can at least add some missing functions to make it easier to live with. This patch adds a function that generates the JSON format from a list of Doc objects, one per paragraph. This should be a convenient way to handle a lot of data conversions: whatever format you have the source information in, you can use it to setup a Doc object. This approach should offer better future-proofing as well. Hopefully, we can steadily rewrite code that is sensitive to the current data-format, so that it instead goes through this function. Then when we change the data format, we won't have such a problem. 2018-08-14 14:13:10 +03:00			`from spacy.gold import docs_to_json`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.tokens import Doc`
Add function to get train format from Doc objects Our JSON training format is annoying to work with, and we've wanted to retire it for some time. In the meantime, we can at least add some missing functions to make it easier to live with. This patch adds a function that generates the JSON format from a list of Doc objects, one per paragraph. This should be a convenient way to handle a lot of data conversions: whatever format you have the source information in, you can use it to setup a Doc object. This approach should offer better future-proofing as well. Hopefully, we can steadily rewrite code that is sensitive to the current data-format, so that it instead goes through this function. Then when we change the data format, we won't have such a problem. 2018-08-14 14:13:10 +03:00			`from .util import get_doc`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00

Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_U(en_vocab):`
			`orths_and_spaces = [('I', True), ('flew', True), ('to', True),`
			`('London', False), ('.', True)]`
			`doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`entities = [(len("I flew to "), len("I flew to London"), 'LOC')]`
			`tags = biluo_tags_from_offsets(doc, entities)`
			`assert tags == ['O', 'O', 'O', 'U-LOC', 'O']`


Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_BL(en_vocab):`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),`
			`('Francisco', False), ('.', True)]`
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')]`
			`tags = biluo_tags_from_offsets(doc, entities)`
			`assert tags == ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O']`


Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_BIL(en_vocab):`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),`
			`('Francisco', True), ('Valley', False), ('.', True)]`
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]`
			`tags = biluo_tags_from_offsets(doc, entities)`
			`assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']`


Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`def test_gold_biluo_misalign(en_vocab):`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),`
			`('Francisco', True), ('Valley.', False)]`
Modernise BILUO tests 2017-01-13 01:39:18 +03:00			`doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)`
Add tests for entity->biluo transformation 2016-10-15 22:50:43 +03:00			`entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]`
			`tags = biluo_tags_from_offsets(doc, entities)`
Update test for biluo tags 2016-10-16 12:42:45 +03:00			`assert tags == ['O', 'O', 'O', '-', '-', '-']`
Add offsets_from_biluo_tags helper and tests (see #1626) 2017-11-26 18:38:01 +03:00

			`def test_roundtrip_offsets_biluo_conversion(en_tokenizer):`
			`text = "I flew to Silicon Valley via London."`
			`biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']`
			`offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]`
			`doc = en_tokenizer(text)`
			`biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)`
			`assert biluo_tags_converted == biluo_tags`
			`offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)`
			`assert offsets_converted == offsets`
Add function to get train format from Doc objects. Our JSON training format is annoying to work with, and we've wanted to retire it for some time. In the meantime, we can at least add some missing functions to make it easier to live with. This patch adds a function that generates the JSON format from a list of Doc objects, one per paragraph. This should be a convenient way to handle a lot of data conversions: whatever format you have the source information in, you can use it to set up a Doc object. This approach should offer better future-proofing as well. Hopefully, we can steadily rewrite code that is sensitive to the current data format, so that it instead goes through this function. Then when we change the data format, we won't have such a problem. 2018-08-14 14:13:10 +03:00
			`def test_docs_to_json(en_vocab):`
			`'''Test we can convert a list of Doc objects into the JSON-serializable`
			`format we use for training.`
			`'''`
			`docs = [`
			`get_doc(`
			`en_vocab,`
			`words=['a', 'b'],`
			`pos=['VBP', 'NN'],`
			`heads=[0, -1],`
			`deps=['ROOT', 'dobj'],`
			`ents=[]),`
			`get_doc(`
			`en_vocab,`
			`words=['c', 'd', 'e'],`
			`pos=['VBP', 'NN', 'NN'],`
			`heads=[0, -1, -2],`
			`deps=['ROOT', 'dobj', 'dobj'],`
			`ents=[(1, 2, 'ORG')]),`
			`]`
			`json_doc = docs_to_json(0, docs)`
			`assert json_doc['id'] == 0`
			`assert len(json_doc['paragraphs']) == 2`
			`assert len(json_doc['paragraphs'][0]['sentences']) == 1`
			`assert len(json_doc['paragraphs'][1]['sentences']) == 1`
			`assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2`
			`assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3`