spaCy/spacy/tests/util.py

# coding: utf-8
from __future__ import unicode_literals

import numpy
import tempfile
import shutil
import contextlib
import srsly
from pathlib import Path

from spacy import Errors
from spacy.tokens import Doc, Span
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
from spacy.compat import path2str


@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Context manager yielding an open temporary file.

    mode: File mode passed through to tempfile.TemporaryFile.
    Yields the open file object; it is closed when the block exits, whether
    the block finished normally or raised.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Close even if the with-body raised, so the handle is never leaked.
        f.close()


@contextlib.contextmanager
def make_tempdir():
    """Context manager yielding a fresh temporary directory as a Path.

    The directory is created with tempfile.mkdtemp and removed recursively
    when the block exits, whether the block finished normally or raised.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Remove the directory even if the with-body raised, so failing tests
        # do not leave temp directories behind.
        shutil.rmtree(path2str(d))


def get_doc(
    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
):
    """Build a Doc from a vocab and a list of words, optionally setting
    token-level annotations and entity spans.

    pos, heads, deps, lemmas and tags must each be None or hold one entry per
    word, otherwise a ValueError is raised. When deps are given without heads,
    every head value defaults to 0. ents is an iterable of
    (start, end, label) triples turned into Span objects.
    """
    if deps and not heads:
        heads = [0] * len(deps)
    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]

    # Validate every provided annotation first and collect the string values;
    # nothing is added to the vocab until all lengths have been checked.
    headings = []
    string_values = []
    for heading, annot in zip(possible_headings, annotations):
        if annot is None:
            continue
        if len(annot) != len(words):
            raise ValueError(Errors.E189)
        headings.append(heading)
        # heads are written to the array as-is, not as strings
        if annot is not heads:
            string_values.extend(annot)
    for string_value in string_values:
        vocab.strings.add(string_value)

    doc = Doc(vocab, words=words)

    # write the provided annotations into the attribute array, one column each
    if headings:
        attrs = doc.to_array(headings)
        column = 0
        for annot in annotations:
            if not annot:
                continue
            is_heads = annot is heads
            for row in range(len(words)):
                if is_heads:
                    cell = heads[row]
                else:
                    cell = doc.vocab.strings[annot[row]]
                # the array is indexed by row only when it is 1-dimensional
                if attrs.ndim == 1:
                    attrs[row] = cell
                else:
                    attrs[row, column] = cell
            column += 1
        doc.from_array(headings, attrs)

    # entities go last, once the token annotations are in place
    if ents:
        spans = []
        for start, end, label in ents:
            spans.append(Span(doc, start, end, label=doc.vocab.strings[label]))
        doc.ents = spans
    return doc


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state.

    Every action name containing a hyphen is treated as "MOVE-LABEL" and its
    label is added to the parser before stepping through the doc. Only the
    first hyphen separates move from label, so labels that contain hyphens
    themselves are kept intact. Afterwards each entry of sequence is applied
    in order via parser.step_through(doc).
    """
    for action_name in sequence:
        if "-" in action_name:
            # Split once only: the label part may itself contain "-".
            label = action_name.split("-", 1)[1]
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)


def add_vecs_to_vocab(vocab, vectors):
    """Register a list of (text, vector) tuples on the given vocab and return
    the vocab. The vector width is taken from the first entry, so all vectors
    are expected to share that length. Format: [("text", [1, 2, 3])]"""
    first_entry = vectors[0]
    width = len(first_entry[1])
    # reset the vocab's vectors to the width of the incoming data
    vocab.reset_vectors(width=width)
    for text, values in vectors:
        vocab.set_vector(text, vector=values)
    return vocab


def get_cosine(vec1, vec2):
    """Return the cosine of the angle between two vectors: their dot product
    divided by the product of their norms."""
    dot_product = numpy.dot(vec1, vec2)
    norm_product = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    return dot_product / norm_product


def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations. Compared per
    token: orth, pos, tag, head index, dep, is_sent_start, ent_type and
    ent_iob. Compared per paired entity: start, end, label and kb_id."""

    def head_indices(doc):
        return [token.head.i for token in doc]

    for attr in ("orth", "pos", "tag"):
        assert [getattr(t, attr) for t in doc1] == [getattr(t, attr) for t in doc2]

    assert head_indices(doc1) == head_indices(doc2)

    for attr in ("dep", "is_sent_start", "ent_type", "ent_iob"):
        assert [getattr(t, attr) for t in doc1] == [getattr(t, attr) for t in doc2]

    # entities are compared property by property rather than as Span objects
    for left, right in zip(doc1.ents, doc2.ents):
        for attr in ("start", "end", "label", "kb_id"):
            assert getattr(left, attr) == getattr(right, attr)


def assert_packed_msg_equal(b1, b2):
    """Unpack two msgpack byte strings and assert that they hold the same
    keys and that the values under each key are equal."""
    unpacked1 = srsly.msgpack_loads(b1)
    unpacked2 = srsly.msgpack_loads(b2)
    assert sorted(unpacked1.keys()) == sorted(unpacked2.keys())
    # walk both messages in key order and compare them pair by pair
    items1 = sorted(unpacked1.items())
    items2 = sorted(unpacked2.items())
    for item1, item2 in zip(items1, items2):
        assert item1[0] == item2[0]
        assert item1[1] == item2[1]
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Add get_cosine util function 2017-01-12 18:49:57 +03:00			`import numpy`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`import tempfile`
			`import shutil`
			`import contextlib`
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003) Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. srsly currently includes forks of the following packages: ujson msgpack msgpack-numpy cloudpickle * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning 2018-12-03 03:28:22 +03:00			`import srsly`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`from pathlib import Path`
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00
			`from spacy import Errors`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.tokens import Doc, Span`
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00			`from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.compat import path2str`
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 23:11:31 +03:00

Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`@contextlib.contextmanager`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`def make_tempfile(mode="r"):`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`f = tempfile.TemporaryFile(mode=mode)`
			`yield f`
			`f.close()`


			`@contextlib.contextmanager`
			`def make_tempdir():`
			`d = Path(tempfile.mkdtemp())`
			`yield d`
			`shutil.rmtree(path2str(d))`


Tidy up and auto-format 2020-03-25 14:28:12 +03:00			`def get_doc(`
			`vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None`
			`):`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`"""Create Doc object from given vocab, words and annotations."""`
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00			`if deps and not heads:`
			`heads = [0] * len(deps)`
			`headings = []`
			`values = []`
			`annotations = [pos, heads, deps, lemmas, tags]`
			`possible_headings = [POS, HEAD, DEP, LEMMA, TAG]`
			`for a, annot in enumerate(annotations):`
			`if annot is not None:`
			`if len(annot) != len(words):`
			`raise ValueError(Errors.E189)`
			`headings.append(possible_headings[a])`
			`if annot is not heads:`
			`values.extend(annot)`
			`for value in values:`
Finish stringstore change. Also xfail vectors tests 2017-05-28 16:10:22 +03:00			`vocab.strings.add(value)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
			`doc = Doc(vocab, words=words)`
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00
			`# if there are any other annotations, set them`
			`if headings:`
			`attrs = doc.to_array(headings)`

			`j = 0`
			`for annot in annotations:`
			`if annot:`
			`if annot is heads:`
			`for i in range(len(words)):`
			`if attrs.ndim == 1:`
			`attrs[i] = heads[i]`
			`else:`
Tidy up and auto-format 2020-03-25 14:28:12 +03:00			`attrs[i, j] = heads[i]`
Bugfix/get doc (#5049) * new (broken) unit test * fixing get_doc method 2020-03-02 13:49:28 +03:00			`else:`
			`for i in range(len(words)):`
			`if attrs.ndim == 1:`
			`attrs[i] = doc.vocab.strings[annot[i]]`
			`else:`
			`attrs[i, j] = doc.vocab.strings[annot[i]]`
			`j += 1`
			`doc.from_array(headings, attrs)`

			`# finally, set the entities`
Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`if ents:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`doc.ents = [`
			`Span(doc, start, end, label=doc.vocab.strings[label])`
			`for start, end, label in ents`
			`]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`return doc`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00

			`def apply_transition_sequence(parser, doc, sequence):`
			`"""Perform a series of pre-specified transitions, to put the parser in a`
			`desired state."""`
			`for action_name in sequence:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`if "-" in action_name:`
			`move, label = action_name.split("-")`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00			`parser.add_label(label)`
			`with parser.step_through(doc) as stepwise:`
			`for transition in sequence:`
			`stepwise.transition(transition)`
Add get_cosine util function 2017-01-12 18:49:57 +03:00

Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`def add_vecs_to_vocab(vocab, vectors):`
			`"""Add list of vector tuples to given vocab. All vectors need to have the`
			`same length. Format: [("text", [1, 2, 3])]"""`
			`length = len(vectors[0][1])`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.reset_vectors(width=length)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`for word, vec in vectors:`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.set_vector(word, vector=vec)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`return vocab`


Add get_cosine util function 2017-01-12 18:49:57 +03:00			`def get_cosine(vec1, vec2):`
			`"""Get cosine for two given vectors"""`
			`return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00

			`def assert_docs_equal(doc1, doc2):`
Reformat add_docs_equal and add docstring 2017-01-13 16:25:53 +03:00			`"""Compare two Doc objects and assert that they're equal. Tests for tokens,`
			`tags, dependencies and entities."""`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.orth for t in doc1] == [t.orth for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.pos for t in doc1] == [t.pos for t in doc2]`
			`assert [t.tag for t in doc1] == [t.tag for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.head.i for t in doc1] == [t.head.i for t in doc2]`
			`assert [t.dep for t in doc1] == [t.dep for t in doc2]`
Fix sents comparison in test util Due to changes to `Span` (#5005), spans from different documents are now never equal. Check `Token.is_sent_start` values instead. 2020-03-13 11:25:23 +03:00			`assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]`
			`assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]`
Sync Span __eq__ and __hash__ (#5005) * Sync Span __eq__ and __hash__ Use the same tuple for `__eq__` and `__hash__`, including all attributes except `vector` and `vector_norm`. * Update entity comparison in tests Update `assert_docs_equal()` test util to compare `Span` properties for ents rather than `Span` objects. 2020-02-16 19:20:36 +03:00			`for ent1, ent2 in zip(doc1.ents, doc2.ents):`
			`assert ent1.start == ent2.start`
			`assert ent1.end == ent2.end`
			`assert ent1.label == ent2.label`
			`assert ent1.kb_id == ent2.kb_id`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00

			`def assert_packed_msg_equal(b1, b2):`
			`"""Assert that two packed msgpack messages are equal."""`
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003) Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. srsly currently includes forks of the following packages: ujson msgpack msgpack-numpy cloudpickle * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning 2018-12-03 03:28:22 +03:00			`msg1 = srsly.msgpack_loads(b1)`
			`msg2 = srsly.msgpack_loads(b2)`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00			`assert sorted(msg1.keys()) == sorted(msg2.keys())`
			`for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):`
			`assert k1 == k2`
			`assert v1 == v2`