spaCy/spacy/tests/util.py

# coding: utf-8
from __future__ import unicode_literals

import numpy
import tempfile
import shutil
import contextlib
import msgpack
from pathlib import Path
from spacy.tokens import Doc, Span
from spacy.attrs import POS, HEAD, DEP
from spacy.compat import path2str


@contextlib.contextmanager
def make_tempfile(mode="r"):
    """Yield an open temporary file and close it when the block exits.

    The file is closed in a `finally` clause, so the handle is released even
    if the body of the `with` statement raises (e.g. a failing assertion).

    mode (unicode): Mode passed to `tempfile.TemporaryFile`. Defaults to "r".
    YIELDS (file): The open temporary file object.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        f.close()


@contextlib.contextmanager
def make_tempdir():
    """Yield a fresh temporary directory and delete it when the block exits.

    The directory tree is removed in a `finally` clause, so it is cleaned up
    even if the body of the `with` statement raises.

    YIELDS (Path): Path to the newly created temporary directory.
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(path2str(d))


def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Build a Doc from a vocab and words, then attach optional annotations.

    vocab (Vocab): Vocab the Doc is created with. Every dep, tag and pos
        string is added to its string store before the Doc is built.
    words (list): Token texts passed on to `Doc`.
    pos (list): One part-of-speech string per word. Defaults to "" for each.
    heads (list): One value per word, written unchanged into the HEAD column.
        Defaults to 0 for each.
    deps (list): One dependency label string per word. Defaults to "" for each.
    tags (list): One tag string per word, assigned via `token.tag_`.
        Defaults to "" for each.
    ents (list): Optional `(start, end, label)` tuples, each turned into a
        Span and assigned to `doc.ents`.
    RETURNS (Doc): The annotated Doc.
    """
    n_words = len(words)
    deps = deps if deps else [""] * n_words
    heads = heads if heads else [0] * n_words
    pos = pos if pos else [""] * n_words
    tags = tags if tags else [""] * n_words

    # Register all label strings up front so they can be looked up below.
    strings = vocab.strings
    for label in deps + tags + pos:
        strings.add(label)

    doc = Doc(vocab, words=words)
    columns = [POS, HEAD, DEP]
    values = doc.to_array(columns)
    for row, annotation in enumerate(zip(pos, heads, deps)):
        pos_label, head, dep_label = annotation
        values[row, 0] = doc.vocab.strings[pos_label]
        values[row, 1] = head
        values[row, 2] = doc.vocab.strings[dep_label]
    doc.from_array(columns, values)

    if ents:
        spans = []
        for start, end, label in ents:
            spans.append(Span(doc, start, end, label=doc.vocab.strings[label]))
        doc.ents = spans

    if tags:
        for idx, token in enumerate(doc):
            token.tag_ = tags[idx]
    return doc


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state.

    Every action name containing a hyphen is treated as "MOVE-LABEL" and its
    LABEL part is registered with the parser before any transition is applied.

    parser: Object providing `add_label(label)` and `step_through(doc)`.
    doc: Passed through to `parser.step_through`.
    sequence (list): Transition names, applied in order.
    """
    for action_name in sequence:
        if "-" in action_name:
            # Split on the first hyphen only: everything after it is the
            # label, which may itself contain hyphens.
            label = action_name.split("-", 1)[1]
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)


def add_vecs_to_vocab(vocab, vectors):
    """Reset the vocab's vectors and store one vector per given word.

    The vector width is taken from the first entry, so all vectors need to
    have the same length. Format: [("text", [1, 2, 3])]

    vocab: Object providing `reset_vectors(width=...)` and
        `set_vector(word, vector=...)`.
    vectors (list): `(word, vector)` tuples.
    RETURNS: The same vocab object that was passed in.
    """
    width = len(vectors[0][1])
    vocab.reset_vectors(width=width)
    for entry in vectors:
        word, vec = entry
        vocab.set_vector(word, vector=vec)
    return vocab


def get_cosine(vec1, vec2):
    """Return the cosine between two vectors: their dot product divided by
    the product of their norms."""
    norm_product = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    similarity = numpy.dot(vec1, vec2) / norm_product
    return similarity


def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations.

    Checks, in order: token orth, pos and tag values, head indices and
    dependency labels, sentences (only if both docs are parsed), and finally
    entity types, IOB values and entity spans.
    """

    def same(attr):
        # Compare one token attribute across both docs, token by token.
        return [getattr(tok, attr) for tok in doc1] == [
            getattr(tok, attr) for tok in doc2
        ]

    # One assert per line, so a traceback points at the failing attribute.
    assert same("orth")

    assert same("pos")
    assert same("tag")

    assert [tok.head.i for tok in doc1] == [tok.head.i for tok in doc2]
    assert same("dep")
    if doc1.is_parsed and doc2.is_parsed:
        assert list(doc1.sents) == list(doc2.sents)

    assert same("ent_type")
    assert same("ent_iob")
    assert list(doc1.ents) == list(doc2.ents)


def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal.

    Both messages are unpacked and compared key by key, so they must unpack
    to mappings (the code calls `.keys()` on the results).

    b1: First packed message, as accepted by `msgpack.loads`.
    b2: Second packed message, as accepted by `msgpack.loads`.
    """
    # `raw=False` decodes msgpack raw values to unicode strings. It replaces
    # the `encoding` keyword, which msgpack deprecated and later removed.
    msg1 = msgpack.loads(b1, raw=False)
    msg2 = msgpack.loads(b2, raw=False)
    assert sorted(msg1.keys()) == sorted(msg2.keys())
    # The key sets are known to match here, so compare values per key.
    for key in sorted(msg1.keys()):
        assert msg1[key] == msg2[key]
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Add get_cosine util function 2017-01-12 18:49:57 +03:00			`import numpy`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`import tempfile`
			`import shutil`
			`import contextlib`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00			`import msgpack`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`from pathlib import Path`
💫 Refactor test suite (#2568) ## Description Related issues: #2379 (should be fixed by separating model tests) * total execution time down from > 300 seconds to under 60 seconds 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-07-25 00:38:44 +03:00			`from spacy.tokens import Doc, Span`
			`from spacy.attrs import POS, HEAD, DEP`
			`from spacy.compat import path2str`
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 23:11:31 +03:00

Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`@contextlib.contextmanager`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`def make_tempfile(mode="r"):`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`f = tempfile.TemporaryFile(mode=mode)`
			`yield f`
			`f.close()`


			`@contextlib.contextmanager`
			`def make_tempdir():`
			`d = Path(tempfile.mkdtemp())`
			`yield d`
			`shutil.rmtree(path2str(d))`


Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`"""Create Doc object from given vocab, words and annotations."""`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`pos = pos or [""] * len(words)`
			`tags = tags or [""] * len(words)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`heads = heads or [0] * len(words)`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`deps = deps or [""] * len(words)`
			`for value in deps + tags + pos:`
Finish stringstore change. Also xfail vectors tests 2017-05-28 16:10:22 +03:00			`vocab.strings.add(value)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
			`doc = Doc(vocab, words=words)`
			`attrs = doc.to_array([POS, HEAD, DEP])`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):`
			`attrs[i, 0] = doc.vocab.strings[p]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`attrs[i, 1] = head`
			`attrs[i, 2] = doc.vocab.strings[dep]`
			`doc.from_array([POS, HEAD, DEP], attrs)`
Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`if ents:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`doc.ents = [`
			`Span(doc, start, end, label=doc.vocab.strings[label])`
			`for start, end, label in ents`
			`]`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`if tags:`
			`for token in doc:`
			`token.tag_ = tags[token.i]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`return doc`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00

			`def apply_transition_sequence(parser, doc, sequence):`
			`"""Perform a series of pre-specified transitions, to put the parser in a`
			`desired state."""`
			`for action_name in sequence:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`if "-" in action_name:`
			`move, label = action_name.split("-")`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00			`parser.add_label(label)`
			`with parser.step_through(doc) as stepwise:`
			`for transition in sequence:`
			`stepwise.transition(transition)`
Add get_cosine util function 2017-01-12 18:49:57 +03:00

Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`def add_vecs_to_vocab(vocab, vectors):`
			`"""Add list of vector tuples to given vocab. All vectors need to have the`
			`same length. Format: [("text", [1, 2, 3])]"""`
			`length = len(vectors[0][1])`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.reset_vectors(width=length)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`for word, vec in vectors:`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.set_vector(word, vector=vec)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`return vocab`


Add get_cosine util function 2017-01-12 18:49:57 +03:00			`def get_cosine(vec1, vec2):`
			`"""Get cosine for two given vectors"""`
			`return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00

			`def assert_docs_equal(doc1, doc2):`
Reformat add_docs_equal and add docstring 2017-01-13 16:25:53 +03:00			`"""Compare two Doc objects and assert that they're equal. Tests for tokens,`
			`tags, dependencies and entities."""`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.orth for t in doc1] == [t.orth for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.pos for t in doc1] == [t.pos for t in doc2]`
			`assert [t.tag for t in doc1] == [t.tag for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.head.i for t in doc1] == [t.head.i for t in doc2]`
			`assert [t.dep for t in doc1] == [t.dep for t in doc2]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00			`if doc1.is_parsed and doc2.is_parsed:`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [s for s in doc1.sents] == [s for s in doc2.sents]`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]`
			`assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]`
			`assert [ent for ent in doc1.ents] == [ent for ent in doc2.ents]`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00

			`def assert_packed_msg_equal(b1, b2):`
			`"""Assert that two packed msgpack messages are equal."""`
💫 Tidy up and auto-format tests (#2967) * Auto-format tests with black * Add flake8 config * Tidy up and remove unused imports * Fix redefinitions of test functions * Replace orths_and_spaces with words and spaces * Fix compatibility with pytest 4.0 * xfail test for now Test was previously overwritten by following test due to naming conflict, so failure wasn't reported * Unfail passing test * Only use fixture via arguments Fixes pytest 4.0 compatibility 2018-11-27 03:09:36 +03:00			`msg1 = msgpack.loads(b1, encoding="utf8")`
			`msg2 = msgpack.loads(b2, encoding="utf8")`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00			`assert sorted(msg1.keys()) == sorted(msg2.keys())`
			`for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):`
			`assert k1 == k2`
			`assert v1 == v2`