spaCy/spacy/tests/util.py

# coding: utf-8
from __future__ import unicode_literals

from ..tokens import Doc
from ..attrs import ORTH, POS, HEAD, DEP
from ..compat import path2str

import pytest
import numpy
import tempfile
import shutil
import contextlib
import msgpack
from pathlib import Path


# Cache of already-loaded models, keyed by the name given to load_test_model.
MODELS = {}


def load_test_model(model):
    """Return the loaded model for the given package name, loading it once.

    The package is imported through pytest.importorskip(), so the calling
    test is skipped when the package is not installed. The object returned
    by the package's load() is cached in MODELS and reused on later calls.

    model: name of the model package to import.
    RETURNS: the cached result of the package's load().
    """
    try:
        return MODELS[model]
    except KeyError:
        pass
    package = pytest.importorskip(model)
    MODELS[model] = package.load()
    return MODELS[model]


@contextlib.contextmanager
def make_tempfile(mode='r'):
    """Context manager yielding a temporary file that is closed on exit.

    mode (str): mode string forwarded to tempfile.TemporaryFile.
    YIELDS: the open temporary file object.
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Close the handle even when the with-body raises, so a failing test
        # does not leak an open file.
        f.close()

@contextlib.contextmanager
def make_tempdir():
    """Context manager yielding a fresh temporary directory, removed on exit.

    YIELDS (Path): path of the directory created with tempfile.mkdtemp().
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        # Remove the directory tree even when the with-body raises, so a
        # failing test does not leave temporary directories behind.
        shutil.rmtree(path2str(d))


def get_doc(vocab, words=None, pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations.

    vocab: vocab passed to Doc; every dep, tag and pos string is added to
        vocab.strings before the Doc is built.
    words (list): token texts passed to Doc. Defaults to an empty list.
    pos (list): per-token strings written to the POS column. Defaults to ''
        for every word.
    heads (list): per-token values written to the HEAD column. Defaults to 0
        for every word.
    deps (list): per-token strings written to the DEP column. Defaults to ''
        for every word.
    tags (list): per-token strings assigned to token.tag_. Defaults to '' for
        every word.
    ents (list): (ent_id, label, start, end) tuples; the label is looked up
        in the string store and the result is assigned to doc.ents.
    RETURNS: the annotated Doc.
    """
    # Use a None sentinel rather than a shared ``[]`` default, so the same
    # list object is never reused across calls or handed to several Docs.
    words = [] if words is None else words
    pos = pos or [''] * len(words)
    tags = tags or [''] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [''] * len(words)
    # Add all annotation strings to the string store before they are looked
    # up below.
    for value in (deps+tags+pos):
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    # Export the three columns, overwrite them row by row, then load them
    # back into the doc in one call.
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        # NOTE(review): entity labels are looked up here but, unlike
        # deps/tags/pos, never added to the string store - confirm intended.
        doc.ents = [(ent_id, doc.vocab.strings[label], start, end) for ent_id, label, start, end in ents]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state.

    parser: object providing add_label() and step_through().
    doc: document handed to parser.step_through().
    sequence (list): action names, each either a bare move name or a
        'MOVE-label' string.
    """
    # Register every label with the parser before stepping starts.
    for action_name in sequence:
        if '-' in action_name:
            # Split on the first hyphen only: the move name comes first and
            # everything after it is the label, so a label that itself
            # contains a hyphen no longer breaks the unpacking.
            label = action_name.split('-', 1)[1]
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)


def add_vecs_to_vocab(vocab, vectors):
    """Store a list of (text, vector) tuples in the given vocab and return it.

    The vocab's vectors are reset to the width of the first vector before
    any entry is set, so all vectors need to have the same length.
    Format: [("text", [1, 2, 3])]
    """
    first_vector = vectors[0][1]
    vocab.reset_vectors(width=len(first_vector))
    for entry in vectors:
        text, values = entry
        vocab.set_vector(text, vector=values)
    return vocab


def get_cosine(vec1, vec2):
    """Return the cosine of the two given vectors: their dot product divided
    by the product of their norms."""
    dot_product = numpy.dot(vec1, vec2)
    norm_product = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    return dot_product / norm_product


def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations.

    Checks, in order: token orth, pos and tag values, head indices and dep
    values, sentences (only when both docs report is_parsed), and finally
    entity types, entity IOB values and the entities themselves.
    """
    for name in ('orth', 'pos', 'tag'):
        assert [getattr(t, name) for t in doc1] == [getattr(t, name) for t in doc2]

    head_indices1 = [t.head.i for t in doc1]
    head_indices2 = [t.head.i for t in doc2]
    assert head_indices1 == head_indices2
    assert [t.dep for t in doc1] == [t.dep for t in doc2]
    # Sentences are only compared when both docs are parsed.
    both_parsed = doc1.is_parsed and doc2.is_parsed
    if both_parsed:
        assert list(doc1.sents) == list(doc2.sents)

    for name in ('ent_type', 'ent_iob'):
        assert [getattr(t, name) for t in doc1] == [getattr(t, name) for t in doc2]
    assert list(doc1.ents) == list(doc2.ents)


def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal.

    Both byte strings are unpacked with msgpack.loads(); the unpacked
    messages must have the same keys and, once their items are sorted, the
    same key and value at every position.
    """
    msg1, msg2 = [msgpack.loads(packed, encoding='utf8') for packed in (b1, b2)]
    assert sorted(msg1.keys()) == sorted(msg2.keys())
    items1 = sorted(msg1.items())
    items2 = sorted(msg2.items())
    for item1, item2 in zip(items1, items2):
        assert item1[0] == item2[0]
        assert item1[1] == item2[1]
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`from ..tokens import Doc`
			`from ..attrs import ORTH, POS, HEAD, DEP`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`from ..compat import path2str`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 23:11:31 +03:00			`import pytest`
Add get_cosine util function 2017-01-12 18:49:57 +03:00			`import numpy`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`import tempfile`
			`import shutil`
			`import contextlib`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00			`import msgpack`
Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`from pathlib import Path`
Add get_cosine util function 2017-01-12 18:49:57 +03:00
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 23:11:31 +03:00			`MODELS = {}`


			`def load_test_model(model):`
Update tests README with info on model tests 2017-05-31 13:22:58 +03:00			`"""Load a model if it's installed as a package, otherwise skip."""`
Add load_test_model function with importorskip() Loads model only if it can be imported, i.e. if it's installed as a package. 2017-05-29 23:11:31 +03:00			`if model not in MODELS:`
			`module = pytest.importorskip(model)`
			`MODELS[model] = module.load()`
			`return MODELS[model]`


Add test utils for temp file and temp dir 2017-06-02 11:56:09 +03:00			`@contextlib.contextmanager`
			`def make_tempfile(mode='r'):`
			`f = tempfile.TemporaryFile(mode=mode)`
			`yield f`
			`f.close()`


			`@contextlib.contextmanager`
			`def make_tempdir():`
			`d = Path(tempfile.mkdtemp())`
			`yield d`
			`shutil.rmtree(path2str(d))`


Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`"""Create Doc object from given vocab, words and annotations."""`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`pos = pos or [''] * len(words)`
Finish stringstore change. Also xfail vectors tests 2017-05-28 16:10:22 +03:00			`tags = tags or [''] * len(words)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`heads = heads or [0] * len(words)`
			`deps = deps or [''] * len(words)`
Finish stringstore change. Also xfail vectors tests 2017-05-28 16:10:22 +03:00			`for value in (deps+tags+pos):`
			`vocab.strings.add(value)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
			`doc = Doc(vocab, words=words)`
			`attrs = doc.to_array([POS, HEAD, DEP])`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):`
			`attrs[i, 0] = doc.vocab.strings[p]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`attrs[i, 1] = head`
			`attrs[i, 2] = doc.vocab.strings[dep]`
			`doc.from_array([POS, HEAD, DEP], attrs)`
Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`if ents:`
			`doc.ents = [(ent_id, doc.vocab.strings[label], start, end) for ent_id, label, start, end in ents]`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`if tags:`
			`for token in doc:`
			`token.tag_ = tags[token.i]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`return doc`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00

			`def apply_transition_sequence(parser, doc, sequence):`
			`"""Perform a series of pre-specified transitions, to put the parser in a`
			`desired state."""`
			`for action_name in sequence:`
			`if '-' in action_name:`
			`move, label = action_name.split('-')`
			`parser.add_label(label)`
			`with parser.step_through(doc) as stepwise:`
			`for transition in sequence:`
			`stepwise.transition(transition)`
Add get_cosine util function 2017-01-12 18:49:57 +03:00

Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`def add_vecs_to_vocab(vocab, vectors):`
			`"""Add list of vector tuples to given vocab. All vectors need to have the`
			`same length. Format: [("text", [1, 2, 3])]"""`
			`length = len(vectors[0][1])`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.reset_vectors(width=length)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`for word, vec in vectors:`
Revise and simplify Vectors class 2017-10-31 20:25:08 +03:00			`vocab.set_vector(word, vector=vec)`
Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`return vocab`


Add get_cosine util function 2017-01-12 18:49:57 +03:00			`def get_cosine(vec1, vec2):`
			`"""Get cosine for two given vectors"""`
			`return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00

			`def assert_docs_equal(doc1, doc2):`
Reformat add_docs_equal and add docstring 2017-01-13 16:25:53 +03:00			`"""Compare two Doc objects and assert that they're equal. Tests for tokens,`
			`tags, dependencies and entities."""`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00			`assert [ t.orth for t in doc1 ] == [ t.orth for t in doc2 ]`

			`assert [ t.pos for t in doc1 ] == [ t.pos for t in doc2 ]`
			`assert [ t.tag for t in doc1 ] == [ t.tag for t in doc2 ]`

			`assert [ t.head.i for t in doc1 ] == [ t.head.i for t in doc2 ]`
			`assert [ t.dep for t in doc1 ] == [ t.dep for t in doc2 ]`
			`if doc1.is_parsed and doc2.is_parsed:`
			`assert [ s for s in doc1.sents ] == [ s for s in doc2.sents ]`

			`assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]`
			`assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]`
			`assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]`
Add assert_packed_msg_equal util function 2017-06-03 18:04:30 +03:00

			`def assert_packed_msg_equal(b1, b2):`
			`"""Assert that two packed msgpack messages are equal."""`
			`msg1 = msgpack.loads(b1, encoding='utf8')`
			`msg2 = msgpack.loads(b2, encoding='utf8')`
			`assert sorted(msg1.keys()) == sorted(msg2.keys())`
			`for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):`
			`assert k1 == k2`
			`assert v1 == v2`