spaCy/spacy/tests/util.py

# coding: utf-8
from __future__ import unicode_literals

from ..tokens import Doc
from ..attrs import ORTH, POS, HEAD, DEP

import numpy


def get_doc(vocab, words=None, pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations.

    vocab: Vocab handed to Doc; its string store converts labels to IDs.
    words: Sequence of token texts. Defaults to an empty list.
    pos: Per-token values written to the POS column (default: '' for each).
    heads: Per-token values written to the HEAD column as given
        (default: 0 for each). NOTE(review): presumably offsets relative to
        the token, as Doc.from_array expects -- confirm against callers.
    deps: Per-token values written to the DEP column (default: '' for each).
    tags: Optional per-token values assigned to token.tag_.
    ents: Optional iterable of (ent_id, label, start, end) tuples; the label
        is converted through the vocab's string store before assignment.
    RETURNS: The annotated Doc.
    """
    # Build a fresh list per call rather than sharing one mutable default
    # object between every invocation of the helper.
    words = [] if words is None else words
    pos = pos or [''] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [''] * len(words)

    doc = Doc(vocab, words=words)
    strings = doc.vocab.strings
    attrs = doc.to_array([POS, HEAD, DEP])
    # zip() stops at the shortest sequence, so rows beyond it keep the
    # values exported by to_array().
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [(ent_id, strings[label], start, end)
                    for ent_id, label, start, end in ents]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state.

    parser: Object providing add_label(label) and step_through(doc).
    doc: The document passed to parser.step_through.
    sequence: Iterable of action names. A name containing '-' is read as
        '<move>-<label>'; other names carry no label.
    """
    # Register every label first, so all of them are known to the parser
    # before any transition is applied.
    for action_name in sequence:
        if '-' in action_name:
            # Split on the first hyphen only: the label itself may contain
            # hyphens, which would otherwise break the two-way unpacking.
            label = action_name.split('-', 1)[1]
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)


def add_vecs_to_vocab(vocab, vectors):
    """Attach the given word vectors to a vocab and return that vocab.

    vectors is a non-empty list of (text, vector) pairs, for example
    [("text", [1, 2, 3])]. The vocab's vector table is resized to the
    length of the first vector, so all vectors are expected to share it.
    """
    width = len(vectors[0][1])
    vocab.resize_vectors(width)
    for entry in vectors:
        text, values = entry
        lexeme = vocab[text]
        lexeme.vector = values
    return vocab


def get_cosine(vec1, vec2):
    """Return the cosine of the angle between two vectors: their dot
    product divided by the product of their norms."""
    dot_product = numpy.dot(vec1, vec2)
    norm_product = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    return dot_product / norm_product


def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations.

    Checks, in order: token orth, pos and tag values, head indices and
    dependency labels, sentences (only when both docs are parsed), and
    entity types, IOB codes and entity spans.
    """
    def same(extract):
        # Compare one per-token value across both docs, token by token.
        return list(map(extract, doc1)) == list(map(extract, doc2))

    assert same(lambda token: token.orth)

    assert same(lambda token: token.pos)
    assert same(lambda token: token.tag)

    assert same(lambda token: token.head.i)
    assert same(lambda token: token.dep)
    both_parsed = doc1.is_parsed and doc2.is_parsed
    if both_parsed:
        assert list(doc1.sents) == list(doc2.sents)

    assert same(lambda token: token.ent_type)
    assert same(lambda token: token.ent_iob)
    assert list(doc1.ents) == list(doc2.ents)
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`from ..tokens import Doc`
			`from ..attrs import ORTH, POS, HEAD, DEP`

Add get_cosine util function 2017-01-12 18:49:57 +03:00			`import numpy`

Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00
Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`"""Create Doc object from given vocab, words and annotations."""`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`pos = pos or [''] * len(words)`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`heads = heads or [0] * len(words)`
			`deps = deps or [''] * len(words)`

			`doc = Doc(vocab, words=words)`
			`attrs = doc.to_array([POS, HEAD, DEP])`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):`
			`attrs[i, 0] = doc.vocab.strings[p]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`attrs[i, 1] = head`
			`attrs[i, 2] = doc.vocab.strings[dep]`
			`doc.from_array([POS, HEAD, DEP], attrs)`
Allow setting ents in get_doc 2017-01-12 14:25:10 +03:00			`if ents:`
			`doc.ents = [(ent_id, doc.vocab.strings[label], start, end) for ent_id, label, start, end in ents]`
Rename tags to pos in get_doc and allow adding tags to tokens 2017-01-12 13:18:36 +03:00			`if tags:`
			`for token in doc:`
			`token.tag_ = tags[token.i]`
Add test utils and get_doc helper function Create Doc object from given vocab, words and annotations to allow tests not to depend on loading the models. 2017-01-11 15:55:33 +03:00			`return doc`
Add apply_transition_sequence util function to utils 2017-01-11 23:30:14 +03:00

			`def apply_transition_sequence(parser, doc, sequence):`
			`"""Perform a series of pre-specified transitions, to put the parser in a`
			`desired state."""`
			`for action_name in sequence:`
			`if '-' in action_name:`
			`move, label = action_name.split('-')`
			`parser.add_label(label)`
			`with parser.step_through(doc) as stepwise:`
			`for transition in sequence:`
			`stepwise.transition(transition)`
Add get_cosine util function 2017-01-12 18:49:57 +03:00

Add util function to add vectors to vocab 2017-01-13 16:26:30 +03:00			`def add_vecs_to_vocab(vocab, vectors):`
			`"""Add list of vector tuples to given vocab. All vectors need to have the`
			`same length. Format: [("text", [1, 2, 3])]"""`
			`length = len(vectors[0][1])`
			`vocab.resize_vectors(length)`
			`for word, vec in vectors:`
			`vocab[word].vector = vec`
			`return vocab`


Add get_cosine util function 2017-01-12 18:49:57 +03:00			`def get_cosine(vec1, vec2):`
			`"""Get cosine for two given vectors"""`
			`return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00

			`def assert_docs_equal(doc1, doc2):`
Reformat add_docs_equal and add docstring 2017-01-13 16:25:53 +03:00			`"""Compare two Doc objects and assert that they're equal. Tests for tokens,`
			`tags, dependencies and entities."""`
Add assert_docs_equal util to compare two docs 2017-01-12 23:56:52 +03:00			`assert [ t.orth for t in doc1 ] == [ t.orth for t in doc2 ]`

			`assert [ t.pos for t in doc1 ] == [ t.pos for t in doc2 ]`
			`assert [ t.tag for t in doc1 ] == [ t.tag for t in doc2 ]`

			`assert [ t.head.i for t in doc1 ] == [ t.head.i for t in doc2 ]`
			`assert [ t.dep for t in doc1 ] == [ t.dep for t in doc2 ]`
			`if doc1.is_parsed and doc2.is_parsed:`
			`assert [ s for s in doc1.sents ] == [ s for s in doc2.sents ]`

			`assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]`
			`assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]`
			`assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]`