mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
75f3234404
## Description

Related issues: #2379 (should be fixed by separating model tests)

* **total execution time down from > 300 seconds to under 60 seconds** 🎉
* removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure
* changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version)
* merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyway)
* tidied up and rewrote existing tests wherever possible

### Todo

- [ ] move tests to `/tests` and adjust CI commands accordingly
- [x] move model test suite from internal repo to `spacy-models`
- [x] ~~investigate why `pipeline/test_textcat.py` is flaky~~
- [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted
- [ ] update documentation on how to run tests

### Types of change

enhancement, tests

## Checklist

<!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
107 lines
3.4 KiB
Python
107 lines
3.4 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import numpy
|
|
import tempfile
|
|
import shutil
|
|
import contextlib
|
|
import msgpack
|
|
from pathlib import Path
|
|
from spacy.tokens import Doc, Span
|
|
from spacy.attrs import POS, HEAD, DEP
|
|
from spacy.compat import path2str
|
|
|
|
|
|
@contextlib.contextmanager
def make_tempfile(mode='r'):
    """Yield an anonymous temporary file and close it when the block exits.

    mode (unicode): Mode string passed through to tempfile.TemporaryFile.
    YIELDS: The open temporary file object.

    The file is closed in a `finally` clause, so it is released even when
    the body of the `with` block raises (e.g. a failing assertion).
    """
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        f.close()
|
|
|
|
|
|
@contextlib.contextmanager
def make_tempdir():
    """Yield a fresh temporary directory and delete it when the block exits.

    YIELDS (Path): Path to the newly created directory.

    The directory tree is removed in a `finally` clause, so it is cleaned up
    even when the body of the `with` block raises (e.g. a failing assertion).
    """
    d = Path(tempfile.mkdtemp())
    try:
        yield d
    finally:
        shutil.rmtree(path2str(d))
|
|
|
|
|
|
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Build a Doc from a vocab, a list of words and optional annotations.

    vocab: Vocab used to create the Doc; all annotation strings are added to
        its string store.
    words (list): Token texts.
    pos (list): Coarse part-of-speech strings, one per word.
    heads (list): Values written as-is into the HEAD column, one per word.
        NOTE(review): presumably relative offsets as expected by
        Doc.from_array -- confirm against callers.
    deps (list): Dependency label strings, one per word.
    tags (list): Fine-grained tag strings, one per word.
    ents (list): (start, end, label) triples used to build entity Spans.
    RETURNS (Doc): The annotated document.

    Any annotation layer that is missing or empty is filled with a neutral
    default ('' for strings, 0 for heads) of the same length as `words`.
    """
    n_words = len(words)
    pos = pos or [''] * n_words
    tags = tags or [''] * n_words
    heads = heads or [0] * n_words
    deps = deps or [''] * n_words
    # Register every annotation string up front so the lookups below resolve.
    for label_text in (deps + tags + pos):
        vocab.strings.add(label_text)

    doc = Doc(vocab, words=words)
    strings = doc.vocab.strings
    # Export the current columns, overwrite them row by row, then load the
    # modified array back into the Doc.
    values = doc.to_array([POS, HEAD, DEP])
    for row, (pos_text, head_value, dep_text) in enumerate(zip(pos, heads, deps)):
        values[row, 0] = strings[pos_text]
        values[row, 1] = head_value
        values[row, 2] = strings[dep_text]
    doc.from_array([POS, HEAD, DEP], values)

    if ents:
        spans = []
        for start, end, label in ents:
            spans.append(Span(doc, start, end, label=strings[label]))
        doc.ents = spans

    # After the defaulting above, `tags` is only empty when `words` is empty.
    if tags:
        for index, token in enumerate(doc):
            token.tag_ = tags[index]
    return doc
|
|
|
|
|
|
def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state.

    parser: Object providing `add_label(label)` and a `step_through(doc)`
        context manager whose value has a `transition(name)` method.
    doc: Document handed to `parser.step_through`.
    sequence (list): Transition names. Names of the form 'MOVE-LABEL' have
        their label registered with the parser before stepping starts.
    """
    for action_name in sequence:
        if '-' in action_name:
            # Split on the first hyphen only, so that labels which themselves
            # contain a hyphen (e.g. 'R-obl-agent') are kept intact instead
            # of raising a ValueError on unpacking.
            _, label = action_name.split('-', 1)
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)
|
|
|
|
|
|
def add_vecs_to_vocab(vocab, vectors):
    """Reset the vocab's vector table and fill it from (text, vector) pairs.

    vocab: Object providing `reset_vectors(width=...)` and
        `set_vector(word, vector=...)`.
    vectors (list): Pairs in the format [("text", [1, 2, 3])]. All vectors
        need to have the same length; the width is taken from the first one.
    RETURNS: The same vocab object that was passed in.
    """
    width = len(vectors[0][1])
    vocab.reset_vectors(width=width)
    for text, values in vectors:
        vocab.set_vector(text, vector=values)
    return vocab
|
|
|
|
|
|
def get_cosine(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    vec1, vec2: Array-likes accepted by numpy.dot and numpy.linalg.norm.
    RETURNS: Dot product of the vectors divided by the product of their norms.
    """
    norm_product = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    return numpy.dot(vec1, vec2) / norm_product
|
|
|
|
|
|
def assert_docs_equal(doc1, doc2):
    """Assert that two Doc objects carry the same annotations.

    Checks, in this order: token orth, pos and tag values, head indices,
    dependency labels, sentences (only when both docs report `is_parsed`),
    entity types, entity IOB values and finally the entity spans.
    """
    def column(doc, attr):
        # Collect one token attribute across the whole document.
        return [getattr(token, attr) for token in doc]

    for attr in ('orth', 'pos', 'tag'):
        assert column(doc1, attr) == column(doc2, attr)

    assert [token.head.i for token in doc1] == [token.head.i for token in doc2]
    assert column(doc1, 'dep') == column(doc2, 'dep')
    if doc1.is_parsed and doc2.is_parsed:
        assert list(doc1.sents) == list(doc2.sents)

    for attr in ('ent_type', 'ent_iob'):
        assert column(doc1, attr) == column(doc2, attr)
    assert list(doc1.ents) == list(doc2.ents)
|
|
|
|
|
|
def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal.

    b1 (bytes): First packed message. Must unpack to a mapping, since its
        keys and items are compared.
    b2 (bytes): Second packed message, same requirement.
    """
    # `raw=False` makes msgpack decode raw values to unicode strings as
    # UTF-8. It replaces the `encoding='utf8'` keyword, which msgpack
    # deprecated and later removed from its API.
    msg1 = msgpack.loads(b1, raw=False)
    msg2 = msgpack.loads(b2, raw=False)
    assert sorted(msg1.keys()) == sorted(msg2.keys())
    # Walk both messages in key order so corresponding entries are compared.
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2
|