spaCy/spacy/tests/util.py
Sofie Van Landeghem 311133e579
Train textcat with config (#5143)
* bring back default build_text_classifier method

* remove _set_dims_ hack in favor of proper dim inference

* add tok2vec initialize to unit test

* small fixes

* add unit test for various textcat config settings

* logistic output layer does not have nO

* fix window_size setting

* proper fix

* fix W initialization

* Update textcat training example

* Use ml_datasets
* Convert training data to `Example` format
* Use `n_texts` to set proportionate dev size

* fix _init renaming on latest thinc

* avoid setting a non-existing dim

* update to thinc==8.0.0a2

* add BOW and CNN defaults for easy testing

* various experiments with train_textcat script, fix softmax activation in textcat bow

* allow textcat train script to work on other datasets as well

* have dataset as a parameter

* train textcat from config, with example config

* add config for training textcat

* formatting

* fix exclusive_classes

* fixing BOW for GPU

* bump thinc to 8.0.0a3 (not published yet so CI will fail)

* add in link_vectors_to_models which got deleted

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2020-03-29 19:40:36 +02:00

151 lines · 4.6 KiB · Python

import numpy
import tempfile
import shutil
import contextlib
import srsly
from pathlib import Path
from spacy import Errors
from spacy.tokens import Doc, Span
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA
from spacy.vocab import Vocab


@contextlib.contextmanager
def make_tempfile(mode="r"):
    f = tempfile.TemporaryFile(mode=mode)
    yield f
    f.close()


@contextlib.contextmanager
def make_tempdir():
    d = Path(tempfile.mkdtemp())
    yield d
    shutil.rmtree(str(d))
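
# Example usage (illustrative sketch, not part of the original file): both
# helpers are context managers, so the file/directory is cleaned up on exit.
#
#     with make_tempdir() as d:
#         (d / "output").mkdir()
#     with make_tempfile(mode="w+") as f:
#         f.write("temporary data")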


def get_doc(
    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
):
    """Create Doc object from given vocab, words and annotations."""
    if deps and not heads:
        heads = [0] * len(deps)
    headings = []
    values = []
    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
    for a, annot in enumerate(annotations):
        if annot is not None:
            if len(annot) != len(words):
                raise ValueError(Errors.E189)
            headings.append(possible_headings[a])
            if annot is not heads:
                values.extend(annot)
    for value in values:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)
        j = 0
        for annot in annotations:
            if annot:
                if annot is heads:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = heads[i]
                        else:
                            attrs[i, j] = heads[i]
                else:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = doc.vocab.strings[annot[i]]
                        else:
                            attrs[i, j] = doc.vocab.strings[annot[i]]
                j += 1
        doc.from_array(headings, attrs)
    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc
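
# Illustrative sketch of a typical call (the annotation values here are made
# up): each annotation list must match the number of words, heads are given
# as relative offsets, and entities are (start, end, label) token offsets.
#
#     doc = get_doc(
#         Vocab(),
#         words=["I", "like", "Berlin"],
#         pos=["PRON", "VERB", "PROPN"],
#         heads=[1, 0, -1],
#         ents=[(2, 3, "GPE")],
#     )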


def get_batch(batch_size):
    vocab = Vocab()
    docs = []
    start = 0
    for size in range(1, batch_size + 1):
        # Make the words numbers, so that they're distinct
        # across the batch, and easy to track.
        numbers = [str(i) for i in range(start, start + size)]
        docs.append(Doc(vocab, words=numbers))
        start += size
    return docs
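
# Illustrative sketch: get_batch(3) returns three Docs of increasing length
# whose tokens are the distinct strings "0" through "5".
#
#     docs = get_batch(3)
#     assert [len(d) for d in docs] == [1, 2, 3]
#     assert [t.text for t in docs[2]] == ["3", "4", "5"]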


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state."""
    for action_name in sequence:
        if "-" in action_name:
            move, label = action_name.split("-")
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)
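
# Illustrative sketch (the transition names below are assumptions; the
# available actions depend on the parser's transition system):
#
#     apply_transition_sequence(parser, doc, ["L-nsubj", "S", "R-dobj"])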


def add_vecs_to_vocab(vocab, vectors):
    """Add list of vector tuples to given vocab. All vectors need to have the
    same length. Format: [("text", [1, 2, 3])]"""
    length = len(vectors[0][1])
    vocab.reset_vectors(width=length)
    for word, vec in vectors:
        vocab.set_vector(word, vector=vec)
    return vocab
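
# Illustrative sketch (made-up vectors): every vector must share one length,
# which becomes the width of the vocab's vector table.
#
#     vocab = add_vecs_to_vocab(Vocab(), [("apple", [1, 2, 3]), ("pear", [4, 5, 6])])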


def get_cosine(vec1, vec2):
    """Get cosine for two given vectors"""
    return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))
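
# Illustrative sketch: parallel vectors have cosine similarity 1.0.
#
#     assert get_cosine(numpy.asarray([1, 0]), numpy.asarray([2, 0])) == 1.0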


def assert_docs_equal(doc1, doc2):
    """Compare two Doc objects and assert that they're equal. Tests for tokens,
    tags, dependencies and entities."""
    assert [t.orth for t in doc1] == [t.orth for t in doc2]
    assert [t.pos for t in doc1] == [t.pos for t in doc2]
    assert [t.tag for t in doc1] == [t.tag for t in doc2]
    assert [t.head.i for t in doc1] == [t.head.i for t in doc2]
    assert [t.dep for t in doc1] == [t.dep for t in doc2]
    assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2]
    assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]
    assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]
    for ent1, ent2 in zip(doc1.ents, doc2.ents):
        assert ent1.start == ent2.start
        assert ent1.end == ent2.end
        assert ent1.label == ent2.label
        assert ent1.kb_id == ent2.kb_id


def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal."""
    msg1 = srsly.msgpack_loads(b1)
    msg2 = srsly.msgpack_loads(b2)
    assert sorted(msg1.keys()) == sorted(msg2.keys())
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2