import numpy
import tempfile
import contextlib
import srsly
from spacy import Errors
from spacy.tokens import Doc, Span
from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH
from spacy.vocab import Vocab
from spacy.util import make_tempdir # noqa: F401


@contextlib.contextmanager
def make_tempfile(mode="r"):
    f = tempfile.TemporaryFile(mode=mode)
    try:
        yield f
    finally:
        # Close the file even if the with-block raises.
        f.close()
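
# Usage sketch (the mode and contents here are illustrative only):
#     with make_tempfile(mode="w+") as f:
#         f.write("some text")
#         f.seek(0)
#         assert f.read() == "some text"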


def get_doc(
    vocab,
    words=[],
    pos=None,
    heads=None,
    deps=None,
    tags=None,
    ents=None,
    lemmas=None,
    morphs=None,
):
    """Create Doc object from given vocab, words and annotations."""
    if deps and not heads:
        heads = [0] * len(deps)
    headings = []
    values = []
    annotations = [pos, heads, deps, lemmas, tags, morphs]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG, MORPH]
    # Collect the attribute IDs for the annotations that were provided, and
    # gather all string values so they can be interned in the StringStore.
    for a, annot in enumerate(annotations):
        if annot is not None:
            if len(annot) != len(words):
                raise ValueError(Errors.E189)
            headings.append(possible_headings[a])
            if annot is not heads:
                values.extend(annot)
    for value in values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)

    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)

        j = 0
        for annot in annotations:
            if annot:
                if annot is heads:
                    # Heads are integers and can be written to the array directly.
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = heads[i]
                        else:
                            attrs[i, j] = heads[i]
                elif annot is morphs:
                    # Morphological analyses are added to the vocab's morphology
                    # table, which returns their hash keys.
                    for i in range(len(words)):
                        morph_key = vocab.morphology.add(morphs[i])
                        if attrs.ndim == 1:
                            attrs[i] = morph_key
                        else:
                            attrs[i, j] = morph_key
                else:
                    # All other annotations are strings, stored by their hash.
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = doc.vocab.strings[annot[i]]
                        else:
                            attrs[i, j] = doc.vocab.strings[annot[i]]
                j += 1
        doc.from_array(headings, attrs)

    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc
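
# Usage sketch (the tag and entity values here are hypothetical; entities are
# (start, end, label) token-index triples):
#     doc = get_doc(
#         Vocab(),
#         words=["Berlin", "is", "nice"],
#         tags=["NNP", "VBZ", "JJ"],
#         ents=[(0, 1, "GPE")],
#     )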


def get_batch(batch_size):
    vocab = Vocab()
    docs = []
    start = 0
    for size in range(1, batch_size + 1):
        # Make the words numbers, so that they're distinct
        # across the batch, and easy to track.
        numbers = [str(i) for i in range(start, start + size)]
        docs.append(Doc(vocab, words=numbers))
        start += size
    return docs
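
# For example, get_batch(3) returns three docs of increasing length, with words
# ["0"], ["1", "2"] and ["3", "4", "5"].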


def get_random_doc(n_words):
    vocab = Vocab()
    # Make the words numbers, so that they're easy to track.
    numbers = [str(i) for i in range(0, n_words)]
    return Doc(vocab, words=numbers)


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state."""
    # Labelled transitions like "L-nsubj" need their label registered with the
    # parser before they can be applied.
    for action_name in sequence:
        if "-" in action_name:
            move, label = action_name.split("-")
            parser.add_label(label)
    with parser.step_through(doc) as stepwise:
        for transition in sequence:
            stepwise.transition(transition)
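
# Usage sketch (hypothetical arc-eager transition names, assuming a parser
# fixture that supports step_through):
#     apply_transition_sequence(parser, doc, ["L-nsubj", "S", "R-dobj", "D"])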


def add_vecs_to_vocab(vocab, vectors):
    """Add list of vector tuples to given vocab. All vectors need to have the
    same length. Format: [("text", [1, 2, 3])]"""
    length = len(vectors[0][1])
    vocab.reset_vectors(width=length)
    for word, vec in vectors:
        vocab.set_vector(word, vector=vec)
    return vocab
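
# For example, the following stores two 3-dimensional vectors (values are
# arbitrary placeholders):
#     vocab = add_vecs_to_vocab(
#         Vocab(), [("apple", [1, 2, 3]), ("orange", [-1, -2, -3])]
#     )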


def get_cosine(vec1, vec2):
    """Get the cosine similarity of two given vectors."""
    return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))
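
# For example, the cosine of [1, 0] and [1, 1] is 1 / sqrt(2):
#     get_cosine(numpy.asarray([1.0, 0.0]), numpy.asarray([1.0, 1.0]))  # ~0.7071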


def assert_docs_equal(doc1, doc2):
    """Compare two Doc objects and assert that they're equal. Tests for tokens,
    tags, dependencies and entities."""
    # token texts
    assert [t.orth for t in doc1] == [t.orth for t in doc2]
    # tags
    assert [t.pos for t in doc1] == [t.pos for t in doc2]
    assert [t.tag for t in doc1] == [t.tag for t in doc2]
    # dependencies and sentence boundaries
    assert [t.head.i for t in doc1] == [t.head.i for t in doc2]
    assert [t.dep for t in doc1] == [t.dep for t in doc2]
    assert [t.is_sent_start for t in doc1] == [t.is_sent_start for t in doc2]
    # entities
    assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]
    assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]
    for ent1, ent2 in zip(doc1.ents, doc2.ents):
        assert ent1.start == ent2.start
        assert ent1.end == ent2.end
        assert ent1.label == ent2.label
        assert ent1.kb_id == ent2.kb_id


def assert_packed_msg_equal(b1, b2):
    """Assert that two packed msgpack messages are equal."""
    msg1 = srsly.msgpack_loads(b1)
    msg2 = srsly.msgpack_loads(b2)
    assert sorted(msg1.keys()) == sorted(msg2.keys())
    for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
        assert k1 == k2
        assert v1 == v2
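
# Illustrative usage: two msgpack payloads that differ only in key order are
# different as bytes, but compare equal through this helper:
#     assert_packed_msg_equal(
#         srsly.msgpack_dumps({"a": 1, "b": 2}),
#         srsly.msgpack_dumps({"b": 2, "a": 1}),
#     )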