💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)

Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉

See here: https://github.com/explosion/srsly

    Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.

    At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.

    srsly currently includes forks of the following packages:

        ujson
        msgpack
        msgpack-numpy
        cloudpickle



* WIP: replace json/ujson with srsly

* Replace ujson in examples

Use regular json instead of srsly to make code easier to read and follow

* Update requirements

* Fix imports

* Fix typos

* Replace msgpack with srsly

* Fix warning
This commit is contained in:
Ines Montani 2018-12-03 01:28:22 +01:00 committed by Matthew Honnibal
parent 40b57ea4ac
commit f37863093a
33 changed files with 130 additions and 238 deletions

View File

@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
helper function. helper function.
```python ```python
from .compat import unicode_, json_dumps, is_config from .compat import unicode_, is_config
compatible_unicode = unicode_('hello world') compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
if is_config(windows=True, python2=True): if is_config(windows=True, python2=True):
print("You are using Python 2 on Windows.") print("You are using Python 2 on Windows.")
``` ```

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals
import bz2 import bz2
import regex as re import regex as re
import ujson import srsly
import sys import sys
import random import random
import datetime import datetime
@ -44,7 +44,7 @@ class Reddit(object):
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
comment = ujson.loads(line) comment = srsly.json_loads(line)
if self.is_valid(comment): if self.is_valid(comment):
text = self.strip_tags(comment["body"]) text = self.strip_tags(comment["body"])
yield {"text": text} yield {"text": text}
@ -75,7 +75,7 @@ class Reddit(object):
def main(path): def main(path):
reddit = Reddit(path) reddit = Reddit(path)
for comment in reddit: for comment in reddit:
print(ujson.dumps(comment)) print(srsly.json_dumps(comment))
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division
from bz2 import BZ2File from bz2 import BZ2File
import time import time
import plac import plac
import ujson import json
from spacy.matcher import PhraseMatcher from spacy.matcher import PhraseMatcher
import spacy import spacy
@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"):
def read_gazetteer(tokenizer, loc, n=-1): def read_gazetteer(tokenizer, loc, n=-1):
for i, line in enumerate(open(loc)): for i, line in enumerate(open(loc)):
data = ujson.loads(line.strip()) data = json.loads(line.strip())
phrase = tokenizer(data["text"]) phrase = tokenizer(data["text"])
for w in phrase: for w in phrase:
_ = tokenizer.vocab[w.text] _ = tokenizer.vocab[w.text]
@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1):
def read_text(bz2_loc, n=10000): def read_text(bz2_loc, n=10000):
with BZ2File(bz2_loc) as file_: with BZ2File(bz2_loc) as file_:
for i, line in enumerate(file_): for i, line in enumerate(file_):
data = ujson.loads(line) data = json.loads(line)
yield data["body"] yield data["body"]
if i >= n: if i >= n:
break break

View File

@ -1,5 +1,5 @@
import numpy as np import numpy as np
import ujson as json import json
from keras.utils import to_categorical from keras.utils import to_categorical
import plac import plac
import sys import sys

View File

@ -77,7 +77,7 @@
} }
], ],
"source": [ "source": [
"import ujson as json\n", "import json\n",
"from keras.utils import to_categorical\n", "from keras.utils import to_categorical\n",
"\n", "\n",
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n", "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",

View File

@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
cytoolz>=0.9.0,<0.10.0 cytoolz>=0.9.0,<0.10.0
plac<1.0.0,>=0.9.6 plac<1.0.0,>=0.9.6
ujson>=1.35
dill>=0.2,<0.3 dill>=0.2,<0.3
regex==2018.01.10 regex==2018.01.10
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0 jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0 wasabi>=0.0.8,<1.1.0
srsly>=0.0.4,<1.1.0
pathlib==1.0.1; python_version < "3.4" pathlib==1.0.1; python_version < "3.4"
# Development dependencies # Development dependencies
cython>=0.25 cython>=0.25

View File

@ -203,12 +203,12 @@ def setup_package():
"thinc==7.0.0.dev4", "thinc==7.0.0.dev4",
"blis>=0.2.2,<0.3.0", "blis>=0.2.2,<0.3.0",
"plac<1.0.0,>=0.9.6", "plac<1.0.0,>=0.9.6",
"ujson>=1.35",
"regex==2018.01.10", "regex==2018.01.10",
"dill>=0.2,<0.3", "dill>=0.2,<0.3",
"requests>=2.13.0,<3.0.0", "requests>=2.13.0,<3.0.0",
"jsonschema>=2.6.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0",
"wasabi>=0.0.8,<1.1.0", "wasabi>=0.0.8,<1.1.0",
"srsly>=0.0.4,<1.1.0",
'pathlib==1.0.1; python_version < "3.4"', 'pathlib==1.0.1; python_version < "3.4"',
], ],
setup_requires=["wheel"], setup_requires=["wheel"],

View File

@ -4,9 +4,9 @@ from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
from wasabi import Printer from wasabi import Printer
import srsly
from ..util import write_jsonl, write_json from ..compat import path2str
from ..compat import json_dumps, path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json from .converters import ner_jsonl2json
from ._messages import Messages from ._messages import Messages
@ -77,9 +77,9 @@ def convert(
suffix = ".{}".format(file_type) suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json": if file_type == "json":
write_json(output_file, data) srsly.write_json(output_file, data)
elif file_type == "jsonl": elif file_type == "jsonl":
write_jsonl(output_file, data) srsly.write_jsonl(output_file, data)
msg.good( msg.good(
Messages.M032.format(name=path2str(output_file)), Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)), Messages.M033.format(n_docs=len(data)),
@ -87,7 +87,6 @@ def convert(
else: else:
# Print to stdout # Print to stdout
if file_type == "json": if file_type == "json":
print(json_dumps(data)) srsly.write_json("-", data)
elif file_type == "jsonl": elif file_type == "jsonl":
for line in data: srsly.write_jsonl("-", data)
print(json_dumps(line))

View File

@ -1,7 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson import srsly
from ...util import get_lang_class from ...util import get_lang_class
from .._messages import Messages from .._messages import Messages
@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
if lang is None: if lang is None:
raise ValueError(Messages.M054) raise ValueError(Messages.M054)
json_docs = [] json_docs = []
input_tuples = [ujson.loads(line) for line in input_data] input_tuples = [srsly.json_loads(line) for line in input_data]
nlp = get_lang_class(lang)() nlp = get_lang_class(lang)()
for i, (raw_text, ents) in enumerate(input_tuples): for i, (raw_text, ents) in enumerate(input_tuples):
doc = nlp.make_doc(raw_text) doc = nlp.make_doc(raw_text)

View File

@ -5,10 +5,11 @@ from pathlib import Path
from collections import Counter from collections import Counter
import plac import plac
import sys import sys
import srsly
from wasabi import Printer, MESSAGES from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus, read_json_object from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl from ..util import load_model, get_lang_class
# from .schemas import get_schema, validate_json # from .schemas import get_schema, validate_json
from ._messages import Messages from ._messages import Messages
@ -320,11 +321,11 @@ def debug_data(
def _load_file(file_path, msg): def _load_file(file_path, msg):
file_name = file_path.parts[-1] file_name = file_path.parts[-1]
if file_path.suffix == ".json": if file_path.suffix == ".json":
data = read_json(file_path) data = srsly.read_json(file_path)
msg.good("Loaded {}".format(file_name)) msg.good("Loaded {}".format(file_name))
return data return data
elif file_path.suffix == ".jsonl": elif file_path.suffix == ".jsonl":
data = read_jsonl(file_path) data = srsly.read_jsonl(file_path)
msg.good("Loaded {}".format(file_name)) msg.good("Loaded {}".format(file_name))
return data return data
msg.fail( msg.fail(

View File

@ -5,6 +5,7 @@ import plac
import platform import platform
from pathlib import Path from pathlib import Path
from wasabi import Printer from wasabi import Printer
import srsly
from ._messages import Messages from ._messages import Messages
from ..compat import path2str, basestring_, unicode_ from ..compat import path2str, basestring_, unicode_
@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
meta_path = model_path / "meta.json" meta_path = model_path / "meta.json"
if not meta_path.is_file(): if not meta_path.is_file():
msg.fail(Messages.M020, meta_path, exits=1) msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) meta = srsly.read_json(meta_path)
if model_path.resolve() != model_path: if model_path.resolve() != model_path:
meta["link"] = path2str(model_path) meta["link"] = path2str(model_path)
meta["source"] = path2str(model_path.resolve()) meta["source"] = path2str(model_path.resolve())

View File

@ -11,12 +11,13 @@ from preshed.counter import PreshCounter
import tarfile import tarfile
import gzip import gzip
import zipfile import zipfile
import srsly
from wasabi import Printer from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..vectors import Vectors from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning from ..errors import Errors, Warnings, user_warning
from ..util import ensure_path, get_lang_class, read_jsonl from ..util import ensure_path, get_lang_class
try: try:
import ftfy import ftfy
@ -59,7 +60,7 @@ def init_model(
settings.append("-c") settings.append("-c")
msg.warn(Messages.M063, Messages.M064) msg.warn(Messages.M063, Messages.M064)
jsonl_loc = ensure_path(jsonl_loc) jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = read_jsonl(jsonl_loc) lex_attrs = srsly.read_jsonl(jsonl_loc)
else: else:
clusters_loc = ensure_path(clusters_loc) clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc) freqs_loc = ensure_path(freqs_loc)

View File

@ -5,9 +5,10 @@ import plac
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import Printer, get_raw_input from wasabi import Printer, get_raw_input
import srsly
from ._messages import Messages from ._messages import Messages
from ..compat import path2str, json_dumps from ..compat import path2str
from .. import util from .. import util
from .. import about from .. import about
@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
meta_path = meta_path or input_path / "meta.json" meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file(): if meta_path.is_file():
meta = util.read_json(meta_path) meta = srsly.read_json(meta_path)
if not create_meta: # only print if user doesn't want to overwrite if not create_meta: # only print if user doesn't want to overwrite
msg.good(Messages.M041, meta_path) msg.good(Messages.M041, meta_path)
else: else:
@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
) )
Path.mkdir(package_path, parents=True) Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
create_file(main_path / "meta.json", json_dumps(meta)) create_file(main_path / "meta.json", srsly.json_dumps(meta))
create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT) create_file(package_path / "__init__.py", TEMPLATE_INIT)

View File

@ -5,8 +5,6 @@ import plac
import random import random
import numpy import numpy
import time import time
import ujson
import sys
from collections import Counter from collections import Counter
from pathlib import Path from pathlib import Path
from thinc.v2v import Affine, Maxout from thinc.v2v import Affine, Maxout
@ -14,10 +12,10 @@ from thinc.api import wrap
from thinc.misc import LayerNorm as LN from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu from thinc.neural.util import prefer_gpu
from wasabi import Printer from wasabi import Printer
import srsly
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ID, HEAD from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util from .. import util
@ -72,7 +70,7 @@ def pretrain(
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()
msg.good("Created output directory") msg.good("Created output directory")
util.write_json(output_dir / "config.json", config) srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json") msg.good("Saved settings to config.json")
# Load texts from file or stdin # Load texts from file or stdin
@ -81,12 +79,12 @@ def pretrain(
if not texts_loc.exists(): if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1) msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."): with msg.loading("Loading input texts..."):
texts = list(util.read_jsonl(texts_loc)) texts = list(srsly.read_jsonl(texts_loc))
msg.good("Loaded input texts") msg.good("Loaded input texts")
random.shuffle(texts) random.shuffle(texts)
else: # reading from stdin else: # reading from stdin
msg.text("Reading input text from stdin...") msg.text("Reading input text from stdin...")
texts = stream_texts() texts = srsly.read_jsonl("-")
with msg.loading("Loading model '{}'...".format(vectors_model)): with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model) nlp = util.load_model(vectors_model)
@ -130,18 +128,13 @@ def pretrain(
"epoch": epoch, "epoch": epoch,
} }
with (output_dir / "log.jsonl").open("a") as file_: with (output_dir / "log.jsonl").open("a") as file_:
file_.write(json_dumps(log) + "\n") file_.write(srsly.json_dumps(log) + "\n")
tracker.epoch_loss = 0.0 tracker.epoch_loss = 0.0
if texts_loc != "-": if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file # Reshuffle the texts if texts were loaded from a file
random.shuffle(texts) random.shuffle(texts)
def stream_texts():
for line in sys.stdin:
yield ujson.loads(line)
def make_update(model, docs, optimizer, drop=0.0): def make_update(model, docs, optimizer, drop=0.0):
"""Perform an update over a single batch of documents. """Perform an update over a single batch of documents.

View File

@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
import plac import plac
from pathlib import Path from pathlib import Path
import ujson import srsly
import cProfile import cProfile
import pstats import pstats
import sys import sys
@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
msg.info("Using data from {}".format(input_path.parts[-1])) msg.info("Using data from {}".format(input_path.parts[-1]))
file_ = input_path.open() file_ = input_path.open()
for line in file_: for line in file_:
data = ujson.loads(line) data = srsly.json_loads(line)
text = data["text"] text = data["text"]
yield text yield text

View File

@ -3,9 +3,9 @@ from __future__ import unicode_literals
from pathlib import Path from pathlib import Path
from jsonschema import Draft4Validator from jsonschema import Draft4Validator
import srsly
from ...errors import Errors from ...errors import Errors
from ...util import read_json
SCHEMAS = {} SCHEMAS = {}
@ -25,7 +25,7 @@ def get_schema(name):
schema_path = Path(__file__).parent / "{}.json".format(name) schema_path = Path(__file__).parent / "{}.json".format(name)
if not schema_path.exists(): if not schema_path.exists():
raise ValueError(Errors.E104.format(name=name)) raise ValueError(Errors.E104.format(name=name))
schema = read_json(schema_path) schema = srsly.read_json(schema_path)
# TODO: replace with (stable) Draft6Validator, if available # TODO: replace with (stable) Draft6Validator, if available
validator = Draft4Validator(schema) validator = Draft4Validator(schema)
validator.check_schema(schema) validator.check_schema(schema)

View File

@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
import shutil import shutil
import srsly
from wasabi import Printer from wasabi import Printer
from ._messages import Messages from ._messages import Messages
@ -111,7 +112,7 @@ def train(
msg.fail(Messages.M051, dev_path, exits=1) msg.fail(Messages.M051, dev_path, exits=1)
if meta_path is not None and not meta_path.exists(): if meta_path is not None and not meta_path.exists():
msg.fail(Messages.M020, meta_path, exits=1) msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) if meta_path else {} meta = srsly.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict): if not isinstance(meta, dict):
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1) msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
@ -226,7 +227,7 @@ def train(
end_time = timer() end_time = timer()
cpu_wps = nwords / (end_time - start_time) cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json" acc_loc = output_path / ("model%d" % i) / "accuracy.json"
util.write_json(acc_loc, scorer.scores) srsly.write_json(acc_loc, scorer.scores)
# Update model meta.json # Update model meta.json
meta["lang"] = nlp.lang meta["lang"] = nlp.lang
@ -242,7 +243,7 @@ def train(
meta.setdefault("name", "model%d" % i) meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version) meta.setdefault("version", version)
meta_loc = output_path / ("model%d" % i) / "meta.json" meta_loc = output_path / ("model%d" % i) / "meta.json"
util.write_json(meta_loc, meta) srsly.write_json(meta_loc, meta)
util.set_env_log(verbose) util.set_env_log(verbose)
@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components):
for component, best_component_src in bests.items(): for component, best_component_src in bests.items():
shutil.rmtree(best_dest / component) shutil.rmtree(best_dest / component)
shutil.copytree(best_component_src / component, best_dest / component) shutil.copytree(best_component_src / component, best_dest / component)
accs = util.read_json(best_component_src / "accuracy.json") accs = srsly.read_json(best_component_src / "accuracy.json")
for metric in _get_metrics(component): for metric in _get_metrics(component):
meta["accuracy"][metric] = accs[metric] meta["accuracy"][metric] = accs[metric]
util.write_json(best_dest / "meta.json", meta) srsly.write_json(best_dest / "meta.json", meta)
def _find_best(experiment_dir, component): def _find_best(experiment_dir, component):
accuracies = [] accuracies = []
for epoch_model in experiment_dir.iterdir(): for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = util.read_json(epoch_model / "accuracy.json") accs = srsly.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
accuracies.append((scores, epoch_model)) accuracies.append((scores, epoch_model))
if accuracies: if accuracies:

View File

@ -9,7 +9,7 @@ import tqdm
from pathlib import Path from pathlib import Path
import re import re
import sys import sys
import json import srsly
import spacy import spacy
import spacy.util import spacy.util
@ -44,7 +44,7 @@ from ...lang import ru
# Data reading # # Data reading #
################ ################
space_re = re.compile("\s+") space_re = re.compile(r"\s+")
def split_text(text): def split_text(text):
@ -332,8 +332,7 @@ def main(test_data_dir, experiment_dir, corpus):
/ corpus / corpus
/ "{section}-accuracy.json".format(section=section) / "{section}-accuracy.json".format(section=section)
) )
with open(acc_path, "w") as file_: srsly.write_json(acc_path, accuracy)
file_.write(json.dumps(accuracy, indent=2))
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -5,11 +5,12 @@ import pkg_resources
from pathlib import Path from pathlib import Path
import sys import sys
import requests import requests
import srsly
from wasabi import Printer from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..compat import path2str from ..compat import path2str
from ..util import get_data_path, read_json from ..util import get_data_path
from .. import about from .. import about
@ -84,7 +85,7 @@ def get_model_links(compat):
meta_path = Path(model) / "meta.json" meta_path = Path(model) / "meta.json"
if not meta_path.exists(): if not meta_path.exists():
continue continue
meta = read_json(meta_path) meta = srsly.read_json(meta_path)
link = model.parts[-1] link = model.parts[-1]
name = meta["lang"] + "_" + meta["name"] name = meta["lang"] + "_" + meta["name"]
links[link] = { links[link] = {

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
import os import os
import sys import sys
import ujson
import itertools import itertools
from thinc.neural.util import copy_array from thinc.neural.util import copy_array
@ -54,9 +53,6 @@ if is_python2:
unicode_ = unicode # noqa: F821 unicode_ = unicode # noqa: F821
basestring_ = basestring # noqa: F821 basestring_ = basestring # noqa: F821
input_ = raw_input # noqa: F821 input_ = raw_input # noqa: F821
json_dumps = lambda data, indent=2: ujson.dumps(
data, indent=indent, escape_forward_slashes=False
).decode("utf8")
path2str = lambda path: str(path).decode("utf8") path2str = lambda path: str(path).decode("utf8")
elif is_python3: elif is_python3:
@ -64,9 +60,6 @@ elif is_python3:
unicode_ = str unicode_ = str
basestring_ = str basestring_ = str
input_ = input input_ = input
json_dumps = lambda data, indent=2: ujson.dumps(
data, indent=indent, escape_forward_slashes=False
)
path2str = lambda path: str(path) path2str = lambda path: str(path)

View File

@ -10,10 +10,7 @@ import numpy
import tempfile import tempfile
import shutil import shutil
from pathlib import Path from pathlib import Path
import msgpack import srsly
import json
import ujson
from . import _align from . import _align
from .syntax import nonproj from .syntax import nonproj
@ -21,7 +18,6 @@ from .tokens import Doc
from .errors import Errors from .errors import Errors
from . import util from . import util
from .util import minibatch, itershuffle from .util import minibatch, itershuffle
from .compat import json_dumps
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
@ -123,12 +119,11 @@ class GoldCorpus(object):
directory.mkdir() directory.mkdir()
n = 0 n = 0
for i, doc_tuple in enumerate(doc_tuples): for i, doc_tuple in enumerate(doc_tuples):
with open(directory / '{}.msg'.format(i), 'wb') as file_: srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple])
msgpack.dump([doc_tuple], file_, use_bin_type=True)
n += len(doc_tuple[1]) n += len(doc_tuple[1])
if limit and n >= limit: if limit and n >= limit:
break break
@staticmethod @staticmethod
def walk_corpus(path): def walk_corpus(path):
path = util.ensure_path(path) path = util.ensure_path(path)
@ -157,8 +152,7 @@ class GoldCorpus(object):
if loc.parts[-1].endswith('json'): if loc.parts[-1].endswith('json'):
gold_tuples = read_json_file(loc) gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith('msg'): elif loc.parts[-1].endswith('msg'):
with loc.open('rb') as file_: gold_tuples = srsly.read_msgpack(loc)
gold_tuples = msgpack.load(file_, raw=False)
else: else:
msg = "Cannot read from file: %s. Supported formats: .json, .msg" msg = "Cannot read from file: %s. Supported formats: .json, .msg"
raise ValueError(msg % loc) raise ValueError(msg % loc)
@ -378,7 +372,7 @@ def _json_iterate(loc):
if square_depth == 1 and curly_depth == 0: if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i+1].decode('utf8') py_str = py_raw[start : i+1].decode('utf8')
try: try:
yield json.loads(py_str) yield srsly.json_loads(py_str)
except Exception: except Exception:
print(py_str) print(py_str)
raise raise

View File

@ -2,7 +2,6 @@
from __future__ import absolute_import, unicode_literals from __future__ import absolute_import, unicode_literals
import random import random
import ujson
import itertools import itertools
import weakref import weakref
import functools import functools
@ -10,6 +9,7 @@ from collections import OrderedDict
from contextlib import contextmanager from contextlib import contextmanager
from copy import copy from copy import copy
from thinc.neural import Model from thinc.neural import Model
import srsly
from .tokenizer import Tokenizer from .tokenizer import Tokenizer
from .vocab import Vocab from .vocab import Vocab
@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler from .pipeline import EntityRuler
from .compat import json_dumps, izip, basestring_ from .compat import izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer from ._ml import link_vectors_to_models, create_default_optimizer
@ -640,7 +640,7 @@ class Language(object):
serializers = OrderedDict( serializers = OrderedDict(
( (
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)), ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))), ("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
) )
) )
for name, proc in self.pipeline: for name, proc in self.pipeline:
@ -671,7 +671,7 @@ class Language(object):
path = util.ensure_path(path) path = util.ensure_path(path)
deserializers = OrderedDict( deserializers = OrderedDict(
( (
("meta.json", lambda p: self.meta.update(util.read_json(p))), ("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
( (
"vocab", "vocab",
lambda p: ( lambda p: (
@ -705,7 +705,7 @@ class Language(object):
( (
("vocab", lambda: self.vocab.to_bytes()), ("vocab", lambda: self.vocab.to_bytes()),
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)), ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
("meta", lambda: json_dumps(self.meta)), ("meta", lambda: srsly.json_dumps(self.meta)),
) )
) )
for i, (name, proc) in enumerate(self.pipeline): for i, (name, proc) in enumerate(self.pipeline):
@ -725,7 +725,7 @@ class Language(object):
""" """
deserializers = OrderedDict( deserializers = OrderedDict(
( (
("meta", lambda b: self.meta.update(ujson.loads(b))), ("meta", lambda b: self.meta.update(srsly.json_loads(b))),
( (
"vocab", "vocab",
lambda b: ( lambda b: (

View File

@ -7,10 +7,7 @@ import numpy
cimport numpy as np cimport numpy as np
import cytoolz import cytoolz
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
import ujson import srsly
from .util import msgpack
from .util import msgpack_numpy
from thinc.api import chain from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax from thinc.v2v import Affine, Maxout, Softmax
@ -27,7 +24,6 @@ from .syntax.arc_eager cimport ArcEager
from .morphology cimport Morphology from .morphology cimport Morphology
from .vocab cimport Vocab from .vocab cimport Vocab
from .syntax import nonproj from .syntax import nonproj
from .compat import json_dumps
from .matcher import Matcher from .matcher import Matcher
from .matcher import Matcher, PhraseMatcher from .matcher import Matcher, PhraseMatcher
@ -38,7 +34,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import link_vectors_to_models, zero_init, flatten
from ._ml import create_default_optimizer from ._ml import create_default_optimizer
from .errors import Errors, TempErrors from .errors import Errors, TempErrors
from .compat import json_dumps, basestring_ from .compat import basestring_
from . import util from . import util
@ -235,7 +231,7 @@ class EntityRuler(object):
**kwargs: Other config paramters, mostly for consistency. **kwargs: Other config paramters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler. RETURNS (EntityRuler): The loaded entity ruler.
""" """
patterns = msgpack.loads(patterns_bytes, raw=False) patterns = srsly.msgpack_loads(patterns_bytes)
self.add_patterns(patterns) self.add_patterns(patterns)
return self return self
@ -244,7 +240,7 @@ class EntityRuler(object):
RETURNS (bytes): The serialized patterns. RETURNS (bytes): The serialized patterns.
""" """
return msgpack.dumps(self.patterns, use_bin_type=True) return srsly.msgpack_dumps(self.patterns)
def from_disk(self, path, **kwargs): def from_disk(self, path, **kwargs):
"""Load the entity ruler from a file. Expects a file containing """Load the entity ruler from a file. Expects a file containing
@ -256,7 +252,7 @@ class EntityRuler(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix('.jsonl') path = path.with_suffix('.jsonl')
patterns = util.read_jsonl(path) patterns = srsly.read_jsonl(path)
self.add_patterns(patterns) self.add_patterns(patterns)
return self return self
@ -270,8 +266,7 @@ class EntityRuler(object):
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
path = path.with_suffix('.jsonl') path = path.with_suffix('.jsonl')
data = [json_dumps(line, indent=0) for line in self.patterns] srsly.write_jsonl(path, self.patterns)
path.open('w').write('\n'.join(data))
class Pipe(object): class Pipe(object):
@ -368,7 +363,7 @@ class Pipe(object):
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the pipe to a bytestring.""" """Serialize the pipe to a bytestring."""
serialize = OrderedDict() serialize = OrderedDict()
serialize['cfg'] = lambda: json_dumps(self.cfg) serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
if self.model in (True, False, None): if self.model in (True, False, None):
serialize['model'] = lambda: self.model serialize['model'] = lambda: self.model
else: else:
@ -387,7 +382,7 @@ class Pipe(object):
self.model.from_bytes(b) self.model.from_bytes(b)
deserialize = OrderedDict(( deserialize = OrderedDict((
('cfg', lambda b: self.cfg.update(ujson.loads(b))), ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('model', load_model), ('model', load_model),
)) ))
@ -397,7 +392,7 @@ class Pipe(object):
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Serialize the pipe to disk.""" """Serialize the pipe to disk."""
serialize = OrderedDict() serialize = OrderedDict()
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
serialize['vocab'] = lambda p: self.vocab.to_disk(p) serialize['vocab'] = lambda p: self.vocab.to_disk(p)
if self.model not in (None, True, False): if self.model not in (None, True, False):
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
@ -424,8 +419,7 @@ class Pipe(object):
def _load_cfg(path): def _load_cfg(path):
if path.exists(): if path.exists():
with path.open() as file_: return srsly.read_json(path)
return ujson.load(file_)
else: else:
return {} return {}
@ -745,10 +739,9 @@ class Tagger(Pipe):
else: else:
serialize['model'] = self.model.to_bytes serialize['model'] = self.model.to_bytes
serialize['vocab'] = self.vocab.to_bytes serialize['vocab'] = self.vocab.to_bytes
serialize['cfg'] = lambda: ujson.dumps(self.cfg) serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize['tag_map'] = lambda: msgpack.dumps( serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map)
tag_map, use_bin_type=True)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
@ -766,7 +759,7 @@ class Tagger(Pipe):
self.model.from_bytes(b) self.model.from_bytes(b)
def load_tag_map(b): def load_tag_map(b):
tag_map = msgpack.loads(b, raw=False) tag_map = srsly.msgpack_loads(b)
self.vocab.morphology = Morphology( self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map, self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer, lemmatizer=self.vocab.morphology.lemmatizer,
@ -775,7 +768,7 @@ class Tagger(Pipe):
deserialize = OrderedDict(( deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('tag_map', load_tag_map), ('tag_map', load_tag_map),
('cfg', lambda b: self.cfg.update(ujson.loads(b))), ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: load_model(b)), ('model', lambda b: load_model(b)),
)) ))
util.from_bytes(bytes_data, deserialize, exclude) util.from_bytes(bytes_data, deserialize, exclude)
@ -785,10 +778,9 @@ class Tagger(Pipe):
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
serialize = OrderedDict(( serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)), ('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('wb').write(msgpack.dumps( ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
tag_map, use_bin_type=True))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())), ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))) ('cfg', lambda p: srsly.write_json(p, self.cfg))
)) ))
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
@ -803,8 +795,7 @@ class Tagger(Pipe):
self.model.from_bytes(file_.read()) self.model.from_bytes(file_.read())
def load_tag_map(p): def load_tag_map(p):
with p.open('rb') as file_: tag_map = srsly.read_msgpack(p)
tag_map = msgpack.loads(file_.read(), raw=False)
self.vocab.morphology = Morphology( self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map, self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer, lemmatizer=self.vocab.morphology.lemmatizer,

View File

@ -7,12 +7,11 @@ from libc.string cimport memcpy
from libcpp.set cimport set from libcpp.set cimport set
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t
from murmurhash.mrmr cimport hash64, hash32 from murmurhash.mrmr cimport hash64, hash32
import ujson import srsly
from .symbols import IDS as SYMBOLS_BY_STR from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .compat import json_dumps
from .errors import Errors from .errors import Errors
from . import util from . import util
@ -197,8 +196,7 @@ cdef class StringStore:
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
strings = list(self) strings = list(self)
with path.open('w') as file_: srsly.write_json(path, strings)
file_.write(json_dumps(strings))
def from_disk(self, path): def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
@ -209,8 +207,7 @@ cdef class StringStore:
RETURNS (StringStore): The modified `StringStore` object. RETURNS (StringStore): The modified `StringStore` object.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
with path.open('r') as file_: strings = srsly.read_json(path)
strings = ujson.load(file_)
prev = list(self) prev = list(self)
self._reset_and_load(strings) self._reset_and_load(strings)
for word in prev: for word in prev:
@ -223,7 +220,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being serialized. **exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object. RETURNS (bytes): The serialized form of the `StringStore` object.
""" """
return json_dumps(list(self)) return srsly.json_dumps(list(self))
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string. """Load state from a binary string.
@ -232,7 +229,7 @@ cdef class StringStore:
**exclude: Named attributes to prevent from being loaded. **exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object. RETURNS (StringStore): The `StringStore` object.
""" """
strings = ujson.loads(bytes_data) strings = srsly.json_loads(bytes_data)
prev = list(self) prev = list(self)
self._reset_and_load(strings) self._reset_and_load(strings)
for word in prev: for word in prev:

View File

@ -5,8 +5,6 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from collections import OrderedDict from collections import OrderedDict
import ujson
import json
import numpy import numpy
cimport cython.parallel cimport cython.parallel
import cytoolz import cytoolz
@ -29,7 +27,7 @@ cimport blis.cy
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import json_dumps, copy_array from ..compat import copy_array
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..errors import Errors, TempErrors from ..errors import Errors, TempErrors
@ -119,7 +117,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
VecVec.add_i(&A.scores[i*n.classes], VecVec.add_i(&A.scores[i*n.classes],
W.hidden_bias, 1., n.classes) W.hidden_bias, 1., n.classes)
cdef void sum_state_features(float* output, cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil: const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i cdef int idx, b, f, i
@ -165,7 +163,7 @@ cdef void cpu_log_loss(float* d_scores,
else: else:
d_scores[i] = exp(scores[i]-max_) / Z d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const int* is_valid, int n) nogil: const int* is_valid, int n) nogil:
# Find minimum cost # Find minimum cost
@ -218,15 +216,15 @@ class ParserModel(Model):
def begin_training(self, X, y=None): def begin_training(self, X, y=None):
self.lower.begin_training(X, y=y) self.lower.begin_training(X, y=y)
@property @property
def tok2vec(self): def tok2vec(self):
return self._layers[0] return self._layers[0]
@property @property
def lower(self): def lower(self):
return self._layers[1] return self._layers[1]
@property @property
def upper(self): def upper(self):
return self._layers[2] return self._layers[2]
@ -405,4 +403,3 @@ cdef class precompute_hiddens:
else: else:
return self.ops.backprop_maxout(d_best, mask, self.nP) return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_nonlinearity return state_vector, backprop_nonlinearity

View File

@ -5,8 +5,6 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
from collections import OrderedDict from collections import OrderedDict
import ujson
import json
import numpy import numpy
cimport cython.parallel cimport cython.parallel
import cytoolz import cytoolz
@ -27,6 +25,7 @@ from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec from thinc.linalg cimport Vec, VecVec
import srsly
from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel from ._parser_model import ParserModel
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models, create_default_optimizer from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import json_dumps, copy_array from ..compat import copy_array
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold cimport GoldParse from ..gold cimport GoldParse
from ..errors import Errors, TempErrors from ..errors import Errors, TempErrors
@ -539,7 +538,7 @@ cdef class Parser:
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
'vocab': lambda p: self.vocab.to_disk(p), 'vocab': lambda p: self.vocab.to_disk(p),
'moves': lambda p: self.moves.to_disk(p, strings=False), 'moves': lambda p: self.moves.to_disk(p, strings=False),
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) 'cfg': lambda p: srsly.write_json(p, self.cfg)
} }
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
@ -547,7 +546,7 @@ cdef class Parser:
deserializers = { deserializers = {
'vocab': lambda p: self.vocab.from_disk(p), 'vocab': lambda p: self.vocab.from_disk(p),
'moves': lambda p: self.moves.from_disk(p, strings=False), 'moves': lambda p: self.moves.from_disk(p, strings=False),
'cfg': lambda p: self.cfg.update(util.read_json(p)), 'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
'model': lambda p: None 'model': lambda p: None
} }
util.from_disk(path, deserializers, exclude) util.from_disk(path, deserializers, exclude)
@ -568,7 +567,7 @@ cdef class Parser:
('model', lambda: (self.model.to_bytes() if self.model is not True else True)), ('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
('vocab', lambda: self.vocab.to_bytes()), ('vocab', lambda: self.vocab.to_bytes()),
('moves', lambda: self.moves.to_bytes(strings=False)), ('moves', lambda: self.moves.to_bytes(strings=False)),
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
)) ))
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
@ -576,7 +575,7 @@ cdef class Parser:
deserializers = OrderedDict(( deserializers = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)), ('vocab', lambda b: self.vocab.from_bytes(b)),
('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
('cfg', lambda b: self.cfg.update(json.loads(b))), ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
('model', lambda b: None) ('model', lambda b: None)
)) ))
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)

View File

@ -7,14 +7,13 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
from collections import OrderedDict, Counter from collections import OrderedDict, Counter
import ujson import srsly
from . cimport _beam_utils from . cimport _beam_utils
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..structs cimport TokenC from ..structs cimport TokenC
from .stateclass cimport StateClass from .stateclass cimport StateClass
from ..typedefs cimport attr_t from ..typedefs cimport attr_t
from ..compat import json_dumps
from ..errors import Errors from ..errors import Errors
from .. import util from .. import util
@ -153,13 +152,13 @@ cdef class TransitionSystem:
# Make sure we take a copy here, and that we get a Counter # Make sure we take a copy here, and that we get a Counter
self.labels[action] = Counter() self.labels[action] = Counter()
# Have to be careful here: Sorting must be stable, or our model # Have to be careful here: Sorting must be stable, or our model
# won't be read back in correctly. # won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()] sorted_labels = [(f, L) for L, f in label_freqs.items()]
sorted_labels.sort() sorted_labels.sort()
sorted_labels.reverse() sorted_labels.reverse()
for freq, label_str in sorted_labels: for freq, label_str in sorted_labels:
self.add_action(int(action), label_str) self.add_action(int(action), label_str)
self.labels[action][label_str] = freq self.labels[action][label_str] = freq
def add_action(self, int action, label_name): def add_action(self, int action, label_name):
cdef attr_t label_id cdef attr_t label_id
@ -204,7 +203,7 @@ cdef class TransitionSystem:
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
transitions = [] transitions = []
serializers = { serializers = {
'moves': lambda: json_dumps(self.labels), 'moves': lambda: srsly.json_dumps(self.labels),
'strings': lambda: self.strings.to_bytes() 'strings': lambda: self.strings.to_bytes()
} }
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
@ -212,7 +211,7 @@ cdef class TransitionSystem:
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
labels = {} labels = {}
deserializers = { deserializers = {
'moves': lambda b: labels.update(ujson.loads(b)), 'moves': lambda b: labels.update(srsly.json_loads(b)),
'strings': lambda b: self.strings.from_bytes(b) 'strings': lambda b: self.strings.from_bytes(b)
} }
msg = util.from_bytes(bytes_data, deserializers, exclude) msg = util.from_bytes(bytes_data, deserializers, exclude)

View File

@ -5,7 +5,7 @@ import numpy
import tempfile import tempfile
import shutil import shutil
import contextlib import contextlib
import msgpack import srsly
from pathlib import Path from pathlib import Path
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.attrs import POS, HEAD, DEP from spacy.attrs import POS, HEAD, DEP
@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2):
def assert_packed_msg_equal(b1, b2): def assert_packed_msg_equal(b1, b2):
"""Assert that two packed msgpack messages are equal.""" """Assert that two packed msgpack messages are equal."""
msg1 = msgpack.loads(b1, encoding="utf8") msg1 = srsly.msgpack_loads(b1)
msg2 = msgpack.loads(b2, encoding="utf8") msg2 = srsly.msgpack_loads(b2)
assert sorted(msg1.keys()) == sorted(msg2.keys()) assert sorted(msg1.keys()) == sorted(msg2.keys())
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2 assert k1 == k2

View File

@ -1,8 +1,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import numpy import numpy
import msgpack
import gzip import gzip
import srsly
from thinc.neural.ops import NumpyOps from thinc.neural.ops import NumpyOps
from ..compat import copy_reg from ..compat import copy_reg
@ -74,11 +74,11 @@ class Binder(object):
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings), "strings": list(self.strings),
} }
return gzip.compress(msgpack.dumps(msg)) return gzip.compress(srsly.msgpack_dumps(msg))
def from_bytes(self, string): def from_bytes(self, string):
"""Deserialize the binder's annotations from a byte string.""" """Deserialize the binder's annotations from a byte string."""
msg = msgpack.loads(gzip.decompress(string)) msg = srsly.msgpack_loads(gzip.decompress(string))
self.attrs = msg["attrs"] self.attrs = msg["attrs"]
self.strings = set(msg["strings"]) self.strings = set(msg["strings"])
lengths = numpy.fromstring(msg["lengths"], dtype="int32") lengths = numpy.fromstring(msg["lengths"], dtype="int32")

View File

@ -10,8 +10,8 @@ import numpy
import numpy.linalg import numpy.linalg
import struct import struct
import dill import dill
import msgpack
from thinc.neural.util import get_array_module, copy_array from thinc.neural.util import get_array_module, copy_array
import srsly
from libc.string cimport memcpy, memset from libc.string cimport memcpy, memset
from libc.math cimport sqrt from libc.math cimport sqrt
@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice, is_json_serializable from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle, basestring_ from ..compat import is_config, copy_reg, pickle, basestring_
from ..errors import deprecation_warning, models_warning, user_warning from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
@ -807,8 +807,8 @@ cdef class Doc:
} }
if 'user_data' not in exclude and self.user_data: if 'user_data' not in exclude and self.user_data:
user_data_keys, user_data_values = list(zip(*self.user_data.items())) user_data_keys, user_data_values = list(zip(*self.user_data.items()))
serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys) serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys)
serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values) serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values)
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
@ -836,9 +836,8 @@ cdef class Doc:
# keys, we must have tuples. In values we just have to hope # keys, we must have tuples. In values we just have to hope
# users don't mind getting a list instead of a tuple. # users don't mind getting a list instead of a tuple.
if 'user_data' not in exclude and 'user_data_keys' in msg: if 'user_data' not in exclude and 'user_data_keys' in msg:
user_data_keys = msgpack.loads(msg['user_data_keys'], user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False)
use_list=False, raw=False) user_data_values = srsly.msgpack_loads(msg['user_data_values'])
user_data_values = msgpack.loads(msg['user_data_values'], raw=False)
for key, value in zip(user_data_keys, user_data_values): for key, value in zip(user_data_keys, user_data_values):
self.user_data[key] = value self.user_data[key] = value
@ -996,7 +995,7 @@ cdef class Doc:
if not self.has_extension(attr): if not self.has_extension(attr):
raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
value = self._.get(attr) value = self._.get(attr)
if not is_json_serializable(value): if not srsly.is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data['_'][attr] = value data['_'][attr] = value
return data return data

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
import os import os
import ujson
import pkg_resources import pkg_resources
import importlib import importlib
import regex as re import regex as re
@ -15,18 +14,13 @@ import functools
import cytoolz import cytoolz
import itertools import itertools
import numpy.random import numpy.random
import srsly
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, unicode_ from .compat import cupy, CudaStream, path2str, basestring_, unicode_
from .compat import import_file, json_dumps from .compat import import_file
from .errors import Errors from .errors import Errors
# Import these directly from Thinc, so that we're sure we always have the
# same version.
from thinc.neural._classes.model import msgpack # noqa: F401
from thinc.neural._classes.model import msgpack_numpy # noqa: F401
LANGUAGES = {} LANGUAGES = {}
_data_path = Path(__file__).parent / "data" _data_path = Path(__file__).parent / "data"
@ -185,7 +179,7 @@ def get_model_meta(path):
meta_path = model_path / "meta.json" meta_path = model_path / "meta.json"
if not meta_path.is_file(): if not meta_path.is_file():
raise IOError(Errors.E053.format(path=meta_path)) raise IOError(Errors.E053.format(path=meta_path))
meta = read_json(meta_path) meta = srsly.read_json(meta_path)
for setting in ["lang", "name", "version"]: for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]: if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting)) raise ValueError(Errors.E054.format(setting=setting))
@ -529,74 +523,16 @@ def itershuffle(iterable, bufsize=1000):
raise StopIteration raise StopIteration
def read_json(location):
"""Open and load JSON from file.
location (Path): Path to JSON file.
RETURNS (dict): Loaded JSON content.
"""
location = ensure_path(location)
with location.open("r", encoding="utf8") as f:
return ujson.load(f)
def write_json(file_path, contents):
"""Create a .json file and dump contents.
file_path (unicode / Path): The path to the output file.
contents: The JSON-serializable contents to output.
"""
with Path(file_path).open("w", encoding="utf8") as f:
f.write(json_dumps(contents))
def read_jsonl(file_path):
"""Read a .jsonl file and yield its contents line by line.
file_path (unicode / Path): The file path.
YIELDS: The loaded JSON contents of each line.
"""
with Path(file_path).open("r", encoding="utf8") as f:
for line in f:
try: # hack to handle broken jsonl
yield ujson.loads(line.strip())
except ValueError:
continue
def write_jsonl(file_path, lines):
"""Create a .jsonl file and dump contents.
file_path (unicode / Path): The path to the output file.
lines (list): The JSON-serializable contents of each line.
"""
data = [json_dumps(line) for line in lines]
with Path(file_path).open("w", encoding="utf-8") as f:
f.write("\n".join(data))
def is_json_serializable(obj):
"""Check if a Python object is JSON-serializable."""
if hasattr(obj, "__call__"):
# Check this separately here to prevent infinite recursions
return False
try:
ujson.dumps(obj)
return True
except TypeError:
return False
def to_bytes(getters, exclude): def to_bytes(getters, exclude):
serialized = OrderedDict() serialized = OrderedDict()
for key, getter in getters.items(): for key, getter in getters.items():
if key not in exclude: if key not in exclude:
serialized[key] = getter() serialized[key] = getter()
return msgpack.dumps(serialized, use_bin_type=True) return srsly.msgpack_dumps(serialized)
def from_bytes(bytes_data, setters, exclude): def from_bytes(bytes_data, setters, exclude):
msg = msgpack.loads(bytes_data, raw=False) msg = srsly.msgpack_loads(bytes_data)
for key, setter in setters.items(): for key, setter in setters.items():
if key not in exclude and key in msg: if key not in exclude and key in msg:
setter(msg[key]) setter(msg[key])

View File

@ -4,9 +4,7 @@ from __future__ import unicode_literals
import functools import functools
import numpy import numpy
from collections import OrderedDict from collections import OrderedDict
import srsly
from .util import msgpack
from .util import msgpack_numpy
cimport numpy as np cimport numpy as np
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
@ -353,7 +351,7 @@ cdef class Vectors:
save_array = lambda arr, file_: xp.save(file_, arr) save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict(( serializers = OrderedDict((
('vectors', lambda p: save_array(self.data, p.open('wb'))), ('vectors', lambda p: save_array(self.data, p.open('wb'))),
('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb'))) ('key2row', lambda p: srsly.write_msgpack(p, self.key2row))
)) ))
return util.to_disk(path, serializers, exclude) return util.to_disk(path, serializers, exclude)
@ -366,8 +364,7 @@ cdef class Vectors:
""" """
def load_key2row(path): def load_key2row(path):
if path.exists(): if path.exists():
with path.open('rb') as file_: self.key2row = srsly.read_msgpack(path)
self.key2row = msgpack.load(file_)
for key, row in self.key2row.items(): for key, row in self.key2row.items():
if self._unset.count(row): if self._unset.count(row):
self._unset.erase(self._unset.find(row)) self._unset.erase(self._unset.find(row))
@ -401,9 +398,9 @@ cdef class Vectors:
if hasattr(self.data, 'to_bytes'): if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes() return self.data.to_bytes()
else: else:
return msgpack.dumps(self.data) return srsly.msgpack_dumps(self.data)
serializers = OrderedDict(( serializers = OrderedDict((
('key2row', lambda: msgpack.dumps(self.key2row)), ('key2row', lambda: srsly.msgpack_dumps(self.key2row)),
('vectors', serialize_weights) ('vectors', serialize_weights)
)) ))
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
@ -419,10 +416,10 @@ cdef class Vectors:
if hasattr(self.data, 'from_bytes'): if hasattr(self.data, 'from_bytes'):
self.data.from_bytes() self.data.from_bytes()
else: else:
self.data = msgpack.loads(b) self.data = srsly.msgpack_loads(b)
deserializers = OrderedDict(( deserializers = OrderedDict((
('key2row', lambda b: self.key2row.update(msgpack.loads(b))), ('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))),
('vectors', deserialize_weights) ('vectors', deserialize_weights)
)) ))
util.from_bytes(data, deserializers, exclude) util.from_bytes(data, deserializers, exclude)

View File

@ -9,10 +9,9 @@ p
| underscore, e.g. #[code unicode_]. | underscore, e.g. #[code unicode_].
+aside-code("Example"). +aside-code("Example").
from spacy.compat import unicode_, json_dumps from spacy.compat import unicode_
compatible_unicode = unicode_('hello world') compatible_unicode = unicode_('hello world')
compatible_json = json_dumps({'key': 'value'})
+table(["Name", "Python 2", "Python 3"]) +table(["Name", "Python 2", "Python 3"])
+row +row
@ -35,11 +34,6 @@ p
+cell #[code raw_input] +cell #[code raw_input]
+cell #[code input] +cell #[code input]
+row
+cell #[code compat.json_dumps]
+cell #[code ujson.dumps] with #[code .decode('utf8')]
+cell #[code ujson.dumps]
+row +row
+cell #[code compat.path2str] +cell #[code compat.path2str]
+cell #[code str(path)] with #[code .decode('utf8')] +cell #[code str(path)] with #[code .decode('utf8')]