diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cb10a1718..22cad91d6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()` helper function. ```python -from .compat import unicode_, json_dumps, is_config +from .compat import unicode_, is_config compatible_unicode = unicode_('hello world') -compatible_json = json_dumps({'key': 'value'}) if is_config(windows=True, python2=True): print("You are using Python 2 on Windows.") ``` diff --git a/bin/load_reddit.py b/bin/load_reddit.py index 5affa0fb5..507ce58c2 100644 --- a/bin/load_reddit.py +++ b/bin/load_reddit.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import bz2 import regex as re -import ujson +import srsly import sys import random import datetime @@ -44,7 +44,7 @@ class Reddit(object): line = line.strip() if not line: continue - comment = ujson.loads(line) + comment = srsly.json_loads(line) if self.is_valid(comment): text = self.strip_tags(comment["body"]) yield {"text": text} @@ -75,7 +75,7 @@ class Reddit(object): def main(path): reddit = Reddit(path) for comment in reddit: - print(ujson.dumps(comment)) + print(srsly.json_dumps(comment)) if __name__ == "__main__": diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py index b49cb88e8..3cdc9cc86 100644 --- a/examples/information_extraction/phrase_matcher.py +++ b/examples/information_extraction/phrase_matcher.py @@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division from bz2 import BZ2File import time import plac -import ujson +import json from spacy.matcher import PhraseMatcher import spacy @@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"): def read_gazetteer(tokenizer, loc, n=-1): for i, line in enumerate(open(loc)): - data = ujson.loads(line.strip()) + data = json.loads(line.strip()) phrase = tokenizer(data["text"]) for w in phrase: _ = tokenizer.vocab[w.text] @@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1): def read_text(bz2_loc, n=10000): with BZ2File(bz2_loc) as file_: for i, line in enumerate(file_): - data = ujson.loads(line) + data = json.loads(line) yield data["body"] if i >= n: break diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py index 7cd66a20c..14df8e3d4 100644 --- a/examples/keras_parikh_entailment/__main__.py +++ b/examples/keras_parikh_entailment/__main__.py @@ -1,5 +1,5 @@ import numpy as np -import ujson as json +import json from keras.utils import to_categorical import plac import sys diff --git a/examples/notebooks/Decompositional Attention.ipynb b/examples/notebooks/Decompositional Attention.ipynb index b61dc9df7..8baaf7d33 100644 --- a/examples/notebooks/Decompositional Attention.ipynb +++ b/examples/notebooks/Decompositional Attention.ipynb @@ -77,7 +77,7 @@ } ], "source": [ - "import ujson as json\n", + "import json\n", "from keras.utils import to_categorical\n", "\n", "LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n", diff --git a/requirements.txt b/requirements.txt index 3d495277e..d68ac7a31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 cytoolz>=0.9.0,<0.10.0 plac<1.0.0,>=0.9.6 -ujson>=1.35 dill>=0.2,<0.3 regex==2018.01.10 requests>=2.13.0,<3.0.0 jsonschema>=2.6.0,<3.0.0 wasabi>=0.0.8,<1.1.0 +srsly>=0.0.4,<1.1.0 pathlib==1.0.1; python_version < "3.4" # Development dependencies cython>=0.25 
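Note for reviewers: the dependency swap in requirements.txt above drives every change in this patch; ujson and msgpack call sites are replaced one-to-one with srsly equivalents. Below is a minimal sketch of that mapping, assuming the srsly>=0.0.4 API pinned above; the payload and file path are hypothetical, for illustration only.

    import srsly

    data = {"text": "hello world"}  # hypothetical payload

    # compat.json_dumps(data) -> srsly.json_dumps(data): always returns a
    # decoded unicode string on both Python 2 and 3, so the compat shim goes away
    json_str = srsly.json_dumps(data, indent=2)

    # ujson.loads(line) -> srsly.json_loads(line)
    assert srsly.json_loads(json_str) == data

    # msgpack.dumps(obj, use_bin_type=True) -> srsly.msgpack_dumps(obj)
    # msgpack.loads(b, raw=False)           -> srsly.msgpack_loads(b)
    # (the bin-type/raw flags are handled inside srsly and vanish from call sites)
    assert srsly.msgpack_loads(srsly.msgpack_dumps(data)) == data

    # util.read_json/write_json and util.read_jsonl/write_jsonl map to srsly
    # functions of the same name; the special path "-" means stdin/stdout,
    # which is how convert.py and pretrain.py below stream without print loops
    srsly.write_json("-", data)                    # prints JSON to stdout
    srsly.write_jsonl("/tmp/lines.jsonl", [data])  # hypothetical path
    assert list(srsly.read_jsonl("/tmp/lines.jsonl")) == [data]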
diff --git a/setup.py b/setup.py index 05d074f28..99ae655bb 100755 --- a/setup.py +++ b/setup.py @@ -203,12 +203,12 @@ def setup_package(): "thinc==7.0.0.dev4", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", - "ujson>=1.35", "regex==2018.01.10", "dill>=0.2,<0.3", "requests>=2.13.0,<3.0.0", "jsonschema>=2.6.0,<3.0.0", "wasabi>=0.0.8,<1.1.0", + "srsly>=0.0.4,<1.1.0", 'pathlib==1.0.1; python_version < "3.4"', ], setup_requires=["wheel"], diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index b41b22036..a2c1d20e0 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -4,9 +4,9 @@ from __future__ import unicode_literals import plac from pathlib import Path from wasabi import Printer +import srsly -from ..util import write_jsonl, write_json -from ..compat import json_dumps, path2str +from ..compat import path2str from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json from .converters import ner_jsonl2json from ._messages import Messages @@ -77,9 +77,9 @@ def convert( suffix = ".{}".format(file_type) output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix) if file_type == "json": - write_json(output_file, data) + srsly.write_json(output_file, data) elif file_type == "jsonl": - write_jsonl(output_file, data) + srsly.write_jsonl(output_file, data) msg.good( Messages.M032.format(name=path2str(output_file)), Messages.M033.format(n_docs=len(data)), @@ -87,7 +87,6 @@ def convert( else: # Print to stdout if file_type == "json": - print(json_dumps(data)) + srsly.write_json("-", data) elif file_type == "jsonl": - for line in data: - print(json_dumps(line)) + srsly.write_jsonl("-", data) diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py index 26fdca302..a281db86d 100644 --- a/spacy/cli/converters/jsonl2json.py +++ b/spacy/cli/converters/jsonl2json.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -import ujson +import srsly from ...util import get_lang_class from .._messages import Messages @@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False): if lang is None: raise ValueError(Messages.M054) json_docs = [] - input_tuples = [ujson.loads(line) for line in input_data] + input_tuples = [srsly.json_loads(line) for line in input_data] nlp = get_lang_class(lang)() for i, (raw_text, ents) in enumerate(input_tuples): doc = nlp.make_doc(raw_text) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 5bf602828..06f648124 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -5,10 +5,11 @@ from pathlib import Path from collections import Counter import plac import sys +import srsly from wasabi import Printer, MESSAGES from ..gold import GoldCorpus, read_json_object -from ..util import load_model, get_lang_class, read_json, read_jsonl +from ..util import load_model, get_lang_class # from .schemas import get_schema, validate_json from ._messages import Messages @@ -320,11 +321,11 @@ def debug_data( def _load_file(file_path, msg): file_name = file_path.parts[-1] if file_path.suffix == ".json": - data = read_json(file_path) + data = srsly.read_json(file_path) msg.good("Loaded {}".format(file_name)) return data elif file_path.suffix == ".jsonl": - data = read_jsonl(file_path) + data = srsly.read_jsonl(file_path) msg.good("Loaded {}".format(file_name)) return data msg.fail( diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 5df9ddadb..7339faaab 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -5,6 +5,7 @@ import plac 
import platform from pathlib import Path from wasabi import Printer +import srsly from ._messages import Messages from ..compat import path2str, basestring_, unicode_ @@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False): meta_path = model_path / "meta.json" if not meta_path.is_file(): msg.fail(Messages.M020, meta_path, exits=1) - meta = util.read_json(meta_path) + meta = srsly.read_json(meta_path) if model_path.resolve() != model_path: meta["link"] = path2str(model_path) meta["source"] = path2str(model_path.resolve()) diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 4b3406ab0..8dc2a8cf2 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -11,12 +11,13 @@ from preshed.counter import PreshCounter import tarfile import gzip import zipfile +import srsly from wasabi import Printer from ._messages import Messages from ..vectors import Vectors from ..errors import Errors, Warnings, user_warning -from ..util import ensure_path, get_lang_class, read_jsonl +from ..util import ensure_path, get_lang_class try: import ftfy @@ -59,7 +60,7 @@ def init_model( settings.append("-c") msg.warn(Messages.M063, Messages.M064) jsonl_loc = ensure_path(jsonl_loc) - lex_attrs = read_jsonl(jsonl_loc) + lex_attrs = srsly.read_jsonl(jsonl_loc) else: clusters_loc = ensure_path(clusters_loc) freqs_loc = ensure_path(freqs_loc) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 84288ac72..916dbc1f2 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -5,9 +5,10 @@ import plac import shutil from pathlib import Path from wasabi import Printer, get_raw_input +import srsly from ._messages import Messages -from ..compat import path2str, json_dumps +from ..compat import path2str from .. import util from .. import about @@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals meta_path = meta_path or input_path / "meta.json" if meta_path.is_file(): - meta = util.read_json(meta_path) + meta = srsly.read_json(meta_path) if not create_meta: # only print if user doesn't want to overwrite msg.good(Messages.M041, meta_path) else: @@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals ) Path.mkdir(package_path, parents=True) shutil.copytree(path2str(input_path), path2str(package_path / model_name_v)) - create_file(main_path / "meta.json", json_dumps(meta)) + create_file(main_path / "meta.json", srsly.json_dumps(meta)) create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(package_path / "__init__.py", TEMPLATE_INIT) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 20d097047..70cab05c2 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,8 +5,6 @@ import plac import random import numpy import time -import ujson -import sys from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout @@ -14,10 +12,10 @@ from thinc.api import wrap from thinc.misc import LayerNorm as LN from thinc.neural.util import prefer_gpu from wasabi import Printer +import srsly from ..tokens import Doc from ..attrs import ID, HEAD -from ..compat import json_dumps from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer from .. 
import util @@ -72,7 +70,7 @@ def pretrain( if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory") - util.write_json(output_dir / "config.json", config) + srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin @@ -81,12 +79,12 @@ def pretrain( if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): - texts = list(util.read_jsonl(texts_loc)) + texts = list(srsly.read_jsonl(texts_loc)) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") - texts = stream_texts() + texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) @@ -130,18 +128,13 @@ def pretrain( "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: - file_.write(json_dumps(log) + "\n") + file_.write(srsly.json_dumps(log) + "\n") tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) -def stream_texts(): - for line in sys.stdin: - yield ujson.loads(line) - - def make_update(model, docs, optimizer, drop=0.0): """Perform an update over a single batch of documents. diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index 506e55871..439ef79a1 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function import plac from pathlib import Path -import ujson +import srsly import cProfile import pstats import sys @@ -64,6 +64,6 @@ def _read_inputs(loc, msg): msg.info("Using data from {}".format(input_path.parts[-1])) file_ = input_path.open() for line in file_: - data = ujson.loads(line) + data = srsly.json_loads(line) text = data["text"] yield text diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py index f478c7a9a..c502c6493 100644 --- a/spacy/cli/schemas/__init__.py +++ b/spacy/cli/schemas/__init__.py @@ -3,9 +3,9 @@ from __future__ import unicode_literals from pathlib import Path from jsonschema import Draft4Validator +import srsly from ...errors import Errors -from ...util import read_json SCHEMAS = {} @@ -25,7 +25,7 @@ def get_schema(name): schema_path = Path(__file__).parent / "{}.json".format(name) if not schema_path.exists(): raise ValueError(Errors.E104.format(name=name)) - schema = read_json(schema_path) + schema = srsly.read_json(schema_path) # TODO: replace with (stable) Draft6Validator, if available validator = Draft4Validator(schema) validator.check_schema(schema) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 9dec5d4bd..8d322e32d 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -7,6 +7,7 @@ import tqdm from thinc.neural._classes.model import Model from timeit import default_timer as timer import shutil +import srsly from wasabi import Printer from ._messages import Messages @@ -111,7 +112,7 @@ def train( msg.fail(Messages.M051, dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail(Messages.M020, meta_path, exits=1) - meta = util.read_json(meta_path) if meta_path else {} + meta = srsly.read_json(meta_path) if meta_path else {} if not isinstance(meta, dict): msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1) if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]: @@ -226,7 +227,7 @@ def train( end_time = timer() cpu_wps = 
nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" - util.write_json(acc_loc, scorer.scores) + srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang @@ -242,7 +243,7 @@ def train( meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta_loc = output_path / ("model%d" % i) / "meta.json" - util.write_json(meta_loc, meta) + srsly.write_json(meta_loc, meta) util.set_env_log(verbose) @@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components): for component, best_component_src in bests.items(): shutil.rmtree(best_dest / component) shutil.copytree(best_component_src / component, best_dest / component) - accs = util.read_json(best_component_src / "accuracy.json") + accs = srsly.read_json(best_component_src / "accuracy.json") for metric in _get_metrics(component): meta["accuracy"][metric] = accs[metric] - util.write_json(best_dest / "meta.json", meta) + srsly.write_json(best_dest / "meta.json", meta) def _find_best(experiment_dir, component): accuracies = [] for epoch_model in experiment_dir.iterdir(): if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": - accs = util.read_json(epoch_model / "accuracy.json") + accs = srsly.read_json(epoch_model / "accuracy.json") scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] accuracies.append((scores, epoch_model)) if accuracies: diff --git a/spacy/cli/ud/ud_run_test.py b/spacy/cli/ud/ud_run_test.py index f36df2f80..e3771fa92 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/spacy/cli/ud/ud_run_test.py @@ -9,7 +9,7 @@ import tqdm from pathlib import Path import re import sys -import json +import srsly import spacy import spacy.util @@ -44,7 +44,7 @@ from ...lang import ru # Data reading # ################ -space_re = re.compile("\s+") +space_re = re.compile(r"\s+") def split_text(text): @@ -332,8 +332,7 @@ def main(test_data_dir, experiment_dir, corpus): / corpus / "{section}-accuracy.json".format(section=section) ) - with open(acc_path, "w") as file_: - file_.write(json.dumps(accuracy, indent=2)) + srsly.write_json(acc_path, accuracy) if __name__ == "__main__": diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py index caeaf5ca9..4b5581972 100644 --- a/spacy/cli/validate.py +++ b/spacy/cli/validate.py @@ -5,11 +5,12 @@ import pkg_resources from pathlib import Path import sys import requests +import srsly from wasabi import Printer from ._messages import Messages from ..compat import path2str -from ..util import get_data_path, read_json +from ..util import get_data_path from .. 
import about @@ -84,7 +85,7 @@ def get_model_links(compat): meta_path = Path(model) / "meta.json" if not meta_path.exists(): continue - meta = read_json(meta_path) + meta = srsly.read_json(meta_path) link = model.parts[-1] name = meta["lang"] + "_" + meta["name"] links[link] = { diff --git a/spacy/compat.py b/spacy/compat.py index f00e2c417..c1869b85f 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals import os import sys -import ujson import itertools from thinc.neural.util import copy_array @@ -54,9 +53,6 @@ if is_python2: unicode_ = unicode # noqa: F821 basestring_ = basestring # noqa: F821 input_ = raw_input # noqa: F821 - json_dumps = lambda data, indent=2: ujson.dumps( - data, indent=indent, escape_forward_slashes=False - ).decode("utf8") path2str = lambda path: str(path).decode("utf8") elif is_python3: @@ -64,9 +60,6 @@ elif is_python3: unicode_ = str basestring_ = str input_ = input - json_dumps = lambda data, indent=2: ujson.dumps( - data, indent=indent, escape_forward_slashes=False - ) path2str = lambda path: str(path) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 26ff9753a..9c0c00652 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -10,10 +10,7 @@ import numpy import tempfile import shutil from pathlib import Path -import msgpack -import json - -import ujson +import srsly from . import _align from .syntax import nonproj @@ -21,7 +18,6 @@ from .tokens import Doc from .errors import Errors from . import util from .util import minibatch, itershuffle -from .compat import json_dumps from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek @@ -123,12 +119,11 @@ class GoldCorpus(object): directory.mkdir() n = 0 for i, doc_tuple in enumerate(doc_tuples): - with open(directory / '{}.msg'.format(i), 'wb') as file_: - msgpack.dump([doc_tuple], file_, use_bin_type=True) + srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple]) n += len(doc_tuple[1]) if limit and n >= limit: break - + @staticmethod def walk_corpus(path): path = util.ensure_path(path) @@ -157,8 +152,7 @@ class GoldCorpus(object): if loc.parts[-1].endswith('json'): gold_tuples = read_json_file(loc) elif loc.parts[-1].endswith('msg'): - with loc.open('rb') as file_: - gold_tuples = msgpack.load(file_, raw=False) + gold_tuples = srsly.read_msgpack(loc) else: msg = "Cannot read from file: %s. 
Supported formats: .json, .msg" raise ValueError(msg % loc) @@ -378,7 +372,7 @@ def _json_iterate(loc): if square_depth == 1 and curly_depth == 0: py_str = py_raw[start : i+1].decode('utf8') try: - yield json.loads(py_str) + yield srsly.json_loads(py_str) except Exception: print(py_str) raise diff --git a/spacy/language.py b/spacy/language.py index f8afe84f7..4c3bfd5c8 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals import random -import ujson import itertools import weakref import functools @@ -10,6 +9,7 @@ from collections import OrderedDict from contextlib import contextmanager from copy import copy from thinc.neural import Model +import srsly from .tokenizer import Tokenizer from .vocab import Vocab @@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler -from .compat import json_dumps, izip, basestring_ +from .compat import izip, basestring_ from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer @@ -640,7 +640,7 @@ class Language(object): serializers = OrderedDict( ( ("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)), - ("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))), + ("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))), ) ) for name, proc in self.pipeline: @@ -671,7 +671,7 @@ class Language(object): path = util.ensure_path(path) deserializers = OrderedDict( ( - ("meta.json", lambda p: self.meta.update(util.read_json(p))), + ("meta.json", lambda p: self.meta.update(srsly.read_json(p))), ( "vocab", lambda p: ( @@ -705,7 +705,7 @@ class Language(object): ( ("vocab", lambda: self.vocab.to_bytes()), ("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)), - ("meta", lambda: json_dumps(self.meta)), + ("meta", lambda: srsly.json_dumps(self.meta)), ) ) for i, (name, proc) in enumerate(self.pipeline): @@ -725,7 +725,7 @@ class Language(object): """ deserializers = OrderedDict( ( - ("meta", lambda b: self.meta.update(ujson.loads(b))), + ("meta", lambda b: self.meta.update(srsly.json_loads(b))), ( "vocab", lambda b: ( diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 3a09af644..c3b8f5fae 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -7,10 +7,7 @@ import numpy cimport numpy as np import cytoolz from collections import OrderedDict, defaultdict -import ujson - -from .util import msgpack -from .util import msgpack_numpy +import srsly from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -27,7 +24,6 @@ from .syntax.arc_eager cimport ArcEager from .morphology cimport Morphology from .vocab cimport Vocab from .syntax import nonproj -from .compat import json_dumps from .matcher import Matcher from .matcher import Matcher, PhraseMatcher @@ -38,7 +34,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model from ._ml import link_vectors_to_models, zero_init, flatten from ._ml import create_default_optimizer from .errors import Errors, TempErrors -from .compat import json_dumps, basestring_ +from .compat import basestring_ from . import util @@ -235,7 +231,7 @@ class EntityRuler(object): **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. 
""" - patterns = msgpack.loads(patterns_bytes, raw=False) + patterns = srsly.msgpack_loads(patterns_bytes) self.add_patterns(patterns) return self @@ -244,7 +240,7 @@ class EntityRuler(object): RETURNS (bytes): The serialized patterns. """ - return msgpack.dumps(self.patterns, use_bin_type=True) + return srsly.msgpack_dumps(self.patterns) def from_disk(self, path, **kwargs): """Load the entity ruler from a file. Expects a file containing @@ -256,7 +252,7 @@ class EntityRuler(object): """ path = util.ensure_path(path) path = path.with_suffix('.jsonl') - patterns = util.read_jsonl(path) + patterns = srsly.read_jsonl(path) self.add_patterns(patterns) return self @@ -270,8 +266,7 @@ class EntityRuler(object): """ path = util.ensure_path(path) path = path.with_suffix('.jsonl') - data = [json_dumps(line, indent=0) for line in self.patterns] - path.open('w').write('\n'.join(data)) + srsly.write_jsonl(path, self.patterns) class Pipe(object): @@ -368,7 +363,7 @@ class Pipe(object): def to_bytes(self, **exclude): """Serialize the pipe to a bytestring.""" serialize = OrderedDict() - serialize['cfg'] = lambda: json_dumps(self.cfg) + serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) if self.model in (True, False, None): serialize['model'] = lambda: self.model else: @@ -387,7 +382,7 @@ class Pipe(object): self.model.from_bytes(b) deserialize = OrderedDict(( - ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('vocab', lambda b: self.vocab.from_bytes(b)), ('model', load_model), )) @@ -397,7 +392,7 @@ class Pipe(object): def to_disk(self, path, **exclude): """Serialize the pipe to disk.""" serialize = OrderedDict() - serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg)) + serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg) serialize['vocab'] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) @@ -424,8 +419,7 @@ class Pipe(object): def _load_cfg(path): if path.exists(): - with path.open() as file_: - return ujson.load(file_) + return srsly.read_json(path) else: return {} @@ -745,10 +739,9 @@ class Tagger(Pipe): else: serialize['model'] = self.model.to_bytes serialize['vocab'] = self.vocab.to_bytes - serialize['cfg'] = lambda: ujson.dumps(self.cfg) + serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize['tag_map'] = lambda: msgpack.dumps( - tag_map, use_bin_type=True) + serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): @@ -766,7 +759,7 @@ class Tagger(Pipe): self.model.from_bytes(b) def load_tag_map(b): - tag_map = msgpack.loads(b, raw=False) + tag_map = srsly.msgpack_loads(b) self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, lemmatizer=self.vocab.morphology.lemmatizer, @@ -775,7 +768,7 @@ class Tagger(Pipe): deserialize = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('tag_map', load_tag_map), - ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('model', lambda b: load_model(b)), )) util.from_bytes(bytes_data, deserialize, exclude) @@ -785,10 +778,9 @@ class Tagger(Pipe): tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) serialize = OrderedDict(( ('vocab', lambda p: self.vocab.to_disk(p)), - ('tag_map', lambda p: 
p.open('wb').write(msgpack.dumps( - tag_map, use_bin_type=True))), + ('tag_map', lambda p: srsly.write_msgpack(p, tag_map)), ('model', lambda p: p.open('wb').write(self.model.to_bytes())), - ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg))) + ('cfg', lambda p: srsly.write_json(p, self.cfg)) )) util.to_disk(path, serialize, exclude) @@ -803,8 +795,7 @@ class Tagger(Pipe): self.model.from_bytes(file_.read()) def load_tag_map(p): - with p.open('rb') as file_: - tag_map = msgpack.loads(file_.read(), raw=False) + tag_map = srsly.read_msgpack(p) self.vocab.morphology = Morphology( self.vocab.strings, tag_map=tag_map, lemmatizer=self.vocab.morphology.lemmatizer, diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b54e3f155..2c8d5fcb4 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -7,12 +7,11 @@ from libc.string cimport memcpy from libcpp.set cimport set from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, hash32 -import ujson +import srsly from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t -from .compat import json_dumps from .errors import Errors from . import util @@ -197,8 +196,7 @@ cdef class StringStore: """ path = util.ensure_path(path) strings = list(self) - with path.open('w') as file_: - file_.write(json_dumps(strings)) + srsly.write_json(path, strings) def from_disk(self, path): """Loads state from a directory. Modifies the object in place and @@ -209,8 +207,7 @@ cdef class StringStore: RETURNS (StringStore): The modified `StringStore` object. """ path = util.ensure_path(path) - with path.open('r') as file_: - strings = ujson.load(file_) + strings = srsly.read_json(path) prev = list(self) self._reset_and_load(strings) for word in prev: @@ -223,7 +220,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `StringStore` object. """ - return json_dumps(list(self)) + return srsly.json_dumps(list(self)) def from_bytes(self, bytes_data, **exclude): """Load state from a binary string. @@ -232,7 +229,7 @@ cdef class StringStore: **exclude: Named attributes to prevent from being loaded. RETURNS (StringStore): The `StringStore` object. 
""" - strings = ujson.loads(bytes_data) + strings = srsly.json_loads(bytes_data) prev = list(self) self._reset_and_load(strings) for word in prev: diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index cfaa8ddf0..9796193f6 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -5,8 +5,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict -import ujson -import json import numpy cimport cython.parallel import cytoolz @@ -29,7 +27,7 @@ cimport blis.cy from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import link_vectors_to_models, create_default_optimizer -from ..compat import json_dumps, copy_array +from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse from ..errors import Errors, TempErrors @@ -119,7 +117,7 @@ cdef void predict_states(ActivationsC* A, StateC** states, VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes) - + cdef void sum_state_features(float* output, const float* cached, const int* token_ids, int B, int F, int O) nogil: cdef int idx, b, f, i @@ -165,7 +163,7 @@ cdef void cpu_log_loss(float* d_scores, else: d_scores[i] = exp(scores[i]-max_) / Z - + cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, const int* is_valid, int n) nogil: # Find minimum cost @@ -218,15 +216,15 @@ class ParserModel(Model): def begin_training(self, X, y=None): self.lower.begin_training(X, y=y) - + @property def tok2vec(self): return self._layers[0] - + @property def lower(self): return self._layers[1] - + @property def upper(self): return self._layers[2] @@ -405,4 +403,3 @@ cdef class precompute_hiddens: else: return self.ops.backprop_maxout(d_best, mask, self.nP) return state_vector, backprop_nonlinearity - diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0663c1289..186c5c16c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -5,8 +5,6 @@ from __future__ import unicode_literals, print_function from collections import OrderedDict -import ujson -import json import numpy cimport cython.parallel import cytoolz @@ -27,6 +25,7 @@ from thinc.misc import LayerNorm from thinc.neural.ops import CupyOps from thinc.neural.util import get_array_module from thinc.linalg cimport Vec, VecVec +import srsly from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss @@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes from ._parser_model import ParserModel from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten from .._ml import link_vectors_to_models, create_default_optimizer -from ..compat import json_dumps, copy_array +from ..compat import copy_array from ..tokens.doc cimport Doc from ..gold cimport GoldParse from ..errors import Errors, TempErrors @@ -539,7 +538,7 @@ cdef class Parser: 'model': lambda p: (self.model.to_disk(p) if self.model is not True else True), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), - 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) + 'cfg': lambda p: srsly.write_json(p, self.cfg) } util.to_disk(path, serializers, exclude) @@ -547,7 +546,7 @@ cdef class Parser: deserializers = { 'vocab': lambda p: self.vocab.from_disk(p), 'moves': lambda p: self.moves.from_disk(p, strings=False), - 'cfg': lambda p: self.cfg.update(util.read_json(p)), + 'cfg': lambda p: 
self.cfg.update(srsly.read_json(p)), 'model': lambda p: None } util.from_disk(path, deserializers, exclude) @@ -568,7 +567,7 @@ cdef class Parser: ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), - ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True)) + ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) )) return util.to_bytes(serializers, exclude) @@ -576,7 +575,7 @@ cdef class Parser: deserializers = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), - ('cfg', lambda b: self.cfg.update(json.loads(b))), + ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), ('model', lambda b: None) )) msg = util.from_bytes(bytes_data, deserializers, exclude) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index fc84fc23a..6d64a4fb4 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -7,14 +7,13 @@ from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam from collections import OrderedDict, Counter -import ujson +import srsly from . cimport _beam_utils from ..tokens.doc cimport Doc from ..structs cimport TokenC from .stateclass cimport StateClass from ..typedefs cimport attr_t -from ..compat import json_dumps from ..errors import Errors from .. import util @@ -153,13 +152,13 @@ cdef class TransitionSystem: # Make sure we take a copy here, and that we get a Counter self.labels[action] = Counter() # Have to be careful here: Sorting must be stable, or our model - # won't be read back in correctly. + # won't be read back in correctly. 
+ # won't be read back in correctly.
sorted_labels = [(f, L) for L, f in label_freqs.items()] sorted_labels.sort() sorted_labels.reverse() for freq, label_str in sorted_labels: self.add_action(int(action), label_str) - self.labels[action][label_str] = freq + self.labels[action][label_str] = freq def add_action(self, int action, label_name): cdef attr_t label_id @@ -204,7 +203,7 @@ cdef class TransitionSystem: def to_bytes(self, **exclude): transitions = [] serializers = { - 'moves': lambda: json_dumps(self.labels), + 'moves': lambda: srsly.json_dumps(self.labels), 'strings': lambda: self.strings.to_bytes() } return util.to_bytes(serializers, exclude) @@ -212,7 +211,7 @@ cdef class TransitionSystem: def from_bytes(self, bytes_data, **exclude): labels = {} deserializers = { - 'moves': lambda b: labels.update(ujson.loads(b)), + 'moves': lambda b: labels.update(srsly.json_loads(b)), 'strings': lambda b: self.strings.from_bytes(b) } msg = util.from_bytes(bytes_data, deserializers, exclude) diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 80fbb5b1c..175480fe7 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -5,7 +5,7 @@ import numpy import tempfile import shutil import contextlib -import msgpack +import srsly from pathlib import Path from spacy.tokens import Doc, Span from spacy.attrs import POS, HEAD, DEP @@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2): def assert_packed_msg_equal(b1, b2): """Assert that two packed msgpack messages are equal.""" - msg1 = msgpack.loads(b1, encoding="utf8") - msg2 = msgpack.loads(b2, encoding="utf8") + msg1 = srsly.msgpack_loads(b1) + msg2 = srsly.msgpack_loads(b2) assert sorted(msg1.keys()) == sorted(msg2.keys()) for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 683a3974f..5c3bf9c70 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,8 +1,8 @@ from __future__ import unicode_literals import numpy -import msgpack import gzip +import srsly from thinc.neural.ops import NumpyOps from ..compat import copy_reg @@ -74,11 +74,11 @@ class Binder(object): "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), } - return gzip.compress(msgpack.dumps(msg)) + return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): """Deserialize the binder's annotations from a byte string.""" - msg = msgpack.loads(gzip.decompress(string)) + msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) lengths = numpy.fromstring(msg["lengths"], dtype="int32") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b845b4eb7..cd2428d79 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -10,8 +10,8 @@ import numpy import numpy.linalg import struct import dill -import msgpack from thinc.neural.util import get_array_module, copy_array +import srsly from libc.string cimport memcpy, memset from libc.math cimport sqrt @@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport ENT_TYPE, SENT_START from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t -from ..util import normalize_slice, is_json_serializable +from ..util import normalize_slice from ..compat import is_config, copy_reg, pickle, basestring_ from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, 
Warnings @@ -807,8 +807,8 @@ cdef class Doc: } if 'user_data' not in exclude and self.user_data: user_data_keys, user_data_values = list(zip(*self.user_data.items())) - serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys) - serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values) + serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys) + serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values) return util.to_bytes(serializers, exclude) @@ -836,9 +836,8 @@ cdef class Doc: # keys, we must have tuples. In values we just have to hope # users don't mind getting a list instead of a tuple. if 'user_data' not in exclude and 'user_data_keys' in msg: - user_data_keys = msgpack.loads(msg['user_data_keys'], - use_list=False, raw=False) - user_data_values = msgpack.loads(msg['user_data_values'], raw=False) + user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False) + user_data_values = srsly.msgpack_loads(msg['user_data_values']) for key, value in zip(user_data_keys, user_data_values): self.user_data[key] = value @@ -996,7 +995,7 @@ cdef class Doc: if not self.has_extension(attr): raise ValueError(Errors.E106.format(attr=attr, opts=underscore)) value = self._.get(attr) - if not is_json_serializable(value): + if not srsly.is_json_serializable(value): raise ValueError(Errors.E107.format(attr=attr, value=repr(value))) data['_'][attr] = value return data diff --git a/spacy/util.py b/spacy/util.py index d8c82da89..7e700be03 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals, print_function import os -import ujson import pkg_resources import importlib import regex as re @@ -15,18 +14,13 @@ import functools import cytoolz import itertools import numpy.random - +import srsly from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file, json_dumps +from .compat import import_file from .errors import Errors -# Import these directly from Thinc, so that we're sure we always have the -# same version. -from thinc.neural._classes.model import msgpack # noqa: F401 -from thinc.neural._classes.model import msgpack_numpy # noqa: F401 - LANGUAGES = {} _data_path = Path(__file__).parent / "data" @@ -185,7 +179,7 @@ def get_model_meta(path): meta_path = model_path / "meta.json" if not meta_path.is_file(): raise IOError(Errors.E053.format(path=meta_path)) - meta = read_json(meta_path) + meta = srsly.read_json(meta_path) for setting in ["lang", "name", "version"]: if setting not in meta or not meta[setting]: raise ValueError(Errors.E054.format(setting=setting)) @@ -529,74 +523,16 @@ def itershuffle(iterable, bufsize=1000): raise StopIteration -def read_json(location): - """Open and load JSON from file. - - location (Path): Path to JSON file. - RETURNS (dict): Loaded JSON content. - """ - location = ensure_path(location) - with location.open("r", encoding="utf8") as f: - return ujson.load(f) - - -def write_json(file_path, contents): - """Create a .json file and dump contents. - - file_path (unicode / Path): The path to the output file. - contents: The JSON-serializable contents to output. - """ - with Path(file_path).open("w", encoding="utf8") as f: - f.write(json_dumps(contents)) - - -def read_jsonl(file_path): - """Read a .jsonl file and yield its contents line by line. - - file_path (unicode / Path): The file path. - YIELDS: The loaded JSON contents of each line. 
- """ - with Path(file_path).open("r", encoding="utf8") as f: - for line in f: - try: # hack to handle broken jsonl - yield ujson.loads(line.strip()) - except ValueError: - continue - - -def write_jsonl(file_path, lines): - """Create a .jsonl file and dump contents. - - file_path (unicode / Path): The path to the output file. - lines (list): The JSON-serializable contents of each line. - """ - data = [json_dumps(line) for line in lines] - with Path(file_path).open("w", encoding="utf-8") as f: - f.write("\n".join(data)) - - -def is_json_serializable(obj): - """Check if a Python object is JSON-serializable.""" - if hasattr(obj, "__call__"): - # Check this separately here to prevent infinite recursions - return False - try: - ujson.dumps(obj) - return True - except TypeError: - return False - - def to_bytes(getters, exclude): serialized = OrderedDict() for key, getter in getters.items(): if key not in exclude: serialized[key] = getter() - return msgpack.dumps(serialized, use_bin_type=True) + return srsly.msgpack_dumps(serialized) def from_bytes(bytes_data, setters, exclude): - msg = msgpack.loads(bytes_data, raw=False) + msg = srsly.msgpack_loads(bytes_data) for key, setter in setters.items(): if key not in exclude and key in msg: setter(msg[key]) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 3e3268bfa..911eff08e 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -4,9 +4,7 @@ from __future__ import unicode_literals import functools import numpy from collections import OrderedDict - -from .util import msgpack -from .util import msgpack_numpy +import srsly cimport numpy as np from thinc.neural.util import get_array_module @@ -353,7 +351,7 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr) serializers = OrderedDict(( ('vectors', lambda p: save_array(self.data, p.open('wb'))), - ('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb'))) + ('key2row', lambda p: srsly.write_msgpack(p, self.key2row)) )) return util.to_disk(path, serializers, exclude) @@ -366,8 +364,7 @@ cdef class Vectors: """ def load_key2row(path): if path.exists(): - with path.open('rb') as file_: - self.key2row = msgpack.load(file_) + self.key2row = srsly.read_msgpack(path) for key, row in self.key2row.items(): if self._unset.count(row): self._unset.erase(self._unset.find(row)) @@ -401,9 +398,9 @@ cdef class Vectors: if hasattr(self.data, 'to_bytes'): return self.data.to_bytes() else: - return msgpack.dumps(self.data) + return srsly.msgpack_dumps(self.data) serializers = OrderedDict(( - ('key2row', lambda: msgpack.dumps(self.key2row)), + ('key2row', lambda: srsly.msgpack_dumps(self.key2row)), ('vectors', serialize_weights) )) return util.to_bytes(serializers, exclude) @@ -419,10 +416,10 @@ cdef class Vectors: if hasattr(self.data, 'from_bytes'): self.data.from_bytes() else: - self.data = msgpack.loads(b) + self.data = srsly.msgpack_loads(b) deserializers = OrderedDict(( - ('key2row', lambda b: self.key2row.update(msgpack.loads(b))), + ('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))), ('vectors', deserialize_weights) )) util.from_bytes(data, deserializers, exclude) diff --git a/website/api/_top-level/_compat.jade b/website/api/_top-level/_compat.jade index c9b023647..7de2f4102 100644 --- a/website/api/_top-level/_compat.jade +++ b/website/api/_top-level/_compat.jade @@ -9,10 +9,9 @@ p | underscore, e.e #[code unicode_]. +aside-code("Example"). 
- from spacy.compat import unicode_, json_dumps + from spacy.compat import unicode_ compatible_unicode = unicode_('hello world') - compatible_json = json_dumps({'key': 'value'}) +table(["Name", "Python 2", "Python 3"]) +row @@ -35,11 +34,6 @@ p +cell #[code raw_input] +cell #[code input] - +row - +cell #[code compat.json_dumps] - +cell #[code ujson.dumps] with #[code .decode('utf8')] - +cell #[code ujson.dumps] - +row +cell #[code compat.path2str] +cell #[code str(path)] with #[code .decode('utf8')]
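Migration note for downstream code: this patch removes spacy.compat.json_dumps along with the util.read_json, util.write_json, util.read_jsonl, util.write_jsonl and util.is_json_serializable helpers, so third-party packages that imported them need to call srsly directly. A minimal before/after sketch, assuming a hypothetical plugin that relied on the removed helpers:

    # before this patch (spaCy 2.0.x):
    #   from spacy.compat import json_dumps
    #   from spacy.util import read_json
    #   meta_str = json_dumps(meta)
    #   meta = read_json(meta_path)

    # after this patch:
    import srsly

    meta = {"lang": "en", "name": "test_model"}  # hypothetical model meta
    meta_path = "/tmp/meta.json"                 # hypothetical path
    srsly.write_json(meta_path, meta)
    assert srsly.read_json(meta_path)["lang"] == "en"
    meta_str = srsly.json_dumps(meta, indent=2)  # unicode str on Py2 and Py3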