mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉 See here: https://github.com/explosion/srsly Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place. At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel. srsly currently includes forks of the following packages: ujson, msgpack, msgpack-numpy, cloudpickle. * WIP: replace json/ujson with srsly * Replace ujson in examples Use regular json instead of srsly to make code easier to read and follow * Update requirements * Fix imports * Fix typos * Replace msgpack with srsly * Fix warning
This commit is contained in:
parent
40b57ea4ac
commit
f37863093a
|
@ -292,10 +292,9 @@ for example to show more specific error messages, you can use the `is_config()`
|
||||||
helper function.
|
helper function.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from .compat import unicode_, json_dumps, is_config
|
from .compat import unicode_, is_config
|
||||||
|
|
||||||
compatible_unicode = unicode_('hello world')
|
compatible_unicode = unicode_('hello world')
|
||||||
compatible_json = json_dumps({'key': 'value'})
|
|
||||||
if is_config(windows=True, python2=True):
|
if is_config(windows=True, python2=True):
|
||||||
print("You are using Python 2 on Windows.")
|
print("You are using Python 2 on Windows.")
|
||||||
```
|
```
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import bz2
|
import bz2
|
||||||
import regex as re
|
import regex as re
|
||||||
import ujson
|
import srsly
|
||||||
import sys
|
import sys
|
||||||
import random
|
import random
|
||||||
import datetime
|
import datetime
|
||||||
|
@ -44,7 +44,7 @@ class Reddit(object):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
comment = ujson.loads(line)
|
comment = srsly.json_loads(line)
|
||||||
if self.is_valid(comment):
|
if self.is_valid(comment):
|
||||||
text = self.strip_tags(comment["body"])
|
text = self.strip_tags(comment["body"])
|
||||||
yield {"text": text}
|
yield {"text": text}
|
||||||
|
@ -75,7 +75,7 @@ class Reddit(object):
|
||||||
def main(path):
|
def main(path):
|
||||||
reddit = Reddit(path)
|
reddit = Reddit(path)
|
||||||
for comment in reddit:
|
for comment in reddit:
|
||||||
print(ujson.dumps(comment))
|
print(srsly.json_dumps(comment))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -45,7 +45,7 @@ from __future__ import print_function, unicode_literals, division
|
||||||
from bz2 import BZ2File
|
from bz2 import BZ2File
|
||||||
import time
|
import time
|
||||||
import plac
|
import plac
|
||||||
import ujson
|
import json
|
||||||
|
|
||||||
from spacy.matcher import PhraseMatcher
|
from spacy.matcher import PhraseMatcher
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -71,7 +71,7 @@ def main(patterns_loc, text_loc, n=10000, lang="en"):
|
||||||
|
|
||||||
def read_gazetteer(tokenizer, loc, n=-1):
|
def read_gazetteer(tokenizer, loc, n=-1):
|
||||||
for i, line in enumerate(open(loc)):
|
for i, line in enumerate(open(loc)):
|
||||||
data = ujson.loads(line.strip())
|
data = json.loads(line.strip())
|
||||||
phrase = tokenizer(data["text"])
|
phrase = tokenizer(data["text"])
|
||||||
for w in phrase:
|
for w in phrase:
|
||||||
_ = tokenizer.vocab[w.text]
|
_ = tokenizer.vocab[w.text]
|
||||||
|
@ -82,7 +82,7 @@ def read_gazetteer(tokenizer, loc, n=-1):
|
||||||
def read_text(bz2_loc, n=10000):
|
def read_text(bz2_loc, n=10000):
|
||||||
with BZ2File(bz2_loc) as file_:
|
with BZ2File(bz2_loc) as file_:
|
||||||
for i, line in enumerate(file_):
|
for i, line in enumerate(file_):
|
||||||
data = ujson.loads(line)
|
data = json.loads(line)
|
||||||
yield data["body"]
|
yield data["body"]
|
||||||
if i >= n:
|
if i >= n:
|
||||||
break
|
break
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import ujson as json
|
import json
|
||||||
from keras.utils import to_categorical
|
from keras.utils import to_categorical
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
|
|
|
@ -77,7 +77,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import ujson as json\n",
|
"import json\n",
|
||||||
"from keras.utils import to_categorical\n",
|
"from keras.utils import to_categorical\n",
|
||||||
"\n",
|
"\n",
|
||||||
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
"LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}\n",
|
||||||
|
|
|
@ -6,12 +6,12 @@ blis>=0.2.2,<0.3.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cytoolz>=0.9.0,<0.10.0
|
cytoolz>=0.9.0,<0.10.0
|
||||||
plac<1.0.0,>=0.9.6
|
plac<1.0.0,>=0.9.6
|
||||||
ujson>=1.35
|
|
||||||
dill>=0.2,<0.3
|
dill>=0.2,<0.3
|
||||||
regex==2018.01.10
|
regex==2018.01.10
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
jsonschema>=2.6.0,<3.0.0
|
jsonschema>=2.6.0,<3.0.0
|
||||||
wasabi>=0.0.8,<1.1.0
|
wasabi>=0.0.8,<1.1.0
|
||||||
|
srsly>=0.0.4,<1.1.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -203,12 +203,12 @@ def setup_package():
|
||||||
"thinc==7.0.0.dev4",
|
"thinc==7.0.0.dev4",
|
||||||
"blis>=0.2.2,<0.3.0",
|
"blis>=0.2.2,<0.3.0",
|
||||||
"plac<1.0.0,>=0.9.6",
|
"plac<1.0.0,>=0.9.6",
|
||||||
"ujson>=1.35",
|
|
||||||
"regex==2018.01.10",
|
"regex==2018.01.10",
|
||||||
"dill>=0.2,<0.3",
|
"dill>=0.2,<0.3",
|
||||||
"requests>=2.13.0,<3.0.0",
|
"requests>=2.13.0,<3.0.0",
|
||||||
"jsonschema>=2.6.0,<3.0.0",
|
"jsonschema>=2.6.0,<3.0.0",
|
||||||
"wasabi>=0.0.8,<1.1.0",
|
"wasabi>=0.0.8,<1.1.0",
|
||||||
|
"srsly>=0.0.4,<1.1.0",
|
||||||
'pathlib==1.0.1; python_version < "3.4"',
|
'pathlib==1.0.1; python_version < "3.4"',
|
||||||
],
|
],
|
||||||
setup_requires=["wheel"],
|
setup_requires=["wheel"],
|
||||||
|
|
|
@ -4,9 +4,9 @@ from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ..util import write_jsonl, write_json
|
from ..compat import path2str
|
||||||
from ..compat import json_dumps, path2str
|
|
||||||
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
|
||||||
from .converters import ner_jsonl2json
|
from .converters import ner_jsonl2json
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
@ -77,9 +77,9 @@ def convert(
|
||||||
suffix = ".{}".format(file_type)
|
suffix = ".{}".format(file_type)
|
||||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
write_json(output_file, data)
|
srsly.write_json(output_file, data)
|
||||||
elif file_type == "jsonl":
|
elif file_type == "jsonl":
|
||||||
write_jsonl(output_file, data)
|
srsly.write_jsonl(output_file, data)
|
||||||
msg.good(
|
msg.good(
|
||||||
Messages.M032.format(name=path2str(output_file)),
|
Messages.M032.format(name=path2str(output_file)),
|
||||||
Messages.M033.format(n_docs=len(data)),
|
Messages.M033.format(n_docs=len(data)),
|
||||||
|
@ -87,7 +87,6 @@ def convert(
|
||||||
else:
|
else:
|
||||||
# Print to stdout
|
# Print to stdout
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
print(json_dumps(data))
|
srsly.write_json("-", data)
|
||||||
elif file_type == "jsonl":
|
elif file_type == "jsonl":
|
||||||
for line in data:
|
srsly.write_jsonl("-", data)
|
||||||
print(json_dumps(line))
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import ujson
|
import srsly
|
||||||
|
|
||||||
from ...util import get_lang_class
|
from ...util import get_lang_class
|
||||||
from .._messages import Messages
|
from .._messages import Messages
|
||||||
|
@ -11,7 +11,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
|
||||||
if lang is None:
|
if lang is None:
|
||||||
raise ValueError(Messages.M054)
|
raise ValueError(Messages.M054)
|
||||||
json_docs = []
|
json_docs = []
|
||||||
input_tuples = [ujson.loads(line) for line in input_data]
|
input_tuples = [srsly.json_loads(line) for line in input_data]
|
||||||
nlp = get_lang_class(lang)()
|
nlp = get_lang_class(lang)()
|
||||||
for i, (raw_text, ents) in enumerate(input_tuples):
|
for i, (raw_text, ents) in enumerate(input_tuples):
|
||||||
doc = nlp.make_doc(raw_text)
|
doc = nlp.make_doc(raw_text)
|
||||||
|
|
|
@ -5,10 +5,11 @@ from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
|
import srsly
|
||||||
from wasabi import Printer, MESSAGES
|
from wasabi import Printer, MESSAGES
|
||||||
|
|
||||||
from ..gold import GoldCorpus, read_json_object
|
from ..gold import GoldCorpus, read_json_object
|
||||||
from ..util import load_model, get_lang_class, read_json, read_jsonl
|
from ..util import load_model, get_lang_class
|
||||||
|
|
||||||
# from .schemas import get_schema, validate_json
|
# from .schemas import get_schema, validate_json
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
@ -320,11 +321,11 @@ def debug_data(
|
||||||
def _load_file(file_path, msg):
|
def _load_file(file_path, msg):
|
||||||
file_name = file_path.parts[-1]
|
file_name = file_path.parts[-1]
|
||||||
if file_path.suffix == ".json":
|
if file_path.suffix == ".json":
|
||||||
data = read_json(file_path)
|
data = srsly.read_json(file_path)
|
||||||
msg.good("Loaded {}".format(file_name))
|
msg.good("Loaded {}".format(file_name))
|
||||||
return data
|
return data
|
||||||
elif file_path.suffix == ".jsonl":
|
elif file_path.suffix == ".jsonl":
|
||||||
data = read_jsonl(file_path)
|
data = srsly.read_jsonl(file_path)
|
||||||
msg.good("Loaded {}".format(file_name))
|
msg.good("Loaded {}".format(file_name))
|
||||||
return data
|
return data
|
||||||
msg.fail(
|
msg.fail(
|
||||||
|
|
|
@ -5,6 +5,7 @@ import plac
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, basestring_, unicode_
|
from ..compat import path2str, basestring_, unicode_
|
||||||
|
@ -32,7 +33,7 @@ def info(model=None, markdown=False, silent=False):
|
||||||
meta_path = model_path / "meta.json"
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
msg.fail(Messages.M020, meta_path, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
meta["link"] = path2str(model_path)
|
meta["link"] = path2str(model_path)
|
||||||
meta["source"] = path2str(model_path.resolve())
|
meta["source"] = path2str(model_path.resolve())
|
||||||
|
|
|
@ -11,12 +11,13 @@ from preshed.counter import PreshCounter
|
||||||
import tarfile
|
import tarfile
|
||||||
import gzip
|
import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
|
import srsly
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings, user_warning
|
||||||
from ..util import ensure_path, get_lang_class, read_jsonl
|
from ..util import ensure_path, get_lang_class
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ftfy
|
import ftfy
|
||||||
|
@ -59,7 +60,7 @@ def init_model(
|
||||||
settings.append("-c")
|
settings.append("-c")
|
||||||
msg.warn(Messages.M063, Messages.M064)
|
msg.warn(Messages.M063, Messages.M064)
|
||||||
jsonl_loc = ensure_path(jsonl_loc)
|
jsonl_loc = ensure_path(jsonl_loc)
|
||||||
lex_attrs = read_jsonl(jsonl_loc)
|
lex_attrs = srsly.read_jsonl(jsonl_loc)
|
||||||
else:
|
else:
|
||||||
clusters_loc = ensure_path(clusters_loc)
|
clusters_loc = ensure_path(clusters_loc)
|
||||||
freqs_loc = ensure_path(freqs_loc)
|
freqs_loc = ensure_path(freqs_loc)
|
||||||
|
|
|
@ -5,9 +5,10 @@ import plac
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, get_raw_input
|
from wasabi import Printer, get_raw_input
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, json_dumps
|
from ..compat import path2str
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
@ -40,7 +41,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
||||||
|
|
||||||
meta_path = meta_path or input_path / "meta.json"
|
meta_path = meta_path or input_path / "meta.json"
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
meta = util.read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
if not create_meta: # only print if user doesn't want to overwrite
|
if not create_meta: # only print if user doesn't want to overwrite
|
||||||
msg.good(Messages.M041, meta_path)
|
msg.good(Messages.M041, meta_path)
|
||||||
else:
|
else:
|
||||||
|
@ -64,7 +65,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
||||||
)
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||||
create_file(main_path / "meta.json", json_dumps(meta))
|
create_file(main_path / "meta.json", srsly.json_dumps(meta))
|
||||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
|
|
|
@ -5,8 +5,6 @@ import plac
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
import ujson
|
|
||||||
import sys
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from thinc.v2v import Affine, Maxout
|
from thinc.v2v import Affine, Maxout
|
||||||
|
@ -14,10 +12,10 @@ from thinc.api import wrap
|
||||||
from thinc.misc import LayerNorm as LN
|
from thinc.misc import LayerNorm as LN
|
||||||
from thinc.neural.util import prefer_gpu
|
from thinc.neural.util import prefer_gpu
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID, HEAD
|
from ..attrs import ID, HEAD
|
||||||
from ..compat import json_dumps
|
|
||||||
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -72,7 +70,7 @@ def pretrain(
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
msg.good("Created output directory")
|
msg.good("Created output directory")
|
||||||
util.write_json(output_dir / "config.json", config)
|
srsly.write_json(output_dir / "config.json", config)
|
||||||
msg.good("Saved settings to config.json")
|
msg.good("Saved settings to config.json")
|
||||||
|
|
||||||
# Load texts from file or stdin
|
# Load texts from file or stdin
|
||||||
|
@ -81,12 +79,12 @@ def pretrain(
|
||||||
if not texts_loc.exists():
|
if not texts_loc.exists():
|
||||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||||
with msg.loading("Loading input texts..."):
|
with msg.loading("Loading input texts..."):
|
||||||
texts = list(util.read_jsonl(texts_loc))
|
texts = list(srsly.read_jsonl(texts_loc))
|
||||||
msg.good("Loaded input texts")
|
msg.good("Loaded input texts")
|
||||||
random.shuffle(texts)
|
random.shuffle(texts)
|
||||||
else: # reading from stdin
|
else: # reading from stdin
|
||||||
msg.text("Reading input text from stdin...")
|
msg.text("Reading input text from stdin...")
|
||||||
texts = stream_texts()
|
texts = srsly.read_jsonl("-")
|
||||||
|
|
||||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||||
nlp = util.load_model(vectors_model)
|
nlp = util.load_model(vectors_model)
|
||||||
|
@ -130,18 +128,13 @@ def pretrain(
|
||||||
"epoch": epoch,
|
"epoch": epoch,
|
||||||
}
|
}
|
||||||
with (output_dir / "log.jsonl").open("a") as file_:
|
with (output_dir / "log.jsonl").open("a") as file_:
|
||||||
file_.write(json_dumps(log) + "\n")
|
file_.write(srsly.json_dumps(log) + "\n")
|
||||||
tracker.epoch_loss = 0.0
|
tracker.epoch_loss = 0.0
|
||||||
if texts_loc != "-":
|
if texts_loc != "-":
|
||||||
# Reshuffle the texts if texts were loaded from a file
|
# Reshuffle the texts if texts were loaded from a file
|
||||||
random.shuffle(texts)
|
random.shuffle(texts)
|
||||||
|
|
||||||
|
|
||||||
def stream_texts():
|
|
||||||
for line in sys.stdin:
|
|
||||||
yield ujson.loads(line)
|
|
||||||
|
|
||||||
|
|
||||||
def make_update(model, docs, optimizer, drop=0.0):
|
def make_update(model, docs, optimizer, drop=0.0):
|
||||||
"""Perform an update over a single batch of documents.
|
"""Perform an update over a single batch of documents.
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import ujson
|
import srsly
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
import sys
|
import sys
|
||||||
|
@ -64,6 +64,6 @@ def _read_inputs(loc, msg):
|
||||||
msg.info("Using data from {}".format(input_path.parts[-1]))
|
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||||
file_ = input_path.open()
|
file_ = input_path.open()
|
||||||
for line in file_:
|
for line in file_:
|
||||||
data = ujson.loads(line)
|
data = srsly.json_loads(line)
|
||||||
text = data["text"]
|
text = data["text"]
|
||||||
yield text
|
yield text
|
||||||
|
|
|
@ -3,9 +3,9 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from jsonschema import Draft4Validator
|
from jsonschema import Draft4Validator
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
from ...util import read_json
|
|
||||||
|
|
||||||
|
|
||||||
SCHEMAS = {}
|
SCHEMAS = {}
|
||||||
|
@ -25,7 +25,7 @@ def get_schema(name):
|
||||||
schema_path = Path(__file__).parent / "{}.json".format(name)
|
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||||
if not schema_path.exists():
|
if not schema_path.exists():
|
||||||
raise ValueError(Errors.E104.format(name=name))
|
raise ValueError(Errors.E104.format(name=name))
|
||||||
schema = read_json(schema_path)
|
schema = srsly.read_json(schema_path)
|
||||||
# TODO: replace with (stable) Draft6Validator, if available
|
# TODO: replace with (stable) Draft6Validator, if available
|
||||||
validator = Draft4Validator(schema)
|
validator = Draft4Validator(schema)
|
||||||
validator.check_schema(schema)
|
validator.check_schema(schema)
|
||||||
|
|
|
@ -7,6 +7,7 @@ import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import shutil
|
import shutil
|
||||||
|
import srsly
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
@ -111,7 +112,7 @@ def train(
|
||||||
msg.fail(Messages.M051, dev_path, exits=1)
|
msg.fail(Messages.M051, dev_path, exits=1)
|
||||||
if meta_path is not None and not meta_path.exists():
|
if meta_path is not None and not meta_path.exists():
|
||||||
msg.fail(Messages.M020, meta_path, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path) if meta_path else {}
|
meta = srsly.read_json(meta_path) if meta_path else {}
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||||
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||||
|
@ -226,7 +227,7 @@ def train(
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords / (end_time - start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||||
util.write_json(acc_loc, scorer.scores)
|
srsly.write_json(acc_loc, scorer.scores)
|
||||||
|
|
||||||
# Update model meta.json
|
# Update model meta.json
|
||||||
meta["lang"] = nlp.lang
|
meta["lang"] = nlp.lang
|
||||||
|
@ -242,7 +243,7 @@ def train(
|
||||||
meta.setdefault("name", "model%d" % i)
|
meta.setdefault("name", "model%d" % i)
|
||||||
meta.setdefault("version", version)
|
meta.setdefault("version", version)
|
||||||
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||||
util.write_json(meta_loc, meta)
|
srsly.write_json(meta_loc, meta)
|
||||||
|
|
||||||
util.set_env_log(verbose)
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
@ -293,17 +294,17 @@ def _collate_best_model(meta, output_path, components):
|
||||||
for component, best_component_src in bests.items():
|
for component, best_component_src in bests.items():
|
||||||
shutil.rmtree(best_dest / component)
|
shutil.rmtree(best_dest / component)
|
||||||
shutil.copytree(best_component_src / component, best_dest / component)
|
shutil.copytree(best_component_src / component, best_dest / component)
|
||||||
accs = util.read_json(best_component_src / "accuracy.json")
|
accs = srsly.read_json(best_component_src / "accuracy.json")
|
||||||
for metric in _get_metrics(component):
|
for metric in _get_metrics(component):
|
||||||
meta["accuracy"][metric] = accs[metric]
|
meta["accuracy"][metric] = accs[metric]
|
||||||
util.write_json(best_dest / "meta.json", meta)
|
srsly.write_json(best_dest / "meta.json", meta)
|
||||||
|
|
||||||
|
|
||||||
def _find_best(experiment_dir, component):
|
def _find_best(experiment_dir, component):
|
||||||
accuracies = []
|
accuracies = []
|
||||||
for epoch_model in experiment_dir.iterdir():
|
for epoch_model in experiment_dir.iterdir():
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = util.read_json(epoch_model / "accuracy.json")
|
accs = srsly.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
|
|
|
@ -9,7 +9,7 @@ import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import json
|
import srsly
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
|
@ -44,7 +44,7 @@ from ...lang import ru
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile("\s+")
|
space_re = re.compile(r"\s+")
|
||||||
|
|
||||||
|
|
||||||
def split_text(text):
|
def split_text(text):
|
||||||
|
@ -332,8 +332,7 @@ def main(test_data_dir, experiment_dir, corpus):
|
||||||
/ corpus
|
/ corpus
|
||||||
/ "{section}-accuracy.json".format(section=section)
|
/ "{section}-accuracy.json".format(section=section)
|
||||||
)
|
)
|
||||||
with open(acc_path, "w") as file_:
|
srsly.write_json(acc_path, accuracy)
|
||||||
file_.write(json.dumps(accuracy, indent=2))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -5,11 +5,12 @@ import pkg_resources
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import requests
|
import requests
|
||||||
|
import srsly
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str
|
from ..compat import path2str
|
||||||
from ..util import get_data_path, read_json
|
from ..util import get_data_path
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,7 +85,7 @@ def get_model_links(compat):
|
||||||
meta_path = Path(model) / "meta.json"
|
meta_path = Path(model) / "meta.json"
|
||||||
if not meta_path.exists():
|
if not meta_path.exists():
|
||||||
continue
|
continue
|
||||||
meta = read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
link = model.parts[-1]
|
link = model.parts[-1]
|
||||||
name = meta["lang"] + "_" + meta["name"]
|
name = meta["lang"] + "_" + meta["name"]
|
||||||
links[link] = {
|
links[link] = {
|
||||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
from thinc.neural.util import copy_array
|
from thinc.neural.util import copy_array
|
||||||
|
@ -54,9 +53,6 @@ if is_python2:
|
||||||
unicode_ = unicode # noqa: F821
|
unicode_ = unicode # noqa: F821
|
||||||
basestring_ = basestring # noqa: F821
|
basestring_ = basestring # noqa: F821
|
||||||
input_ = raw_input # noqa: F821
|
input_ = raw_input # noqa: F821
|
||||||
json_dumps = lambda data, indent=2: ujson.dumps(
|
|
||||||
data, indent=indent, escape_forward_slashes=False
|
|
||||||
).decode("utf8")
|
|
||||||
path2str = lambda path: str(path).decode("utf8")
|
path2str = lambda path: str(path).decode("utf8")
|
||||||
|
|
||||||
elif is_python3:
|
elif is_python3:
|
||||||
|
@ -64,9 +60,6 @@ elif is_python3:
|
||||||
unicode_ = str
|
unicode_ = str
|
||||||
basestring_ = str
|
basestring_ = str
|
||||||
input_ = input
|
input_ = input
|
||||||
json_dumps = lambda data, indent=2: ujson.dumps(
|
|
||||||
data, indent=indent, escape_forward_slashes=False
|
|
||||||
)
|
|
||||||
path2str = lambda path: str(path)
|
path2str = lambda path: str(path)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -10,10 +10,7 @@ import numpy
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import msgpack
|
import srsly
|
||||||
import json
|
|
||||||
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
from . import _align
|
from . import _align
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
|
@ -21,7 +18,6 @@ from .tokens import Doc
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch, itershuffle
|
from .util import minibatch, itershuffle
|
||||||
from .compat import json_dumps
|
|
||||||
|
|
||||||
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
|
||||||
|
|
||||||
|
@ -123,12 +119,11 @@ class GoldCorpus(object):
|
||||||
directory.mkdir()
|
directory.mkdir()
|
||||||
n = 0
|
n = 0
|
||||||
for i, doc_tuple in enumerate(doc_tuples):
|
for i, doc_tuple in enumerate(doc_tuples):
|
||||||
with open(directory / '{}.msg'.format(i), 'wb') as file_:
|
srsly.write_msgpack(directory / '{}.msg'.format(i), [doc_tuple])
|
||||||
msgpack.dump([doc_tuple], file_, use_bin_type=True)
|
|
||||||
n += len(doc_tuple[1])
|
n += len(doc_tuple[1])
|
||||||
if limit and n >= limit:
|
if limit and n >= limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def walk_corpus(path):
|
def walk_corpus(path):
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
|
@ -157,8 +152,7 @@ class GoldCorpus(object):
|
||||||
if loc.parts[-1].endswith('json'):
|
if loc.parts[-1].endswith('json'):
|
||||||
gold_tuples = read_json_file(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
elif loc.parts[-1].endswith('msg'):
|
elif loc.parts[-1].endswith('msg'):
|
||||||
with loc.open('rb') as file_:
|
gold_tuples = srsly.read_msgpack(loc)
|
||||||
gold_tuples = msgpack.load(file_, raw=False)
|
|
||||||
else:
|
else:
|
||||||
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
|
msg = "Cannot read from file: %s. Supported formats: .json, .msg"
|
||||||
raise ValueError(msg % loc)
|
raise ValueError(msg % loc)
|
||||||
|
@ -378,7 +372,7 @@ def _json_iterate(loc):
|
||||||
if square_depth == 1 and curly_depth == 0:
|
if square_depth == 1 and curly_depth == 0:
|
||||||
py_str = py_raw[start : i+1].decode('utf8')
|
py_str = py_raw[start : i+1].decode('utf8')
|
||||||
try:
|
try:
|
||||||
yield json.loads(py_str)
|
yield srsly.json_loads(py_str)
|
||||||
except Exception:
|
except Exception:
|
||||||
print(py_str)
|
print(py_str)
|
||||||
raise
|
raise
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
import random
|
import random
|
||||||
import ujson
|
|
||||||
import itertools
|
import itertools
|
||||||
import weakref
|
import weakref
|
||||||
import functools
|
import functools
|
||||||
|
@ -10,6 +9,7 @@ from collections import OrderedDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from copy import copy
|
from copy import copy
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
|
import srsly
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
|
@ -18,7 +18,7 @@ from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
||||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||||
from .pipeline import EntityRuler
|
from .pipeline import EntityRuler
|
||||||
from .compat import json_dumps, izip, basestring_
|
from .compat import izip, basestring_
|
||||||
from .gold import GoldParse
|
from .gold import GoldParse
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
from ._ml import link_vectors_to_models, create_default_optimizer
|
from ._ml import link_vectors_to_models, create_default_optimizer
|
||||||
|
@ -640,7 +640,7 @@ class Language(object):
|
||||||
serializers = OrderedDict(
|
serializers = OrderedDict(
|
||||||
(
|
(
|
||||||
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
("tokenizer", lambda p: self.tokenizer.to_disk(p, vocab=False)),
|
||||||
("meta.json", lambda p: p.open("w").write(json_dumps(self.meta))),
|
("meta.json", lambda p: p.open("w").write(srsly.json_dumps(self.meta))),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
for name, proc in self.pipeline:
|
for name, proc in self.pipeline:
|
||||||
|
@ -671,7 +671,7 @@ class Language(object):
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
deserializers = OrderedDict(
|
deserializers = OrderedDict(
|
||||||
(
|
(
|
||||||
("meta.json", lambda p: self.meta.update(util.read_json(p))),
|
("meta.json", lambda p: self.meta.update(srsly.read_json(p))),
|
||||||
(
|
(
|
||||||
"vocab",
|
"vocab",
|
||||||
lambda p: (
|
lambda p: (
|
||||||
|
@ -705,7 +705,7 @@ class Language(object):
|
||||||
(
|
(
|
||||||
("vocab", lambda: self.vocab.to_bytes()),
|
("vocab", lambda: self.vocab.to_bytes()),
|
||||||
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
|
("tokenizer", lambda: self.tokenizer.to_bytes(vocab=False)),
|
||||||
("meta", lambda: json_dumps(self.meta)),
|
("meta", lambda: srsly.json_dumps(self.meta)),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
for i, (name, proc) in enumerate(self.pipeline):
|
for i, (name, proc) in enumerate(self.pipeline):
|
||||||
|
@ -725,7 +725,7 @@ class Language(object):
|
||||||
"""
|
"""
|
||||||
deserializers = OrderedDict(
|
deserializers = OrderedDict(
|
||||||
(
|
(
|
||||||
("meta", lambda b: self.meta.update(ujson.loads(b))),
|
("meta", lambda b: self.meta.update(srsly.json_loads(b))),
|
||||||
(
|
(
|
||||||
"vocab",
|
"vocab",
|
||||||
lambda b: (
|
lambda b: (
|
||||||
|
|
|
@ -7,10 +7,7 @@ import numpy
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import cytoolz
|
import cytoolz
|
||||||
from collections import OrderedDict, defaultdict
|
from collections import OrderedDict, defaultdict
|
||||||
import ujson
|
import srsly
|
||||||
|
|
||||||
from .util import msgpack
|
|
||||||
from .util import msgpack_numpy
|
|
||||||
|
|
||||||
from thinc.api import chain
|
from thinc.api import chain
|
||||||
from thinc.v2v import Affine, Maxout, Softmax
|
from thinc.v2v import Affine, Maxout, Softmax
|
||||||
|
@ -27,7 +24,6 @@ from .syntax.arc_eager cimport ArcEager
|
||||||
from .morphology cimport Morphology
|
from .morphology cimport Morphology
|
||||||
from .vocab cimport Vocab
|
from .vocab cimport Vocab
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .compat import json_dumps
|
|
||||||
from .matcher import Matcher
|
from .matcher import Matcher
|
||||||
|
|
||||||
from .matcher import Matcher, PhraseMatcher
|
from .matcher import Matcher, PhraseMatcher
|
||||||
|
@ -38,7 +34,7 @@ from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
|
||||||
from ._ml import link_vectors_to_models, zero_init, flatten
|
from ._ml import link_vectors_to_models, zero_init, flatten
|
||||||
from ._ml import create_default_optimizer
|
from ._ml import create_default_optimizer
|
||||||
from .errors import Errors, TempErrors
|
from .errors import Errors, TempErrors
|
||||||
from .compat import json_dumps, basestring_
|
from .compat import basestring_
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -235,7 +231,7 @@ class EntityRuler(object):
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
"""
|
"""
|
||||||
patterns = msgpack.loads(patterns_bytes, raw=False)
|
patterns = srsly.msgpack_loads(patterns_bytes)
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -244,7 +240,7 @@ class EntityRuler(object):
|
||||||
|
|
||||||
RETURNS (bytes): The serialized patterns.
|
RETURNS (bytes): The serialized patterns.
|
||||||
"""
|
"""
|
||||||
return msgpack.dumps(self.patterns, use_bin_type=True)
|
return srsly.msgpack_dumps(self.patterns)
|
||||||
|
|
||||||
def from_disk(self, path, **kwargs):
|
def from_disk(self, path, **kwargs):
|
||||||
"""Load the entity ruler from a file. Expects a file containing
|
"""Load the entity ruler from a file. Expects a file containing
|
||||||
|
@ -256,7 +252,7 @@ class EntityRuler(object):
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
path = path.with_suffix('.jsonl')
|
path = path.with_suffix('.jsonl')
|
||||||
patterns = util.read_jsonl(path)
|
patterns = srsly.read_jsonl(path)
|
||||||
self.add_patterns(patterns)
|
self.add_patterns(patterns)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
@ -270,8 +266,7 @@ class EntityRuler(object):
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
path = path.with_suffix('.jsonl')
|
path = path.with_suffix('.jsonl')
|
||||||
data = [json_dumps(line, indent=0) for line in self.patterns]
|
srsly.write_jsonl(path, self.patterns)
|
||||||
path.open('w').write('\n'.join(data))
|
|
||||||
|
|
||||||
|
|
||||||
class Pipe(object):
|
class Pipe(object):
|
||||||
|
@ -368,7 +363,7 @@ class Pipe(object):
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
"""Serialize the pipe to a bytestring."""
|
"""Serialize the pipe to a bytestring."""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize['cfg'] = lambda: json_dumps(self.cfg)
|
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
|
||||||
if self.model in (True, False, None):
|
if self.model in (True, False, None):
|
||||||
serialize['model'] = lambda: self.model
|
serialize['model'] = lambda: self.model
|
||||||
else:
|
else:
|
||||||
|
@ -387,7 +382,7 @@ class Pipe(object):
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('model', load_model),
|
('model', load_model),
|
||||||
))
|
))
|
||||||
|
@ -397,7 +392,7 @@ class Pipe(object):
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
"""Serialize the pipe to disk."""
|
"""Serialize the pipe to disk."""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize['cfg'] = lambda p: p.open('w').write(json_dumps(self.cfg))
|
serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
|
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
|
||||||
if self.model not in (None, True, False):
|
if self.model not in (None, True, False):
|
||||||
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
|
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
|
||||||
|
@ -424,8 +419,7 @@ class Pipe(object):
|
||||||
|
|
||||||
def _load_cfg(path):
|
def _load_cfg(path):
|
||||||
if path.exists():
|
if path.exists():
|
||||||
with path.open() as file_:
|
return srsly.read_json(path)
|
||||||
return ujson.load(file_)
|
|
||||||
else:
|
else:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -745,10 +739,9 @@ class Tagger(Pipe):
|
||||||
else:
|
else:
|
||||||
serialize['model'] = self.model.to_bytes
|
serialize['model'] = self.model.to_bytes
|
||||||
serialize['vocab'] = self.vocab.to_bytes
|
serialize['vocab'] = self.vocab.to_bytes
|
||||||
serialize['cfg'] = lambda: ujson.dumps(self.cfg)
|
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
|
||||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize['tag_map'] = lambda: msgpack.dumps(
|
serialize['tag_map'] = lambda: srsly.msgpack_dumps(tag_map)
|
||||||
tag_map, use_bin_type=True)
|
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
@ -766,7 +759,7 @@ class Tagger(Pipe):
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
def load_tag_map(b):
|
def load_tag_map(b):
|
||||||
tag_map = msgpack.loads(b, raw=False)
|
tag_map = srsly.msgpack_loads(b)
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
|
@ -775,7 +768,7 @@ class Tagger(Pipe):
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('tag_map', load_tag_map),
|
('tag_map', load_tag_map),
|
||||||
('cfg', lambda b: self.cfg.update(ujson.loads(b))),
|
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
('model', lambda b: load_model(b)),
|
('model', lambda b: load_model(b)),
|
||||||
))
|
))
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
|
@ -785,10 +778,9 @@ class Tagger(Pipe):
|
||||||
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
|
||||||
serialize = OrderedDict((
|
serialize = OrderedDict((
|
||||||
('vocab', lambda p: self.vocab.to_disk(p)),
|
('vocab', lambda p: self.vocab.to_disk(p)),
|
||||||
('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
|
('tag_map', lambda p: srsly.write_msgpack(p, tag_map)),
|
||||||
tag_map, use_bin_type=True))),
|
|
||||||
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
|
||||||
('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
|
('cfg', lambda p: srsly.write_json(p, self.cfg))
|
||||||
))
|
))
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
|
@ -803,8 +795,7 @@ class Tagger(Pipe):
|
||||||
self.model.from_bytes(file_.read())
|
self.model.from_bytes(file_.read())
|
||||||
|
|
||||||
def load_tag_map(p):
|
def load_tag_map(p):
|
||||||
with p.open('rb') as file_:
|
tag_map = srsly.read_msgpack(p)
|
||||||
tag_map = msgpack.loads(file_.read(), raw=False)
|
|
||||||
self.vocab.morphology = Morphology(
|
self.vocab.morphology = Morphology(
|
||||||
self.vocab.strings, tag_map=tag_map,
|
self.vocab.strings, tag_map=tag_map,
|
||||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
|
|
|
@ -7,12 +7,11 @@ from libc.string cimport memcpy
|
||||||
from libcpp.set cimport set
|
from libcpp.set cimport set
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from murmurhash.mrmr cimport hash64, hash32
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
import ujson
|
import srsly
|
||||||
|
|
||||||
from .symbols import IDS as SYMBOLS_BY_STR
|
from .symbols import IDS as SYMBOLS_BY_STR
|
||||||
from .symbols import NAMES as SYMBOLS_BY_INT
|
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from .compat import json_dumps
|
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
@ -197,8 +196,7 @@ cdef class StringStore:
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
strings = list(self)
|
strings = list(self)
|
||||||
with path.open('w') as file_:
|
srsly.write_json(path, strings)
|
||||||
file_.write(json_dumps(strings))
|
|
||||||
|
|
||||||
def from_disk(self, path):
|
def from_disk(self, path):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
@ -209,8 +207,7 @@ cdef class StringStore:
|
||||||
RETURNS (StringStore): The modified `StringStore` object.
|
RETURNS (StringStore): The modified `StringStore` object.
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
with path.open('r') as file_:
|
strings = srsly.read_json(path)
|
||||||
strings = ujson.load(file_)
|
|
||||||
prev = list(self)
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
for word in prev:
|
for word in prev:
|
||||||
|
@ -223,7 +220,7 @@ cdef class StringStore:
|
||||||
**exclude: Named attributes to prevent from being serialized.
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||||
"""
|
"""
|
||||||
return json_dumps(list(self))
|
return srsly.json_dumps(list(self))
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
@ -232,7 +229,7 @@ cdef class StringStore:
|
||||||
**exclude: Named attributes to prevent from being loaded.
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
RETURNS (StringStore): The `StringStore` object.
|
RETURNS (StringStore): The `StringStore` object.
|
||||||
"""
|
"""
|
||||||
strings = ujson.loads(bytes_data)
|
strings = srsly.json_loads(bytes_data)
|
||||||
prev = list(self)
|
prev = list(self)
|
||||||
self._reset_and_load(strings)
|
self._reset_and_load(strings)
|
||||||
for word in prev:
|
for word in prev:
|
||||||
|
|
|
@ -5,8 +5,6 @@
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
|
||||||
import json
|
|
||||||
import numpy
|
import numpy
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
@ -29,7 +27,7 @@ cimport blis.cy
|
||||||
|
|
||||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||||
from .._ml import link_vectors_to_models, create_default_optimizer
|
from .._ml import link_vectors_to_models, create_default_optimizer
|
||||||
from ..compat import json_dumps, copy_array
|
from ..compat import copy_array
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..errors import Errors, TempErrors
|
from ..errors import Errors, TempErrors
|
||||||
|
@ -119,7 +117,7 @@ cdef void predict_states(ActivationsC* A, StateC** states,
|
||||||
VecVec.add_i(&A.scores[i*n.classes],
|
VecVec.add_i(&A.scores[i*n.classes],
|
||||||
W.hidden_bias, 1., n.classes)
|
W.hidden_bias, 1., n.classes)
|
||||||
|
|
||||||
|
|
||||||
cdef void sum_state_features(float* output,
|
cdef void sum_state_features(float* output,
|
||||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||||
cdef int idx, b, f, i
|
cdef int idx, b, f, i
|
||||||
|
@ -165,7 +163,7 @@ cdef void cpu_log_loss(float* d_scores,
|
||||||
else:
|
else:
|
||||||
d_scores[i] = exp(scores[i]-max_) / Z
|
d_scores[i] = exp(scores[i]-max_) / Z
|
||||||
|
|
||||||
|
|
||||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||||
const int* is_valid, int n) nogil:
|
const int* is_valid, int n) nogil:
|
||||||
# Find minimum cost
|
# Find minimum cost
|
||||||
|
@ -218,15 +216,15 @@ class ParserModel(Model):
|
||||||
|
|
||||||
def begin_training(self, X, y=None):
|
def begin_training(self, X, y=None):
|
||||||
self.lower.begin_training(X, y=y)
|
self.lower.begin_training(X, y=y)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tok2vec(self):
|
def tok2vec(self):
|
||||||
return self._layers[0]
|
return self._layers[0]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lower(self):
|
def lower(self):
|
||||||
return self._layers[1]
|
return self._layers[1]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def upper(self):
|
def upper(self):
|
||||||
return self._layers[2]
|
return self._layers[2]
|
||||||
|
@ -405,4 +403,3 @@ cdef class precompute_hiddens:
|
||||||
else:
|
else:
|
||||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||||
return state_vector, backprop_nonlinearity
|
return state_vector, backprop_nonlinearity
|
||||||
|
|
||||||
|
|
|
@ -5,8 +5,6 @@
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
|
||||||
import json
|
|
||||||
import numpy
|
import numpy
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
@ -27,6 +25,7 @@ from thinc.misc import LayerNorm
|
||||||
from thinc.neural.ops import CupyOps
|
from thinc.neural.ops import CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
from thinc.linalg cimport Vec, VecVec
|
from thinc.linalg cimport Vec, VecVec
|
||||||
|
import srsly
|
||||||
|
|
||||||
from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
|
from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
|
||||||
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||||
|
@ -34,7 +33,7 @@ from ._parser_model cimport get_c_weights, get_c_sizes
|
||||||
from ._parser_model import ParserModel
|
from ._parser_model import ParserModel
|
||||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||||
from .._ml import link_vectors_to_models, create_default_optimizer
|
from .._ml import link_vectors_to_models, create_default_optimizer
|
||||||
from ..compat import json_dumps, copy_array
|
from ..compat import copy_array
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..errors import Errors, TempErrors
|
from ..errors import Errors, TempErrors
|
||||||
|
@ -539,7 +538,7 @@ cdef class Parser:
|
||||||
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
'model': lambda p: (self.model.to_disk(p) if self.model is not True else True),
|
||||||
'vocab': lambda p: self.vocab.to_disk(p),
|
'vocab': lambda p: self.vocab.to_disk(p),
|
||||||
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
'moves': lambda p: self.moves.to_disk(p, strings=False),
|
||||||
'cfg': lambda p: p.open('w').write(json_dumps(self.cfg))
|
'cfg': lambda p: srsly.write_json(p, self.cfg)
|
||||||
}
|
}
|
||||||
util.to_disk(path, serializers, exclude)
|
util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
|
@ -547,7 +546,7 @@ cdef class Parser:
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'vocab': lambda p: self.vocab.from_disk(p),
|
'vocab': lambda p: self.vocab.from_disk(p),
|
||||||
'moves': lambda p: self.moves.from_disk(p, strings=False),
|
'moves': lambda p: self.moves.from_disk(p, strings=False),
|
||||||
'cfg': lambda p: self.cfg.update(util.read_json(p)),
|
'cfg': lambda p: self.cfg.update(srsly.read_json(p)),
|
||||||
'model': lambda p: None
|
'model': lambda p: None
|
||||||
}
|
}
|
||||||
util.from_disk(path, deserializers, exclude)
|
util.from_disk(path, deserializers, exclude)
|
||||||
|
@ -568,7 +567,7 @@ cdef class Parser:
|
||||||
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
|
('model', lambda: (self.model.to_bytes() if self.model is not True else True)),
|
||||||
('vocab', lambda: self.vocab.to_bytes()),
|
('vocab', lambda: self.vocab.to_bytes()),
|
||||||
('moves', lambda: self.moves.to_bytes(strings=False)),
|
('moves', lambda: self.moves.to_bytes(strings=False)),
|
||||||
('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
|
('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True))
|
||||||
))
|
))
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
|
@ -576,7 +575,7 @@ cdef class Parser:
|
||||||
deserializers = OrderedDict((
|
deserializers = OrderedDict((
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
('vocab', lambda b: self.vocab.from_bytes(b)),
|
||||||
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
('moves', lambda b: self.moves.from_bytes(b, strings=False)),
|
||||||
('cfg', lambda b: self.cfg.update(json.loads(b))),
|
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
('model', lambda b: None)
|
('model', lambda b: None)
|
||||||
))
|
))
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
|
|
|
@ -7,14 +7,13 @@ from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict, Counter
|
from collections import OrderedDict, Counter
|
||||||
import ujson
|
import srsly
|
||||||
|
|
||||||
from . cimport _beam_utils
|
from . cimport _beam_utils
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
from ..compat import json_dumps
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
@ -153,13 +152,13 @@ cdef class TransitionSystem:
|
||||||
# Make sure we take a copy here, and that we get a Counter
|
# Make sure we take a copy here, and that we get a Counter
|
||||||
self.labels[action] = Counter()
|
self.labels[action] = Counter()
|
||||||
# Have to be careful here: Sorting must be stable, or our model
|
# Have to be careful here: Sorting must be stable, or our model
|
||||||
# won't be read back in correctly.
|
# won't be read back in correctly.
|
||||||
sorted_labels = [(f, L) for L, f in label_freqs.items()]
|
sorted_labels = [(f, L) for L, f in label_freqs.items()]
|
||||||
sorted_labels.sort()
|
sorted_labels.sort()
|
||||||
sorted_labels.reverse()
|
sorted_labels.reverse()
|
||||||
for freq, label_str in sorted_labels:
|
for freq, label_str in sorted_labels:
|
||||||
self.add_action(int(action), label_str)
|
self.add_action(int(action), label_str)
|
||||||
self.labels[action][label_str] = freq
|
self.labels[action][label_str] = freq
|
||||||
|
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
|
@ -204,7 +203,7 @@ cdef class TransitionSystem:
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
transitions = []
|
transitions = []
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: json_dumps(self.labels),
|
'moves': lambda: srsly.json_dumps(self.labels),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
@ -212,7 +211,7 @@ cdef class TransitionSystem:
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
labels = {}
|
labels = {}
|
||||||
deserializers = {
|
deserializers = {
|
||||||
'moves': lambda b: labels.update(ujson.loads(b)),
|
'moves': lambda b: labels.update(srsly.json_loads(b)),
|
||||||
'strings': lambda b: self.strings.from_bytes(b)
|
'strings': lambda b: self.strings.from_bytes(b)
|
||||||
}
|
}
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import numpy
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import contextlib
|
import contextlib
|
||||||
import msgpack
|
import srsly
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from spacy.tokens import Doc, Span
|
from spacy.tokens import Doc, Span
|
||||||
from spacy.attrs import POS, HEAD, DEP
|
from spacy.attrs import POS, HEAD, DEP
|
||||||
|
@ -100,8 +100,8 @@ def assert_docs_equal(doc1, doc2):
|
||||||
|
|
||||||
def assert_packed_msg_equal(b1, b2):
|
def assert_packed_msg_equal(b1, b2):
|
||||||
"""Assert that two packed msgpack messages are equal."""
|
"""Assert that two packed msgpack messages are equal."""
|
||||||
msg1 = msgpack.loads(b1, encoding="utf8")
|
msg1 = srsly.msgpack_loads(b1)
|
||||||
msg2 = msgpack.loads(b2, encoding="utf8")
|
msg2 = srsly.msgpack_loads(b2)
|
||||||
assert sorted(msg1.keys()) == sorted(msg2.keys())
|
assert sorted(msg1.keys()) == sorted(msg2.keys())
|
||||||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||||
assert k1 == k2
|
assert k1 == k2
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import msgpack
|
|
||||||
import gzip
|
import gzip
|
||||||
|
import srsly
|
||||||
from thinc.neural.ops import NumpyOps
|
from thinc.neural.ops import NumpyOps
|
||||||
|
|
||||||
from ..compat import copy_reg
|
from ..compat import copy_reg
|
||||||
|
@ -74,11 +74,11 @@ class Binder(object):
|
||||||
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
|
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
|
||||||
"strings": list(self.strings),
|
"strings": list(self.strings),
|
||||||
}
|
}
|
||||||
return gzip.compress(msgpack.dumps(msg))
|
return gzip.compress(srsly.msgpack_dumps(msg))
|
||||||
|
|
||||||
def from_bytes(self, string):
|
def from_bytes(self, string):
|
||||||
"""Deserialize the binder's annotations from a byte string."""
|
"""Deserialize the binder's annotations from a byte string."""
|
||||||
msg = msgpack.loads(gzip.decompress(string))
|
msg = srsly.msgpack_loads(gzip.decompress(string))
|
||||||
self.attrs = msg["attrs"]
|
self.attrs = msg["attrs"]
|
||||||
self.strings = set(msg["strings"])
|
self.strings = set(msg["strings"])
|
||||||
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
|
lengths = numpy.fromstring(msg["lengths"], dtype="int32")
|
||||||
|
|
|
@ -10,8 +10,8 @@ import numpy
|
||||||
import numpy.linalg
|
import numpy.linalg
|
||||||
import struct
|
import struct
|
||||||
import dill
|
import dill
|
||||||
import msgpack
|
|
||||||
from thinc.neural.util import get_array_module, copy_array
|
from thinc.neural.util import get_array_module, copy_array
|
||||||
|
import srsly
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memcpy, memset
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
@ -28,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
|
||||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
|
||||||
from ..attrs cimport ENT_TYPE, SENT_START
|
from ..attrs cimport ENT_TYPE, SENT_START
|
||||||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||||
from ..util import normalize_slice, is_json_serializable
|
from ..util import normalize_slice
|
||||||
from ..compat import is_config, copy_reg, pickle, basestring_
|
from ..compat import is_config, copy_reg, pickle, basestring_
|
||||||
from ..errors import deprecation_warning, models_warning, user_warning
|
from ..errors import deprecation_warning, models_warning, user_warning
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
|
@ -807,8 +807,8 @@ cdef class Doc:
|
||||||
}
|
}
|
||||||
if 'user_data' not in exclude and self.user_data:
|
if 'user_data' not in exclude and self.user_data:
|
||||||
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
user_data_keys, user_data_values = list(zip(*self.user_data.items()))
|
||||||
serializers['user_data_keys'] = lambda: msgpack.dumps(user_data_keys)
|
serializers['user_data_keys'] = lambda: srsly.msgpack_dumps(user_data_keys)
|
||||||
serializers['user_data_values'] = lambda: msgpack.dumps(user_data_values)
|
serializers['user_data_values'] = lambda: srsly.msgpack_dumps(user_data_values)
|
||||||
|
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
||||||
|
@ -836,9 +836,8 @@ cdef class Doc:
|
||||||
# keys, we must have tuples. In values we just have to hope
|
# keys, we must have tuples. In values we just have to hope
|
||||||
# users don't mind getting a list instead of a tuple.
|
# users don't mind getting a list instead of a tuple.
|
||||||
if 'user_data' not in exclude and 'user_data_keys' in msg:
|
if 'user_data' not in exclude and 'user_data_keys' in msg:
|
||||||
user_data_keys = msgpack.loads(msg['user_data_keys'],
|
user_data_keys = srsly.msgpack_loads(msg['user_data_keys'], use_list=False)
|
||||||
use_list=False, raw=False)
|
user_data_values = srsly.msgpack_loads(msg['user_data_values'])
|
||||||
user_data_values = msgpack.loads(msg['user_data_values'], raw=False)
|
|
||||||
for key, value in zip(user_data_keys, user_data_values):
|
for key, value in zip(user_data_keys, user_data_values):
|
||||||
self.user_data[key] = value
|
self.user_data[key] = value
|
||||||
|
|
||||||
|
@ -996,7 +995,7 @@ cdef class Doc:
|
||||||
if not self.has_extension(attr):
|
if not self.has_extension(attr):
|
||||||
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
|
||||||
value = self._.get(attr)
|
value = self._.get(attr)
|
||||||
if not is_json_serializable(value):
|
if not srsly.is_json_serializable(value):
|
||||||
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
|
||||||
data['_'][attr] = value
|
data['_'][attr] = value
|
||||||
return data
|
return data
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import ujson
|
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
import importlib
|
import importlib
|
||||||
import regex as re
|
import regex as re
|
||||||
|
@ -15,18 +14,13 @@ import functools
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import itertools
|
import itertools
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
import srsly
|
||||||
|
|
||||||
from .symbols import ORTH
|
from .symbols import ORTH
|
||||||
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
from .compat import cupy, CudaStream, path2str, basestring_, unicode_
|
||||||
from .compat import import_file, json_dumps
|
from .compat import import_file
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
|
|
||||||
# Import these directly from Thinc, so that we're sure we always have the
|
|
||||||
# same version.
|
|
||||||
from thinc.neural._classes.model import msgpack # noqa: F401
|
|
||||||
from thinc.neural._classes.model import msgpack_numpy # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
_data_path = Path(__file__).parent / "data"
|
_data_path = Path(__file__).parent / "data"
|
||||||
|
@ -185,7 +179,7 @@ def get_model_meta(path):
|
||||||
meta_path = model_path / "meta.json"
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
raise IOError(Errors.E053.format(path=meta_path))
|
raise IOError(Errors.E053.format(path=meta_path))
|
||||||
meta = read_json(meta_path)
|
meta = srsly.read_json(meta_path)
|
||||||
for setting in ["lang", "name", "version"]:
|
for setting in ["lang", "name", "version"]:
|
||||||
if setting not in meta or not meta[setting]:
|
if setting not in meta or not meta[setting]:
|
||||||
raise ValueError(Errors.E054.format(setting=setting))
|
raise ValueError(Errors.E054.format(setting=setting))
|
||||||
|
@ -529,74 +523,16 @@ def itershuffle(iterable, bufsize=1000):
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
|
||||||
|
|
||||||
def read_json(location):
|
|
||||||
"""Open and load JSON from file.
|
|
||||||
|
|
||||||
location (Path): Path to JSON file.
|
|
||||||
RETURNS (dict): Loaded JSON content.
|
|
||||||
"""
|
|
||||||
location = ensure_path(location)
|
|
||||||
with location.open("r", encoding="utf8") as f:
|
|
||||||
return ujson.load(f)
|
|
||||||
|
|
||||||
|
|
||||||
def write_json(file_path, contents):
|
|
||||||
"""Create a .json file and dump contents.
|
|
||||||
|
|
||||||
file_path (unicode / Path): The path to the output file.
|
|
||||||
contents: The JSON-serializable contents to output.
|
|
||||||
"""
|
|
||||||
with Path(file_path).open("w", encoding="utf8") as f:
|
|
||||||
f.write(json_dumps(contents))
|
|
||||||
|
|
||||||
|
|
||||||
def read_jsonl(file_path):
|
|
||||||
"""Read a .jsonl file and yield its contents line by line.
|
|
||||||
|
|
||||||
file_path (unicode / Path): The file path.
|
|
||||||
YIELDS: The loaded JSON contents of each line.
|
|
||||||
"""
|
|
||||||
with Path(file_path).open("r", encoding="utf8") as f:
|
|
||||||
for line in f:
|
|
||||||
try: # hack to handle broken jsonl
|
|
||||||
yield ujson.loads(line.strip())
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
def write_jsonl(file_path, lines):
|
|
||||||
"""Create a .jsonl file and dump contents.
|
|
||||||
|
|
||||||
file_path (unicode / Path): The path to the output file.
|
|
||||||
lines (list): The JSON-serializable contents of each line.
|
|
||||||
"""
|
|
||||||
data = [json_dumps(line) for line in lines]
|
|
||||||
with Path(file_path).open("w", encoding="utf-8") as f:
|
|
||||||
f.write("\n".join(data))
|
|
||||||
|
|
||||||
|
|
||||||
def is_json_serializable(obj):
|
|
||||||
"""Check if a Python object is JSON-serializable."""
|
|
||||||
if hasattr(obj, "__call__"):
|
|
||||||
# Check this separately here to prevent infinite recursions
|
|
||||||
return False
|
|
||||||
try:
|
|
||||||
ujson.dumps(obj)
|
|
||||||
return True
|
|
||||||
except TypeError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def to_bytes(getters, exclude):
|
def to_bytes(getters, exclude):
|
||||||
serialized = OrderedDict()
|
serialized = OrderedDict()
|
||||||
for key, getter in getters.items():
|
for key, getter in getters.items():
|
||||||
if key not in exclude:
|
if key not in exclude:
|
||||||
serialized[key] = getter()
|
serialized[key] = getter()
|
||||||
return msgpack.dumps(serialized, use_bin_type=True)
|
return srsly.msgpack_dumps(serialized)
|
||||||
|
|
||||||
|
|
||||||
def from_bytes(bytes_data, setters, exclude):
|
def from_bytes(bytes_data, setters, exclude):
|
||||||
msg = msgpack.loads(bytes_data, raw=False)
|
msg = srsly.msgpack_loads(bytes_data)
|
||||||
for key, setter in setters.items():
|
for key, setter in setters.items():
|
||||||
if key not in exclude and key in msg:
|
if key not in exclude and key in msg:
|
||||||
setter(msg[key])
|
setter(msg[key])
|
||||||
|
|
|
@ -4,9 +4,7 @@ from __future__ import unicode_literals
|
||||||
import functools
|
import functools
|
||||||
import numpy
|
import numpy
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
import srsly
|
||||||
from .util import msgpack
|
|
||||||
from .util import msgpack_numpy
|
|
||||||
|
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
@ -353,7 +351,7 @@ cdef class Vectors:
|
||||||
save_array = lambda arr, file_: xp.save(file_, arr)
|
save_array = lambda arr, file_: xp.save(file_, arr)
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('vectors', lambda p: save_array(self.data, p.open('wb'))),
|
('vectors', lambda p: save_array(self.data, p.open('wb'))),
|
||||||
('key2row', lambda p: msgpack.dump(self.key2row, p.open('wb')))
|
('key2row', lambda p: srsly.write_msgpack(p, self.key2row))
|
||||||
))
|
))
|
||||||
return util.to_disk(path, serializers, exclude)
|
return util.to_disk(path, serializers, exclude)
|
||||||
|
|
||||||
|
@ -366,8 +364,7 @@ cdef class Vectors:
|
||||||
"""
|
"""
|
||||||
def load_key2row(path):
|
def load_key2row(path):
|
||||||
if path.exists():
|
if path.exists():
|
||||||
with path.open('rb') as file_:
|
self.key2row = srsly.read_msgpack(path)
|
||||||
self.key2row = msgpack.load(file_)
|
|
||||||
for key, row in self.key2row.items():
|
for key, row in self.key2row.items():
|
||||||
if self._unset.count(row):
|
if self._unset.count(row):
|
||||||
self._unset.erase(self._unset.find(row))
|
self._unset.erase(self._unset.find(row))
|
||||||
|
@ -401,9 +398,9 @@ cdef class Vectors:
|
||||||
if hasattr(self.data, 'to_bytes'):
|
if hasattr(self.data, 'to_bytes'):
|
||||||
return self.data.to_bytes()
|
return self.data.to_bytes()
|
||||||
else:
|
else:
|
||||||
return msgpack.dumps(self.data)
|
return srsly.msgpack_dumps(self.data)
|
||||||
serializers = OrderedDict((
|
serializers = OrderedDict((
|
||||||
('key2row', lambda: msgpack.dumps(self.key2row)),
|
('key2row', lambda: srsly.msgpack_dumps(self.key2row)),
|
||||||
('vectors', serialize_weights)
|
('vectors', serialize_weights)
|
||||||
))
|
))
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
@ -419,10 +416,10 @@ cdef class Vectors:
|
||||||
if hasattr(self.data, 'from_bytes'):
|
if hasattr(self.data, 'from_bytes'):
|
||||||
self.data.from_bytes()
|
self.data.from_bytes()
|
||||||
else:
|
else:
|
||||||
self.data = msgpack.loads(b)
|
self.data = srsly.msgpack_loads(b)
|
||||||
|
|
||||||
deserializers = OrderedDict((
|
deserializers = OrderedDict((
|
||||||
('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
|
('key2row', lambda b: self.key2row.update(srsly.msgpack_loads(b))),
|
||||||
('vectors', deserialize_weights)
|
('vectors', deserialize_weights)
|
||||||
))
|
))
|
||||||
util.from_bytes(data, deserializers, exclude)
|
util.from_bytes(data, deserializers, exclude)
|
||||||
|
|
|
@ -9,10 +9,9 @@ p
|
||||||
| underscore, e.g. #[code unicode_].
|
| underscore, e.g. #[code unicode_].
|
||||||
|
|
||||||
+aside-code("Example").
|
+aside-code("Example").
|
||||||
from spacy.compat import unicode_, json_dumps
|
from spacy.compat import unicode_
|
||||||
|
|
||||||
compatible_unicode = unicode_('hello world')
|
compatible_unicode = unicode_('hello world')
|
||||||
compatible_json = json_dumps({'key': 'value'})
|
|
||||||
|
|
||||||
+table(["Name", "Python 2", "Python 3"])
|
+table(["Name", "Python 2", "Python 3"])
|
||||||
+row
|
+row
|
||||||
|
@ -35,11 +34,6 @@ p
|
||||||
+cell #[code raw_input]
|
+cell #[code raw_input]
|
||||||
+cell #[code input]
|
+cell #[code input]
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code compat.json_dumps]
|
|
||||||
+cell #[code ujson.dumps] with #[code .decode('utf8')]
|
|
||||||
+cell #[code ujson.dumps]
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code compat.path2str]
|
+cell #[code compat.path2str]
|
||||||
+cell #[code str(path)] with #[code .decode('utf8')]
|
+cell #[code str(path)] with #[code .decode('utf8')]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user