Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter
Matthew Honnibal 2020-06-20 15:59:39 +02:00
parent 5467cb4aae
commit a5ebfb20f5
10 changed files with 212 additions and 135 deletions

View File

@@ -2,24 +2,27 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
+import sys
 
-from .converters import conllu2json, iob2json, conll_ner2json
-from .converters import ner_jsonl2json
+from ..tokens import DocBin
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
-    "conllubio": conllu2json,
-    "conllu": conllu2json,
-    "conll": conllu2json,
-    "ner": conll_ner2json,
-    "iob": iob2json,
-    "jsonl": ner_jsonl2json,
+    #"conllubio": conllu2docs, TODO
+    #"conllu": conllu2docs, TODO
+    #"conll": conllu2docs, TODO
+    "ner": conll_ner2docs,
+    "iob": iob2docs,
+    "json": json2docs,
 }
 
 # File types
 FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")
@@ -27,62 +30,35 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
 def convert(
     # fmt: off
-    input_file: ("Input file", "positional", None, str),
-    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
-    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    input_path: ("Input file or directory", "positional", None, Path),
+    output_dir: ("Output directory.", "positional", None, Path),
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
     n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
     merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
-    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
     # fmt: on
 ):
     """
-    Convert files into JSON format for use with train command and other
-    experiment management functions. If no output_dir is specified, the data
-    is written to stdout, so you can pipe them forward to a JSON file:
-    $ spacy convert some_file.conllu > some_file.json
+    Convert files into json or DocBin format for use with train command and other
+    experiment management functions.
     """
+    cli_args = locals()
     no_print = output_dir == "-"
+    output_dir = Path(output_dir) if output_dir != "-" else "-"
     msg = Printer(no_print=no_print)
-    input_path = Path(input_file)
-    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
-        # TODO: support msgpack via stdout in srsly?
-        msg.fail(
-            f"Can't write .{file_type} data to stdout",
-            "Please specify an output directory.",
-            exits=1,
-        )
-    if not input_path.exists():
-        msg.fail("Input file not found", input_path, exits=1)
-    if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail("Output directory not found", output_dir, exits=1)
-    input_data = input_path.open("r", encoding="utf-8").read()
-    if converter == "auto":
-        converter = input_path.suffix[1:]
-    if converter == "ner" or converter == "iob":
-        converter_autodetect = autodetect_ner_format(input_data)
-        if converter_autodetect == "ner":
-            msg.info("Auto-detected token-per-line NER format")
-            converter = converter_autodetect
-        elif converter_autodetect == "iob":
-            msg.info("Auto-detected sentence-per-line NER format")
-            converter = converter_autodetect
-        else:
-            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
-            )
-    if converter not in CONVERTERS:
-        msg.fail(f"Can't find converter for {converter}", exits=1)
-    ner_map = None
-    if ner_map_path is not None:
-        ner_map = srsly.read_json(ner_map_path)
-    # Use converter function to convert data
-    func = CONVERTERS[converter]
-    data = func(
-        input_data,
-        n_sents=n_sents,
-        seg_sents=seg_sents,
+    verify_cli_args(msg, **cli_args)
+    converter = _get_converter(msg, converter, input_path)
+    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
+    for input_loc in walk_directory(input_path):
+        input_data = input_loc.open("r", encoding="utf-8").read()
+        # Use converter function to convert data
+        func = CONVERTERS[converter]
+        docs = func(
+            input_data,
+            n_sents=n_sents,
+            seg_sents=seg_sents,
@@ -93,23 +69,19 @@ def convert(
             no_print=no_print,
             ner_map=ner_map,
         )
-    if output_dir != "-":
-        # Export data to a file
         suffix = f".{file_type}"
-        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
-        if file_type == "json":
-            srsly.write_json(output_file, data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl(output_file, data)
-        elif file_type == "msg":
-            srsly.write_msgpack(output_file, data)
-        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
-    else:
-        # Print to stdout
-        if file_type == "json":
-            srsly.write_json("-", data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl("-", data)
+        subpath = input_loc.relative_to(input_path)
+        output_file = (output_dir / subpath).with_suffix(suffix)
+        if not output_file.parent.exists():
+            output_file.parent.mkdir(parents=True)
+        if file_type == "json":
+            data = docs2json(docs)
+            srsly.write_json(output_file, docs2json(docs))
+        else:
+            data = DocBin(docs=docs).to_bytes()
+            with output_file.open("wb") as file_:
+                file_.write(data)
+        msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 
 
 def autodetect_ner_format(input_data):
@@ -129,3 +101,102 @@ def autodetect_ner_format(input_data):
     if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
         return "iob"
     return None
+
+
+def walk_directory(path):
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        else:
+            locs.append(path)
+    return locs
+
+
+def verify_cli_args(
+    msg,
+    input_path,
+    output_dir,
+    file_type,
+    n_sents,
+    seg_sents,
+    model,
+    morphology,
+    merge_subtokens,
+    converter,
+    ner_map,
+    lang
+):
+    if converter == "ner" or converter == "iob":
+        input_data = input_path.open("r", encoding="utf-8").read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not",
+                "succeed. See https://spacy.io/api/cli#convert"
+            )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            f"Can't write .{file_type} data to stdout",
+            "Please specify an output directory.",
+            exits=1,
+        )
+    if not input_path.exists():
+        msg.fail("Input file not found", input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail("Output directory not found", output_dir, exits=1)
+    if input_path.is_dir():
+        input_locs = walk_directory(input_path)
+        if len(input_locs) == 0:
+            msg.fail("No input files in directory", input_path, exits=1)
+        file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+        if len(file_types) >= 2:
+            file_types = ",".join(file_types)
+            msg.fail("All input files must be same type", file_types, exits=1)
+        if converter == "auto":
+            converter = file_types[0]
+    else:
+        converter = input_path.suffix[1:]
+    if converter not in CONVERTERS:
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    return converter
+
+
+def _get_converter(msg, converter, input_path):
+    if input_path.is_dir():
+        input_path = walk_directory(input_path)[0]
+    if converter == "auto":
+        converter = input_path.suffix[1:]
+    if converter == "ner" or converter == "iob":
+        with input_path.open() as file_:
+            input_data = file_.read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. "
+                "Conversion may not succeed. "
+                "See https://spacy.io/api/cli#convert"
+            )
+    return converter
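
Taken together, the changes above make spacy convert walk the input path, run one of the Doc-returning converters on each file, and write either docs2json output or raw DocBin bytes. A minimal sketch of that per-file flow, assuming an illustrative CoNLL NER input file and output path (not part of the diff):

    from pathlib import Path
    from spacy.tokens import DocBin
    from spacy.gold.converters import conll_ner2docs

    # Run one converter on one file, the way the new convert() loop does.
    input_data = Path("train.conll").read_text(encoding="utf-8")
    docs = conll_ner2docs(input_data, n_sents=10)

    # With the new default file_type of "spacy", the CLI writes DocBin bytes.
    Path("train.spacy").write_bytes(DocBin(docs=docs).to_bytes())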

View File

@@ -1,4 +0,0 @@
-from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2docs import ner_jsonl2json  # noqa: F401

View File

@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output

View File

@@ -0,0 +1,6 @@
+from .iob2docs import iob2docs  # noqa: F401
+from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .json2docs import json2docs
+
+# TODO: Update this one
+#from .conllu2docs import conllu2docs  # noqa: F401
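
With this package in place, the converters live under spacy.gold rather than spacy.cli. A hedged import sketch reflecting the new layout (the old spacy.cli.converters path removed above no longer exists):

    from spacy.gold.converters import conll_ner2docs, iob2docs, json2docs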

View File

@@ -7,7 +7,7 @@ from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2doc(
+def conll_ner2docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """

View File

@@ -3,7 +3,7 @@ from wasabi import Printer
 from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
 from .util import merge_sentences
-from .conll_ner2json import n_sents_info
+from .conll_ner2docs import n_sents_info
 
 
 def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):

View File

@@ -0,0 +1,38 @@
+import tempfile
+import contextlib
+import shutil
+from pathlib import Path
+from ..gold_io import read_json_file
+from ..example import annotations2doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.xx import MultiLanguage
+
+
+@contextlib.contextmanager
+def make_tempdir():
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def json2docs(
+    input_data,
+    model=None,
+    **kwargs
+):
+    nlp = load_model(model) if model is not None else MultiLanguage()
+    docs = []
+    with make_tempdir() as tmp_dir:
+        json_path = Path(tmp_dir) / "data.json"
+        with (json_path).open("w") as file_:
+            file_.write(input_data)
+        for json_annot in read_json_file(json_path):
+            example_dict = _fix_legacy_dict_data(json_annot)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(
+                nlp.vocab,
+                tok_dict,
+                doc_dict
+            )
+            docs.append(doc)
+    return docs
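
For context, a hedged usage sketch of the new converter: the input is a string in spaCy's v2 JSON training format (the tags, heads and BILUO NER labels below are illustrative, not taken from the diff), and the result is a list of Doc objects rather than JSON dicts:

    from spacy.gold.converters import json2docs

    # Illustrative v2-style JSON training data for a single document.
    input_data = """[{
        "id": 0,
        "paragraphs": [{
            "raw": "I like London.",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                    {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                    {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-GPE"},
                    {"id": 3, "orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"}
                ]
            }]
        }]
    }]"""

    docs = json2docs(input_data)
    for doc in docs:
        print([(ent.text, ent.label_) for ent in doc.ents])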

View File

@@ -0,0 +1,5 @@
+def merge_sentences(docs, n_sents):
+    merged = []
+    for group in minibatch(docs, size=n_sents):
+        raise NotImplementedError
+    return merged

View File

@@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 
+
+ALL_ATTRS = (
+    "ORTH",
+    "TAG",
+    "HEAD",
+    "DEP",
+    "SENT_START",
+    "ENT_IOB",
+    "ENT_TYPE",
+    "LEMMA",
+    "MORPH"
+)
+
 
 class DocBin(object):
     """Pack Doc objects for binary serialization.
@@ -39,7 +52,7 @@ class DocBin(object):
     document from the DocBin.
     """
 
-    def __init__(self, attrs=None, store_user_data=False, docs=[]):
+    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
         """Create a DocBin object to hold serialized annotations.
 
         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
@@ -49,7 +62,6 @@ class DocBin(object):
         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = attrs or []
         attrs = sorted([intify_attr(attr) for attr in attrs])
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
@@ -60,7 +72,7 @@ class DocBin(object):
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
-            self.add(docs)
+            self.add(doc)
 
     def __len__(self):
         """RETURNS: The number of Doc objects added to the DocBin."""