Serialize all attrs by default

Move converters under spacy.gold

Move things around

Fix naming

Fix name

Update converter to produce DocBin

Update converters

Make spacy convert output docbin

Fix import

Fix docbin

Fix import

Update converter

Remove jsonl converter

Add json2docs converter
Matthew Honnibal 2020-06-20 15:59:39 +02:00
parent 5467cb4aae
commit a5ebfb20f5
10 changed files with 212 additions and 135 deletions

View File

@@ -2,24 +2,27 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
+import sys
 
-from .converters import conllu2json, iob2json, conll_ner2json
-from .converters import ner_jsonl2json
+from ..tokens import DocBin
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
-    "conllubio": conllu2json,
-    "conllu": conllu2json,
-    "conll": conllu2json,
-    "ner": conll_ner2json,
-    "iob": iob2json,
-    "jsonl": ner_jsonl2json,
+    #"conllubio": conllu2docs, TODO
+    #"conllu": conllu2docs, TODO
+    #"conll": conllu2docs, TODO
+    "ner": conll_ner2docs,
+    "iob": iob2docs,
+    "json": json2docs,
 }
 
 # File types
 FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")
@@ -27,62 +30,35 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
 def convert(
     # fmt: off
-    input_file: ("Input file", "positional", None, str),
-    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
-    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    input_path: ("Input file or directory", "positional", None, Path),
+    output_dir: ("Output directory.", "positional", None, Path),
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
     n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
     merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
-    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
     # fmt: on
 ):
     """
-    Convert files into JSON format for use with train command and other
-    experiment management functions. If no output_dir is specified, the data
-    is written to stdout, so you can pipe them forward to a JSON file:
-    $ spacy convert some_file.conllu > some_file.json
+    Convert files into json or DocBin format for use with train command and other
+    experiment management functions.
     """
+    cli_args = locals()
     no_print = output_dir == "-"
+    output_dir = Path(output_dir) if output_dir != "-" else "-"
     msg = Printer(no_print=no_print)
-    input_path = Path(input_file)
-    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
-        # TODO: support msgpack via stdout in srsly?
-        msg.fail(
-            f"Can't write .{file_type} data to stdout",
-            "Please specify an output directory.",
-            exits=1,
-        )
-    if not input_path.exists():
-        msg.fail("Input file not found", input_path, exits=1)
-    if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail("Output directory not found", output_dir, exits=1)
-    input_data = input_path.open("r", encoding="utf-8").read()
-    if converter == "auto":
-        converter = input_path.suffix[1:]
-    if converter == "ner" or converter == "iob":
-        converter_autodetect = autodetect_ner_format(input_data)
-        if converter_autodetect == "ner":
-            msg.info("Auto-detected token-per-line NER format")
-            converter = converter_autodetect
-        elif converter_autodetect == "iob":
-            msg.info("Auto-detected sentence-per-line NER format")
-            converter = converter_autodetect
-        else:
-            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
-            )
-    if converter not in CONVERTERS:
-        msg.fail(f"Can't find converter for {converter}", exits=1)
-    ner_map = None
-    if ner_map_path is not None:
-        ner_map = srsly.read_json(ner_map_path)
-    # Use converter function to convert data
-    func = CONVERTERS[converter]
-    data = func(
-        input_data,
-        n_sents=n_sents,
-        seg_sents=seg_sents,
+    verify_cli_args(msg, **cli_args)
+    converter = _get_converter(msg, converter, input_path)
+    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
+    for input_loc in walk_directory(input_path):
+        input_data = input_loc.open("r", encoding="utf-8").read()
+        # Use converter function to convert data
+        func = CONVERTERS[converter]
+        docs = func(
+            input_data,
+            n_sents=n_sents,
+            seg_sents=seg_sents,
@@ -93,23 +69,19 @@ def convert(
             no_print=no_print,
             ner_map=ner_map,
         )
-    if output_dir != "-":
-        # Export data to a file
         suffix = f".{file_type}"
-        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
-        if file_type == "json":
-            srsly.write_json(output_file, data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl(output_file, data)
-        elif file_type == "msg":
-            srsly.write_msgpack(output_file, data)
-        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
-    else:
-        # Print to stdout
-        if file_type == "json":
-            srsly.write_json("-", data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl("-", data)
+        subpath = input_loc.relative_to(input_path)
+        output_file = (output_dir / subpath).with_suffix(suffix)
+        if not output_file.parent.exists():
+            output_file.parent.mkdir(parents=True)
+        if file_type == "json":
+            data = docs2json(docs)
+            srsly.write_json(output_file, docs2json(docs))
+        else:
+            data = DocBin(docs=docs).to_bytes()
+            with output_file.open("wb") as file_:
+                file_.write(data)
+        msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 
 
 def autodetect_ner_format(input_data):
@@ -129,3 +101,102 @@ def autodetect_ner_format(input_data):
     if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
         return "iob"
     return None
+
+
+def walk_directory(path):
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        else:
+            locs.append(path)
+    return locs
+
+
+def verify_cli_args(
+    msg,
+    input_path,
+    output_dir,
+    file_type,
+    n_sents,
+    seg_sents,
+    model,
+    morphology,
+    merge_subtokens,
+    converter,
+    ner_map,
+    lang
+):
+    if converter == "ner" or converter == "iob":
+        input_data = input_path.open("r", encoding="utf-8").read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not",
+                "succeed. See https://spacy.io/api/cli#convert"
+            )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            f"Can't write .{file_type} data to stdout",
+            "Please specify an output directory.",
+            exits=1,
+        )
+    if not input_path.exists():
+        msg.fail("Input file not found", input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail("Output directory not found", output_dir, exits=1)
+    if input_path.is_dir():
+        input_locs = walk_directory(input_path)
+        if len(input_locs) == 0:
+            msg.fail("No input files in directory", input_path, exits=1)
+        file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+        if len(file_types) >= 2:
+            file_types = ",".join(file_types)
+            msg.fail("All input files must be same type", file_types, exits=1)
+        if converter == "auto":
+            converter = file_types[0]
+    else:
+        converter = input_path.suffix[1:]
+    if converter not in CONVERTERS:
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    return converter
+
+
+def _get_converter(msg, converter, input_path):
+    if input_path.is_dir():
+        input_path = walk_directory(input_path)[0]
+    if converter == "auto":
+        converter = input_path.suffix[1:]
+    if converter == "ner" or converter == "iob":
+        with input_path.open() as file_:
+            input_data = file_.read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. "
+                "Conversion may not succeed. "
+                "See https://spacy.io/api/cli#convert"
+            )
+    return converter
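
Taken together, the changes above make spacy convert walk the input path, run one of the Doc-returning converters on each file, and write either docs2json output or raw DocBin bytes. A minimal sketch of that per-file flow, assuming an illustrative CoNLL NER input file and output path (not part of the diff):

    from pathlib import Path
    from spacy.tokens import DocBin
    from spacy.gold.converters import conll_ner2docs

    # Run one converter on one file, the way the new convert() loop does.
    input_data = Path("train.conll").read_text(encoding="utf-8")
    docs = conll_ner2docs(input_data, n_sents=10)

    # With the new default file_type of "spacy", the CLI writes DocBin bytes.
    Path("train.spacy").write_bytes(DocBin(docs=docs).to_bytes())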

View File

@@ -1,4 +0,0 @@
-from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2docs import ner_jsonl2json  # noqa: F401

View File

@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output

View File

@@ -0,0 +1,6 @@
+from .iob2docs import iob2docs  # noqa: F401
+from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .json2docs import json2docs
+
+# TODO: Update this one
+#from .conllu2docs import conllu2docs  # noqa: F401
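
With this package in place, the converters live under spacy.gold rather than spacy.cli. A hedged import sketch reflecting the new layout (the old spacy.cli.converters path removed above no longer exists):

    from spacy.gold.converters import conll_ner2docs, iob2docs, json2docs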

View File

@@ -7,7 +7,7 @@ from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2doc(
+def conll_ner2docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """

View File

@@ -3,7 +3,7 @@ from wasabi import Printer
 from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
 from .util import merge_sentences
-from .conll_ner2json import n_sents_info
+from .conll_ner2docs import n_sents_info
 
 
 def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):

View File

@@ -0,0 +1,38 @@
+import tempfile
+import contextlib
+import shutil
+from pathlib import Path
+from ..gold_io import read_json_file
+from ..example import annotations2doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.xx import MultiLanguage
+
+
+@contextlib.contextmanager
+def make_tempdir():
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def json2docs(
+    input_data,
+    model=None,
+    **kwargs
+):
+    nlp = load_model(model) if model is not None else MultiLanguage()
+    docs = []
+    with make_tempdir() as tmp_dir:
+        json_path = Path(tmp_dir) / "data.json"
+        with (json_path).open("w") as file_:
+            file_.write(input_data)
+        for json_annot in read_json_file(json_path):
+            example_dict = _fix_legacy_dict_data(json_annot)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(
+                nlp.vocab,
+                tok_dict,
+                doc_dict
+            )
+            docs.append(doc)
+    return docs
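
For context, a hedged usage sketch of the new converter: the input is a string in spaCy's v2 JSON training format (the tags, heads and BILUO NER labels below are illustrative, not taken from the diff), and the result is a list of Doc objects rather than JSON dicts:

    from spacy.gold.converters import json2docs

    # Illustrative v2-style JSON training data for a single document.
    input_data = """[{
        "id": 0,
        "paragraphs": [{
            "raw": "I like London.",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                    {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                    {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-GPE"},
                    {"id": 3, "orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"}
                ]
            }]
        }]
    }]"""

    docs = json2docs(input_data)
    for doc in docs:
        print([(ent.text, ent.label_) for ent in doc.ents])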

View File

@@ -0,0 +1,5 @@
+def merge_sentences(docs, n_sents):
+    merged = []
+    for group in minibatch(docs, size=n_sents):
+        raise NotImplementedError
+    return merged

View File

@@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 
+
+ALL_ATTRS = (
+    "ORTH",
+    "TAG",
+    "HEAD",
+    "DEP",
+    "SENT_START",
+    "ENT_IOB",
+    "ENT_TYPE",
+    "LEMMA",
+    "MORPH"
+)
+
 
 class DocBin(object):
     """Pack Doc objects for binary serialization.
@@ -39,7 +52,7 @@ class DocBin(object):
     document from the DocBin.
     """
 
-    def __init__(self, attrs=None, store_user_data=False, docs=[]):
+    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
         """Create a DocBin object to hold serialized annotations.
 
         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
@@ -49,7 +62,6 @@ class DocBin(object):
         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = attrs or []
         attrs = sorted([intify_attr(attr) for attr in attrs])
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
@@ -60,7 +72,7 @@ class DocBin(object):
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
-            self.add(docs)
+            self.add(doc)
 
     def __len__(self):
         """RETURNS: The number of Doc objects added to the DocBin."""