Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-05 22:20:34 +03:00)
Serialize all attrs by default

* Move converters under spacy.gold
* Move things around
* Fix naming
* Fix name
* Update converter to produce DocBin
* Update converters
* Make spacy convert output docbin
* Fix import
* Fix docbin
* Fix import
* Update converter
* Remove jsonl converter
* Add json2docs converter
parent 5467cb4aae
commit a5ebfb20f5
spacy/cli/convert.py
@@ -2,24 +2,27 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
 import sys

-from .converters import conllu2json, iob2json, conll_ner2json
-from .converters import ner_jsonl2json
+from ..tokens import DocBin
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.

 CONVERTERS = {
-    "conllubio": conllu2json,
-    "conllu": conllu2json,
-    "conll": conllu2json,
-    "ner": conll_ner2json,
-    "iob": iob2json,
-    "jsonl": ner_jsonl2json,
+    #"conllubio": conllu2docs, TODO
+    #"conllu": conllu2docs, TODO
+    #"conll": conllu2docs, TODO
+    "ner": conll_ner2docs,
+    "iob": iob2docs,
+    "json": json2docs,
 }

 # File types
 FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")
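Note: CONVERTERS is a plain extension-keyed dispatch table. A minimal standalone sketch of that pattern, with stub converter functions standing in for the real iob2docs/json2docs (names and behaviour here are illustrative, not the branch's code):

from pathlib import Path

# Stub converters standing in for the real iob2docs / json2docs entries.
def iob2docs(input_data, **kwargs):
    return [f"doc from iob ({len(input_data)} chars)"]

def json2docs(input_data, **kwargs):
    return [f"doc from json ({len(input_data)} chars)"]

CONVERTERS = {"iob": iob2docs, "json": json2docs}

def convert_one(path, converter="auto"):
    # "auto" falls back to the file extension, as the convert CLI does.
    if converter == "auto":
        converter = Path(path).suffix[1:]
    if converter not in CONVERTERS:
        raise ValueError(f"Can't find converter for {converter}")
    return CONVERTERS[converter](Path(path).read_text(encoding="utf-8"))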
@@ -27,89 +30,58 @@ FILE_TYPES_STDOUT = ("json", "jsonl")


 def convert(
     # fmt: off
-    input_file: ("Input file", "positional", None, str),
-    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
-    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    input_path: ("Input file or directory", "positional", None, Path),
+    output_dir: ("Output directory.", "positional", None, Path),
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
     n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
     merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
-    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
     # fmt: on
 ):
     """
-    Convert files into JSON format for use with train command and other
-    experiment management functions. If no output_dir is specified, the data
-    is written to stdout, so you can pipe them forward to a JSON file:
-    $ spacy convert some_file.conllu > some_file.json
+    Convert files into json or DocBin format for use with train command and other
+    experiment management functions.
     """
+    cli_args = locals()
     no_print = output_dir == "-"
+    output_dir = Path(output_dir) if output_dir != "-" else "-"
     msg = Printer(no_print=no_print)
-    input_path = Path(input_file)
-    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
-        # TODO: support msgpack via stdout in srsly?
-        msg.fail(
-            f"Can't write .{file_type} data to stdout",
-            "Please specify an output directory.",
-            exits=1,
-        )
+    verify_cli_args(msg, **cli_args)
+    converter = _get_converter(msg, converter, input_path)
+    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
+    for input_loc in walk_directory(input_path):
+        input_data = input_loc.open("r", encoding="utf-8").read()
+        # Use converter function to convert data
+        func = CONVERTERS[converter]
+        docs = func(
+            input_data,
+            n_sents=n_sents,
+            seg_sents=seg_sents,
+            append_morphology=morphology,
+            merge_subtokens=merge_subtokens,
+            lang=lang,
+            model=model,
+            no_print=no_print,
+            ner_map=ner_map,
+        )
-    if not input_path.exists():
-        msg.fail("Input file not found", input_path, exits=1)
-    if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail("Output directory not found", output_dir, exits=1)
-    input_data = input_path.open("r", encoding="utf-8").read()
-    if converter == "auto":
-        converter = input_path.suffix[1:]
-    if converter == "ner" or converter == "iob":
-        converter_autodetect = autodetect_ner_format(input_data)
-        if converter_autodetect == "ner":
-            msg.info("Auto-detected token-per-line NER format")
-            converter = converter_autodetect
-        elif converter_autodetect == "iob":
-            msg.info("Auto-detected sentence-per-line NER format")
-            converter = converter_autodetect
-        else:
-            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
-            )
-    if converter not in CONVERTERS:
-        msg.fail(f"Can't find converter for {converter}", exits=1)
-    ner_map = None
-    if ner_map_path is not None:
-        ner_map = srsly.read_json(ner_map_path)
-    # Use converter function to convert data
-    func = CONVERTERS[converter]
-    data = func(
-        input_data,
-        n_sents=n_sents,
-        seg_sents=seg_sents,
-        append_morphology=morphology,
-        merge_subtokens=merge_subtokens,
-        lang=lang,
-        model=model,
-        no_print=no_print,
-        ner_map=ner_map,
-    )
     if output_dir != "-":
         # Export data to a file
         suffix = f".{file_type}"
-        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
+        subpath = input_loc.relative_to(input_path)
+        output_file = (output_dir / subpath).with_suffix(suffix)
+        if not output_file.parent.exists():
+            output_file.parent.mkdir(parents=True)
         if file_type == "json":
-            srsly.write_json(output_file, data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl(output_file, data)
-        elif file_type == "msg":
-            srsly.write_msgpack(output_file, data)
-        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
-    else:
-        # Print to stdout
-        if file_type == "json":
-            srsly.write_json("-", data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl("-", data)
+            data = docs2json(docs)
+            srsly.write_json(output_file, docs2json(docs))
+        else:
+            data = DocBin(docs=docs).to_bytes()
+            with output_file.open("wb") as file_:
+                file_.write(data)
+        msg.good(f"Generated output file ({len(docs)} documents): {output_file}")


 def autodetect_ner_format(input_data):
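Note: the else-branch above is what makes spacy convert emit DocBin data by default (file_type now defaults to "spacy"); docs2json appears here without an import in this hunk, presumably supplied elsewhere on the branch. A hedged round-trip sketch of the binary output format, assuming an install where DocBin behaves as on this branch:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Berlin is a city."), nlp("She works at Acme Corp.")]

# Mirror of the else-branch above: pack the Docs and write raw bytes.
data = DocBin(docs=docs).to_bytes()
with open("sample.spacy", "wb") as file_:
    file_.write(data)

# Reading the bytes back restores the Docs against a shared vocab.
with open("sample.spacy", "rb") as file_:
    doc_bin = DocBin().from_bytes(file_.read())
restored = list(doc_bin.get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]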
@@ -129,3 +101,102 @@ def autodetect_ner_format(input_data):
     if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
         return "iob"
     return None
+
+
+def walk_directory(path):
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        else:
+            locs.append(path)
+    return locs
+
+
+def verify_cli_args(
+    msg,
+    input_path,
+    output_dir,
+    file_type,
+    n_sents,
+    seg_sents,
+    model,
+    morphology,
+    merge_subtokens,
+    converter,
+    ner_map,
+    lang
+):
+    if converter == "ner" or converter == "iob":
+        input_data = input_path.open("r", encoding="utf-8").read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not",
+                "succeed. See https://spacy.io/api/cli#convert"
+            )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            f"Can't write .{file_type} data to stdout",
+            "Please specify an output directory.",
+            exits=1,
+        )
+    if not input_path.exists():
+        msg.fail("Input file not found", input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail("Output directory not found", output_dir, exits=1)
+    if input_path.is_dir():
+        input_locs = walk_directory(input_path)
+        if len(input_locs) == 0:
+            msg.fail("No input files in directory", input_path, exits=1)
+        file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+        if len(file_types) >= 2:
+            file_types = ",".join(file_types)
+            msg.fail("All input files must be same type", file_types, exits=1)
+        if converter == "auto":
+            converter = file_types[0]
+    else:
+        converter = input_path.suffix[1:]
+    if converter not in CONVERTERS:
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    return converter
+
+
+def _get_converter(msg, converter, input_path):
+    if input_path.is_dir():
+        input_path = walk_directory(input_path)[0]
+    if converter == "auto":
+        converter = input_path.suffix[1:]
+    if converter == "ner" or converter == "iob":
+        with input_path.open() as file_:
+            input_data = file_.read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. "
+                "Conversion may not succeed. "
+                "See https://spacy.io/api/cli#convert"
+            )
+    return converter
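Note: walk_directory recurses by appending to the list it is iterating over, and skips anything whose final path component starts with a dot. A self-contained check of that contract (the temp-dir layout below is hypothetical):

from pathlib import Path
import tempfile

def walk_directory(path):
    # Same logic as the added function above: expand directories in place,
    # skip dotfiles, collect plain files.
    if not path.is_dir():
        return [path]
    paths = [path]
    locs = []
    seen = set()
    for path in paths:
        if str(path) in seen:
            continue
        seen.add(str(path))
        if path.parts[-1].startswith("."):
            continue
        elif path.is_dir():
            paths.extend(path.iterdir())
        else:
            locs.append(path)
    return locs

with tempfile.TemporaryDirectory() as d:
    root = Path(d)
    (root / "a.json").write_text("{}")
    (root / ".hidden.json").write_text("{}")  # skipped: starts with "."
    (root / "sub").mkdir()
    (root / "sub" / "b.json").write_text("{}")  # found via recursion
    found = sorted(p.name for p in walk_directory(root))
    assert found == ["a.json", "b.json"]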
spacy/cli/converters/__init__.py (deleted)
@@ -1,4 +0,0 @@
-from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2docs import ner_jsonl2json  # noqa: F401
spacy/cli/converters/jsonl2docs.py (deleted)
@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output
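Note: the for/else idiom in the deleted _cleanup_spans is easy to misread: the else branch runs only when the loop over a span's token indices finishes without hitting a previously claimed index, so only fully non-overlapping spans are kept. A self-contained sketch of the same dedup logic with plain (start, end) tuples:

def cleanup_ranges(ranges):
    # Keep a range only if none of its positions were claimed earlier.
    output = []
    seen = set()
    for start, end in ranges:
        for i in range(start, end):
            if i in seen:
                break
        else:  # no break: this range is disjoint from everything kept so far
            output.append((start, end))
            seen.update(range(start, end))
    return output

assert cleanup_ranges([(0, 3), (2, 5), (5, 8)]) == [(0, 3), (5, 8)]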
spacy/gold/converters/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
+from .iob2docs import iob2docs  # noqa: F401
+from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .json2docs import json2docs
+
+# TODO: Update this one
+#from .conllu2docs import conllu2docs  # noqa: F401
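Note: with this package in place, the import surface moves from spacy.cli.converters to spacy.gold.converters. Downstream code on this dev branch (and only this branch) would do:

from spacy.gold.converters import iob2docs, conll_ner2docs, json2docs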
spacy/gold/converters/conll_ner2docs.py
@@ -7,7 +7,7 @@ from ...vocab import Vocab
 from ...util import load_model


-def conll_ner2doc(
+def conll_ner2docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
spacy/gold/converters/iob2docs.py
@@ -3,7 +3,7 @@ from wasabi import Printer
 from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
 from .util import merge_sentences
-from .conll_ner2json import n_sents_info
+from .conll_ner2docs import n_sents_info


 def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
spacy/gold/converters/json2docs.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+import tempfile
+import contextlib
+import shutil
+from pathlib import Path
+from ..gold_io import read_json_file
+from ..example import annotations2doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.xx import MultiLanguage
+
+@contextlib.contextmanager
+def make_tempdir():
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def json2docs(
+    input_data,
+    model=None,
+    **kwargs
+):
+    nlp = load_model(model) if model is not None else MultiLanguage()
+    docs = []
+    with make_tempdir() as tmp_dir:
+        json_path = Path(tmp_dir) / "data.json"
+        with (json_path).open("w") as file_:
+            file_.write(input_data)
+        for json_annot in read_json_file(json_path):
+            example_dict = _fix_legacy_dict_data(json_annot)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(
+                nlp.vocab,
+                tok_dict,
+                doc_dict
+            )
+            docs.append(doc)
+    return docs
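Note: json2docs takes the raw text of a v2-style JSON training file (not a parsed object) and round-trips it through a temp file so read_json_file can consume it, falling back to the xx MultiLanguage pipeline when no model is given. A hedged usage sketch, valid only on this branch and with a hypothetical input path:

from spacy.gold.converters import json2docs

# json2docs wants the raw string, not json.load()ed data.
with open("training_data.json", encoding="utf-8") as file_:
    input_data = file_.read()

docs = json2docs(input_data)  # model=None -> MultiLanguage() tokenization
print(f"Converted {len(docs)} docs")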
spacy/gold/converters/util.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+def merge_sentences(docs, n_sents):
+    merged = []
+    for group in minibatch(docs, size=n_sents):
+        raise NotImplementedError
+    return merged
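Note: merge_sentences is still a stub here — it raises on the first group, and minibatch is not imported in the new util.py. The grouping it is meant to build on looks like this, a sketch assuming spacy.util.minibatch's fixed-size behaviour:

from itertools import islice

def minibatch_sketch(items, size):
    # Same idea as spacy.util.minibatch with a fixed size:
    # yield consecutive groups of up to `size` items.
    items = iter(items)
    while True:
        batch = list(islice(items, size))
        if not batch:
            return
        yield batch

assert list(minibatch_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]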
spacy/tokens/_serialize.py
@@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors


+ALL_ATTRS = (
+    "ORTH",
+    "TAG",
+    "HEAD",
+    "DEP",
+    "SENT_START",
+    "ENT_IOB",
+    "ENT_TYPE",
+    "LEMMA",
+    "MORPH"
+)
+
+
 class DocBin(object):
     """Pack Doc objects for binary serialization.

@@ -39,7 +52,7 @@ class DocBin(object):
     document from the DocBin.
     """

-    def __init__(self, attrs=None, store_user_data=False, docs=[]):
+    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
         """Create a DocBin object to hold serialized annotations.

         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are

@@ -49,7 +62,6 @@ class DocBin(object):

         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = attrs or []
         attrs = sorted([intify_attr(attr) for attr in attrs])
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]

@@ -60,7 +72,7 @@ class DocBin(object):
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
-            self.add(docs)
+            self.add(doc)

     def __len__(self):
         """RETURNS: The number of Doc objects added to the DocBin."""
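Note: this hunk is the change the commit title names, and it also fixes the docs loop (self.add(docs) added the whole list instead of each doc). Previously attrs=None collapsed to [ORTH] plus the implicit SPACY whitespace flag, so a bare DocBin dropped tags, heads, deps, entities, lemmas and morphs unless the caller listed them; with ALL_ATTRS as the default they all round-trip. A hedged sketch, assuming an install of this branch:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc = nlp("All attributes now serialize by default.")

# A bare DocBin() is now equivalent to DocBin(attrs=ALL_ATTRS); before this
# commit the default kept only ORTH (plus SPACY), losing other annotations.
doc_bin = DocBin()
doc_bin.add(doc)
restored = list(DocBin().from_bytes(doc_bin.to_bytes()).get_docs(nlp.vocab))
assert restored[0].text == doc.text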