From a5ebfb20f5fd2c7dcd5d46f45948c7996e45b02e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 20 Jun 2020 15:59:39 +0200
Subject: [PATCH] Serialize all attrs by default

Move converters under spacy.gold
Move things around
Fix naming
Fix name
Update converter to produce DocBin
Update converters
Make spacy convert output docbin
Fix import
Fix docbin
Fix import
Update converter
Remove jsonl converter
Add json2docs converter
---
 spacy/cli/convert.py                          | 221 ++++++++++++------
 spacy/cli/converters/__init__.py              |   4 -
 spacy/cli/converters/jsonl2json.py            |  51 ----
 spacy/gold/converters/__init__.py             |   6 +
 .../converters/conll_ner2docs.py}             |   2 +-
 spacy/{cli => gold}/converters/conllu2json.py |   0
 .../converters/iob2docs.py}                   |   2 +-
 spacy/gold/converters/json2docs.py            |  38 +++
 spacy/gold/converters/util.py                 |   8 +
 spacy/tokens/_serialize.py                    |  18 +-
 10 files changed, 215 insertions(+), 135 deletions(-)
 delete mode 100644 spacy/cli/converters/__init__.py
 delete mode 100644 spacy/cli/converters/jsonl2json.py
 create mode 100644 spacy/gold/converters/__init__.py
 rename spacy/{cli/converters/conll_ner2json.py => gold/converters/conll_ner2docs.py} (99%)
 rename spacy/{cli => gold}/converters/conllu2json.py (100%)
 rename spacy/{cli/converters/iob2json.py => gold/converters/iob2docs.py} (97%)
 create mode 100644 spacy/gold/converters/json2docs.py
 create mode 100644 spacy/gold/converters/util.py
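For orientation, the flow this patch introduces is: a converter turns raw annotation text into Doc objects, and DocBin packs those docs for binary serialization, now keeping all annotation attributes by default. A minimal end-to-end sketch against the APIs added below (the file paths are hypothetical, and conll_ner2docs is assumed to be fully wired up):

    from spacy.gold.converters import conll_ner2docs
    from spacy.tokens import DocBin

    # Hypothetical token-per-line CoNLL NER input file.
    with open("train.conll", encoding="utf-8") as f:
        input_data = f.read()

    # Converters now return Doc objects instead of JSON-ready dicts.
    docs = conll_ner2docs(input_data, n_sents=10)

    # DocBin serializes all attrs by default (see _serialize.py below).
    with open("train.spacy", "wb") as out:
        out.write(DocBin(docs=docs).to_bytes())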
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 2ffbeb458..f4bddac39 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -2,24 +2,27 @@ from pathlib import Path
 from wasabi import Printer
 import srsly
 import re
 
-from .converters import conllu2json, iob2json, conll_ner2json
-from .converters import ner_jsonl2json
+from ..tokens import DocBin
+from ..gold import docs_to_json
+from ..gold.converters import iob2docs, conll_ner2docs, json2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
+
 CONVERTERS = {
-    "conllubio": conllu2json,
-    "conllu": conllu2json,
-    "conll": conllu2json,
-    "ner": conll_ner2json,
-    "iob": iob2json,
-    "jsonl": ner_jsonl2json,
+    # "conllubio": conllu2docs,  # TODO
+    # "conllu": conllu2docs,  # TODO
+    # "conll": conllu2docs,  # TODO
+    "ner": conll_ner2docs,
+    "iob": iob2docs,
+    "json": json2docs,
 }
 
+
 # File types
 FILE_TYPES = ("json", "jsonl", "msg")
 FILE_TYPES_STDOUT = ("json", "jsonl")
@@ -27,89 +30,58 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
 
 def convert(
     # fmt: off
-    input_file: ("Input file", "positional", None, str),
-    output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
-    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
+    input_path: ("Input file or directory", "positional", None, Path),
+    output_dir: ("Output directory.", "positional", None, Path),
+    file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "spacy",
     n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
     seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
     model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
     morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
     merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
     converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
-    ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
+    ner_map: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
     lang: ("Language (if tokenizer required)", "option", "l", str) = None,
     # fmt: on
 ):
     """
-    Convert files into JSON format for use with train command and other
-    experiment management functions. If no output_dir is specified, the data
-    is written to stdout, so you can pipe them forward to a JSON file:
-    $ spacy convert some_file.conllu > some_file.json
+    Convert files into JSON or DocBin format for use with the train command
+    and other experiment management functions.
     """
+    cli_args = locals()
     no_print = output_dir == "-"
+    output_dir = Path(output_dir) if output_dir != "-" else "-"
     msg = Printer(no_print=no_print)
-    input_path = Path(input_file)
-    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
-        # TODO: support msgpack via stdout in srsly?
-        msg.fail(
-            f"Can't write .{file_type} data to stdout",
-            "Please specify an output directory.",
-            exits=1,
+    verify_cli_args(msg, **cli_args)
+    converter = _get_converter(msg, converter, input_path)
+    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
+    for input_loc in walk_directory(input_path):
+        input_data = input_loc.open("r", encoding="utf-8").read()
+        # Use converter function to convert data
+        func = CONVERTERS[converter]
+        docs = func(
+            input_data,
+            n_sents=n_sents,
+            seg_sents=seg_sents,
+            append_morphology=morphology,
+            merge_subtokens=merge_subtokens,
+            lang=lang,
+            model=model,
+            no_print=no_print,
+            ner_map=ner_map,
         )
-    if not input_path.exists():
-        msg.fail("Input file not found", input_path, exits=1)
-    if output_dir != "-" and not Path(output_dir).exists():
-        msg.fail("Output directory not found", output_dir, exits=1)
-    input_data = input_path.open("r", encoding="utf-8").read()
-    if converter == "auto":
-        converter = input_path.suffix[1:]
-    if converter == "ner" or converter == "iob":
-        converter_autodetect = autodetect_ner_format(input_data)
-        if converter_autodetect == "ner":
-            msg.info("Auto-detected token-per-line NER format")
-            converter = converter_autodetect
-        elif converter_autodetect == "iob":
-            msg.info("Auto-detected sentence-per-line NER format")
-            converter = converter_autodetect
-        else:
-            msg.warn(
-                "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
-            )
-    if converter not in CONVERTERS:
-        msg.fail(f"Can't find converter for {converter}", exits=1)
-    ner_map = None
-    if ner_map_path is not None:
-        ner_map = srsly.read_json(ner_map_path)
-    # Use converter function to convert data
-    func = CONVERTERS[converter]
-    data = func(
-        input_data,
-        n_sents=n_sents,
-        seg_sents=seg_sents,
-        append_morphology=morphology,
-        merge_subtokens=merge_subtokens,
-        lang=lang,
-        model=model,
-        no_print=no_print,
-        ner_map=ner_map,
-    )
-    if output_dir != "-":
-        # Export data to a file
         suffix = f".{file_type}"
-        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
+        subpath = input_loc.relative_to(input_path)
+        output_file = (output_dir / subpath).with_suffix(suffix)
+        if not output_file.parent.exists():
+            output_file.parent.mkdir(parents=True)
         if file_type == "json":
-            srsly.write_json(output_file, data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl(output_file, data)
-        elif file_type == "msg":
-            srsly.write_msgpack(output_file, data)
-        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
-    else:
-        # Print to stdout
-        if file_type == "json":
-            srsly.write_json("-", data)
-        elif file_type == "jsonl":
-            srsly.write_jsonl("-", data)
+            data = docs_to_json(docs)
+            srsly.write_json(output_file, data)
+        else:
+            data = DocBin(docs=docs).to_bytes()
+            with output_file.open("wb") as file_:
+                file_.write(data)
+        msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
 
 
 def autodetect_ner_format(input_data):
@@ -129,3 +101,102 @@ def autodetect_ner_format(input_data):
     if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
         return "iob"
     return None
+
+
+def walk_directory(path):
+    if not path.is_dir():
+        return [path]
+    paths = [path]
+    locs = []
+    seen = set()
+    for path in paths:
+        if str(path) in seen:
+            continue
+        seen.add(str(path))
+        if path.parts[-1].startswith("."):
+            continue
+        elif path.is_dir():
+            paths.extend(path.iterdir())
+        else:
+            locs.append(path)
+    return locs
+
+
+def verify_cli_args(
+    msg,
+    input_path,
+    output_dir,
+    file_type,
+    n_sents,
+    seg_sents,
+    model,
+    morphology,
+    merge_subtokens,
+    converter,
+    ner_map,
+    lang
+):
+    if converter == "ner" or converter == "iob":
+        input_data = input_path.open("r", encoding="utf-8").read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. Conversion may not",
+                "succeed. See https://spacy.io/api/cli#convert"
+            )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            f"Can't write .{file_type} data to stdout",
+            "Please specify an output directory.",
+            exits=1,
+        )
+    if not input_path.exists():
+        msg.fail("Input file not found", input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail("Output directory not found", output_dir, exits=1)
+    if input_path.is_dir():
+        input_locs = walk_directory(input_path)
+        if len(input_locs) == 0:
+            msg.fail("No input files in directory", input_path, exits=1)
+        file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+        if len(file_types) >= 2:
+            file_types = ",".join(file_types)
+            msg.fail("All input files must be same type", file_types, exits=1)
+        if converter == "auto":
+            converter = file_types[0]
+    else:
+        converter = input_path.suffix[1:]
+    if converter not in CONVERTERS:
+        msg.fail(f"Can't find converter for {converter}", exits=1)
+    return converter
+
+
+def _get_converter(msg, converter, input_path):
+    if input_path.is_dir():
+        input_path = walk_directory(input_path)[0]
+    if converter == "auto":
+        converter = input_path.suffix[1:]
+    if converter == "ner" or converter == "iob":
+        with input_path.open() as file_:
+            input_data = file_.read()
+        converter_autodetect = autodetect_ner_format(input_data)
+        if converter_autodetect == "ner":
+            msg.info("Auto-detected token-per-line NER format")
+            converter = converter_autodetect
+        elif converter_autodetect == "iob":
+            msg.info("Auto-detected sentence-per-line NER format")
+            converter = converter_autodetect
+        else:
+            msg.warn(
+                "Can't automatically detect NER format. "
+                "Conversion may not succeed. "
+                "See https://spacy.io/api/cli#convert"
+            )
    return converter
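Since walk_directory() now drives both input discovery and the mirrored output layout, here is a sketch of its behaviour (directory names made up): it recurses into subdirectories, skips dotfiles, and returns file paths only, which convert() then maps onto the output directory via relative_to():

    from pathlib import Path
    from spacy.cli.convert import walk_directory

    # Hypothetical layout: corpus/train/a.json, corpus/dev/b.json, corpus/.cache
    for input_loc in walk_directory(Path("corpus")):
        subpath = input_loc.relative_to("corpus")  # e.g. train/a.json
        print((Path("output") / subpath).with_suffix(".spacy"))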
diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py
deleted file mode 100644
index e44ad407d..000000000
--- a/spacy/cli/converters/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
-from .jsonl2docs import ner_jsonl2json  # noqa: F401
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
deleted file mode 100644
index 8639a11b9..000000000
--- a/spacy/cli/converters/jsonl2json.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output
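The deleted _cleanup_spans() helper trimmed whitespace and dropped overlapping entity spans. For anyone migrating, spacy.util.filter_spans covers similar ground, though it prefers the longest span rather than the first one seen; a rough sketch (sample text made up):

    import spacy
    from spacy.util import filter_spans

    nlp = spacy.blank("en")
    doc = nlp("Acme Corp opened an office in New York City")
    spans = [doc.char_span(0, 9, label="ORG"), doc.char_span(30, 43, label="GPE")]
    # Keep only non-overlapping spans before assigning entities.
    doc.ents = filter_spans([s for s in spans if s is not None])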
diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py
new file mode 100644
index 000000000..0a1242fb4
--- /dev/null
+++ b/spacy/gold/converters/__init__.py
@@ -0,0 +1,6 @@
+from .iob2docs import iob2docs  # noqa: F401
+from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .json2docs import json2docs  # noqa: F401
+
+# TODO: Update this one
+# from .conllu2docs import conllu2docs  # noqa: F401
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/gold/converters/conll_ner2docs.py
similarity index 99%
rename from spacy/cli/converters/conll_ner2json.py
rename to spacy/gold/converters/conll_ner2docs.py
index 8d4139bde..7042bd7d6 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/gold/converters/conll_ner2docs.py
@@ -7,7 +7,7 @@ from ...vocab import Vocab
 from ...util import load_model
 
 
-def conll_ner2doc(
+def conll_ner2docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/gold/converters/conllu2json.py
similarity index 100%
rename from spacy/cli/converters/conllu2json.py
rename to spacy/gold/converters/conllu2json.py
diff --git a/spacy/cli/converters/iob2json.py b/spacy/gold/converters/iob2docs.py
similarity index 97%
rename from spacy/cli/converters/iob2json.py
rename to spacy/gold/converters/iob2docs.py
index 2addc1af4..7901569fa 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/gold/converters/iob2docs.py
@@ -3,7 +3,7 @@ from wasabi import Printer
 from ...gold import iob_to_biluo, tags_to_entities
 from ...util import minibatch
 from .util import merge_sentences
-from .conll_ner2json import n_sents_info
+from .conll_ner2docs import n_sents_info
 
 
 def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
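Both renamed converters keep their old call signatures but are now expected to return Doc objects. A hedged usage sketch for iob2docs, using the sentence-per-line IOB sample format from spaCy's docs (note that merge_sentences is still a stub in this patch, so this reflects the intended end state rather than what this exact revision can run):

    from spacy.gold.converters import iob2docs

    input_data = "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O"
    docs = iob2docs(input_data, n_sents=10)
    for doc in docs:
        print([(ent.text, ent.label_) for ent in doc.ents])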
diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
new file mode 100644
index 000000000..98219bb04
--- /dev/null
+++ b/spacy/gold/converters/json2docs.py
@@ -0,0 +1,38 @@
+import tempfile
+import contextlib
+import shutil
+from pathlib import Path
+from ..gold_io import read_json_file
+from ..example import annotations2doc
+from ..example import _fix_legacy_dict_data, _parse_example_dict_data
+from ...util import load_model
+from ...lang.xx import MultiLanguage
+
+@contextlib.contextmanager
+def make_tempdir():
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def json2docs(
+    input_data,
+    model=None,
+    **kwargs
+):
+    nlp = load_model(model) if model is not None else MultiLanguage()
+    docs = []
+    with make_tempdir() as tmp_dir:
+        json_path = Path(tmp_dir) / "data.json"
+        with json_path.open("w") as file_:
+            file_.write(input_data)
+        for json_annot in read_json_file(json_path):
+            example_dict = _fix_legacy_dict_data(json_annot)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(
+                nlp.vocab,
+                tok_dict,
+                doc_dict
+            )
+            docs.append(doc)
+    return docs
diff --git a/spacy/gold/converters/util.py b/spacy/gold/converters/util.py
new file mode 100644
index 000000000..ed9c84203
--- /dev/null
+++ b/spacy/gold/converters/util.py
@@ -0,0 +1,8 @@
+from ...util import minibatch
+
+
+def merge_sentences(docs, n_sents):
+    merged = []
+    for group in minibatch(docs, size=n_sents):
+        raise NotImplementedError
+    return merged
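json2docs() takes the raw JSON string (not a parsed object), round-trips it through a temp file so read_json_file() can parse it, and rebuilds Doc objects. A minimal sketch with a made-up document in spaCy's v2 JSON training format (heads are relative offsets):

    import srsly
    from spacy.gold.converters import json2docs

    json_data = [{
        "id": 0,
        "paragraphs": [{
            "raw": "Hello world",
            "sentences": [{
                "tokens": [
                    {"orth": "Hello", "tag": "UH", "head": 1, "dep": "intj", "ner": "O"},
                    {"orth": "world", "tag": "NN", "head": 0, "dep": "ROOT", "ner": "O"},
                ]
            }],
        }],
    }]
    docs = json2docs(srsly.json_dumps(json_data))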
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 7bf3faab3..3072787ae 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -9,6 +9,19 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors
 
 
+ALL_ATTRS = (
+    "ORTH",
+    "TAG",
+    "HEAD",
+    "DEP",
+    "SENT_START",
+    "ENT_IOB",
+    "ENT_TYPE",
+    "LEMMA",
+    "MORPH"
+)
+
+
 class DocBin(object):
     """Pack Doc objects for binary serialization.
 
@@ -39,7 +52,7 @@ class DocBin(object):
         document from the DocBin.
     """
 
-    def __init__(self, attrs=None, store_user_data=False, docs=[]):
+    def __init__(self, attrs=ALL_ATTRS, store_user_data=False, docs=[]):
        """Create a DocBin object to hold serialized annotations.
 
         attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
@@ -49,7 +62,6 @@ class DocBin(object):
 
         DOCS: https://spacy.io/api/docbin#init
         """
-        attrs = attrs or []
         attrs = sorted([intify_attr(attr) for attr in attrs])
         self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
         self.attrs.insert(0, ORTH)  # Ensure ORTH is always attrs[0]
@@ -60,7 +72,7 @@ class DocBin(object):
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
-            self.add(docs)
+            self.add(doc)
 
     def __len__(self):
         """RETURNS: The number of Doc objects added to the DocBin."""
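With ALL_ATTRS as the default, a DocBin round trip now preserves tags, heads, deps, entities, lemmas and morphology without an explicit attrs argument. A minimal sketch (assumes a trained pipeline such as en_core_web_sm is installed):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

    # No attrs argument needed any more; all annotations are packed.
    data = DocBin(docs=[doc]).to_bytes()

    # Reload; tags, parses and entities survive the round trip.
    new_doc = list(DocBin().from_bytes(data).get_docs(nlp.vocab))[0]
    print([(t.text, t.tag_, t.dep_) for t in new_doc])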