Start updating converters

Matthew Honnibal 2020-06-20 03:19:40 +02:00
parent 6d5bfd6f6a
commit d422f30a18
4 changed files with 51 additions and 61 deletions

spacy/cli/converters/__init__.py

@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
+from .conll_ner2json import conll_ner2doc  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .jsonl2docs import ner_jsonl2docs  # noqa: F401

spacy/cli/converters/conll_ner2json.py

@@ -3,15 +3,16 @@ from wasabi import Printer
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...lang.xx import MultiLanguage
-from ...tokens.doc import Doc
+from ...tokens import Doc, Span
+from ...vocab import Vocab
 from ...util import load_model


-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.

     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, it is interpreted as the POS tags.
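
For reference, the input this converter expects looks like the following CoNLL-style sample (tokens in the first column, IOB entity tags in the last, optional POS tags in between). The sample is illustrative, not part of the commit:

    -DOCSTART- -X- O

    EU NNP B-ORG
    rejects VBZ O
    German JJ B-MISC
    call NN O
    . . O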
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
     if model:
         nlp = load_model(model)
     else:
         nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
-            if not sent:
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
+            if not conll_sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_starts.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
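
Because the converter now returns Doc objects instead of JSON dicts, the natural way to save its output is spaCy's DocBin. A minimal consumption sketch, assuming the converter is exposed as in the updated __init__.py; the file names and attribute list here are illustrative, not part of the commit:

    from spacy.tokens import DocBin

    from spacy.cli.converters import conll_ner2doc

    with open("train.conll", encoding="utf8") as file_:
        input_data = file_.read()

    # Convert to Docs, then pack them into a DocBin for binary serialization
    docs = conll_ner2doc(input_data, n_sents=10)
    doc_bin = DocBin(attrs=["TAG", "SENT_START", "ENT_IOB", "ENT_TYPE"])
    for doc in docs:
        doc_bin.add(doc)
    with open("train.spacy", "wb") as out_file:
        out_file.write(doc_bin.to_bytes())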

spacy/cli/converters/iob2json.py

@@ -1,14 +1,15 @@
 from wasabi import Printer

-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
+from ...tokens import Doc, Span
+from ...vocab import Vocab
 from ...util import minibatch
+from .util import merge_sentences
 from .conll_ner2json import n_sents_info


-def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
-    into JSON format for use with train cli. IOB and IOB2 are accepted.
+    into Doc objects so they can be saved. IOB and IOB2 are accepted.

     Sample formats:
@@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
 def read_iob(raw_sents):
-    sentences = []
+    vocab = Vocab()  # blank vocab shared by the converted docs
+    docs = []
     for line in raw_sents:
         if not line.strip():
             continue
         tokens = [t.split("|") for t in line.split()]
         if len(tokens[0]) == 3:
-            words, pos, iob = zip(*tokens)
+            words, tags, iob = zip(*tokens)
         elif len(tokens[0]) == 2:
             words, iob = zip(*tokens)
-            pos = ["-"] * len(words)
+            tags = ["-"] * len(words)
         else:
             raise ValueError(
                 "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
             )
+        doc = Doc(vocab, words=words)
+        for i, tag in enumerate(tags):
+            doc[i].tag_ = tag
         biluo = iob_to_biluo(iob)
-        sentences.append(
-            [
-                {"orth": w, "tag": p, "ner": ent}
-                for (w, p, ent) in zip(words, pos, biluo)
-            ]
-        )
-    sentences = [{"tokens": sent} for sent in sentences]
-    paragraphs = [{"sentences": [sent]} for sent in sentences]
-    docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
+        entities = tags_to_entities(biluo)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
+        docs.append(doc)
     return docs


-def merge_sentences(docs, n_sents):
-    merged = []
-    for group in minibatch(docs, size=n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        merged.append(first)
-    return merged
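
To make the pipe-separated format concrete, here is a tiny standalone sketch of the parsing step above; the sample line is made up but follows the three-field word|tag|IOB layout the docstring describes:

    # Each whitespace-separated token is split on "|" into word, tag, IOB
    line = "I|PRP|O like|VBP|O London|NNP|B-GPE .|.|O"
    tokens = [t.split("|") for t in line.split()]
    words, tags, iob = zip(*tokens)
    assert words == ("I", "like", "London", ".")
    assert tags == ("PRP", "VBP", "NNP", ".")
    assert iob == ("O", "O", "B-GPE", "O")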

spacy/cli/converters/jsonl2json.py → spacy/cli/converters/jsonl2docs.py

@@ -4,15 +4,17 @@ from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch


-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
+        # TODO: Should we be merging these? We're currently not respecting n_sents.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
@@ -25,8 +27,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs


 def _cleanup_spans(spans):
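
For context, this converter reads one JSON object per line, with a "text" field and optional character-offset "entities" triples of (start, end, label). A hypothetical record, parsed the same way the converter does:

    import srsly

    # One input line; entity offsets are character indices into the raw text
    line = '{"text": "I like London and Berlin.", "entities": [[7, 13, "GPE"], [18, 24, "GPE"]]}'
    record = srsly.json_loads(line)
    assert record["text"][7:13] == "London"
    assert record["text"][18:24] == "Berlin"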