Remove jsonl converter

2025-12-05 17:24:29 +03:00 · 2020-06-20 16:02:40 +02:00 · 2020-06-20 16:02:40 +02:00 · f1756a6a22
commit f1756a6a22
parent 5d89b1840e
3 changed files with 0 additions and 54 deletions
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -6,7 +6,6 @@ import sys

 from ..tokens import DocBin
 from ..gold.converters import iob2docs, conll_ner2docs, json2docs
-from ..gold.converters import ner_jsonl2docs


 # Converters are matched by file extension except for ner/iob, which are
@@ -20,7 +19,6 @@ CONVERTERS = {
     #"conll": conllu2docs, TODO
     "ner": conll_ner2docs,
     "iob": iob2docs,
-    "jsonl": ner_jsonl2docs,
     "json": json2docs,
 }

--- a/spacy/gold/converters/__init__.py
+++ b/spacy/gold/converters/__init__.py
@@ -1,6 +1,5 @@
 from .iob2docs import iob2docs # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .jsonl2docs import ner_jsonl2docs  # noqa: F401
 from .json2docs import json2docs

 # TODO: Update this one
--- a/spacy/gold/converters/jsonl2docs.py
+++ b/spacy/gold/converters/jsonl2docs.py
@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output