Remove the JSONL NER converter (ner_jsonl2docs).

- spacy/cli/convert.py: drop the import and the "jsonl" entry in CONVERTERS.
- spacy/gold/converters/__init__.py: drop the re-export.
- spacy/gold/converters/jsonl2docs.py: delete the module, including its
  private helper _cleanup_spans.

diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 3b3aa0b91..f4bddac39 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -6,7 +6,6 @@ import sys
 
 from ..tokens import DocBin
 from ..gold.converters import iob2docs, conll_ner2docs, json2docs
-from ..gold.converters import ner_jsonl2docs
 
 
 # Converters are matched by file extension except for ner/iob, which are
@@ -20,7 +19,6 @@ CONVERTERS = {
     #"conll": conllu2docs, TODO
     "ner": conll_ner2docs,
     "iob": iob2docs,
-    "jsonl": ner_jsonl2docs,
     "json": json2docs,
 }
 
diff --git a/spacy/gold/converters/__init__.py b/spacy/gold/converters/__init__.py
index c1b4b1566..0a1242fb4 100644
--- a/spacy/gold/converters/__init__.py
+++ b/spacy/gold/converters/__init__.py
@@ -1,6 +1,5 @@
 from .iob2docs import iob2docs  # noqa: F401
 from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .jsonl2docs import ner_jsonl2docs  # noqa: F401
 from .json2docs import json2docs
 
 # TODO: Update this one
diff --git a/spacy/gold/converters/jsonl2docs.py b/spacy/gold/converters/jsonl2docs.py
deleted file mode 100644
index 8639a11b9..000000000
--- a/spacy/gold/converters/jsonl2docs.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import srsly
-
-from ...gold import docs_to_json
-from ...util import get_lang_class, minibatch
-
-
-def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
-    if lang is None:
-        raise ValueError("No --lang specified, but tokenization required")
-    docs = []
-    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
-    nlp = get_lang_class(lang)()
-    sentencizer = nlp.create_pipe("sentencizer")
-    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
-        # TODO: Should we be merging these? We're disrespecting the n_sents
-        # currently.
-        for record in batch:
-            raw_text = record["text"]
-            if "entities" in record:
-                ents = record["entities"]
-            else:
-                ents = record["spans"]
-            ents = [(e["start"], e["end"], e["label"]) for e in ents]
-            doc = nlp.make_doc(raw_text)
-            sentencizer(doc)
-            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
-            doc.ents = _cleanup_spans(spans)
-            docs.append(doc)
-    return docs
-
-
-def _cleanup_spans(spans):
-    output = []
-    seen = set()
-    for span in spans:
-        if span is not None:
-            # Trim whitespace
-            while len(span) and span[0].is_space:
-                span = span[1:]
-            while len(span) and span[-1].is_space:
-                span = span[:-1]
-            if not len(span):
-                continue
-            for i in range(span.start, span.end):
-                if i in seen:
-                    break
-            else:
-                output.append(span)
-                seen.update(range(span.start, span.end))
-    return output